diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index e0cbd9777da41a..0f41ad622f8e82 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -5685,6 +5685,13 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree) // These intrinsics are "ins reg/mem, xmm" ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); attr = emitActualTypeSize(baseType); +#if defined(TARGET_X86) + if (varTypeIsLong(baseType)) + { + ins = INS_movq; + attr = EA_8BYTE; + } +#endif // TARGET_X86 break; } diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp index 844263d400cfdb..0461a12d181d3a 100644 --- a/src/coreclr/jit/decomposelongs.cpp +++ b/src/coreclr/jit/decomposelongs.cpp @@ -78,11 +78,11 @@ void DecomposeLongs::DecomposeBlock(BasicBlock* block) // Return Value: // None. // -void DecomposeLongs::DecomposeRange(Compiler* compiler, LIR::Range& range) +void DecomposeLongs::DecomposeRange(Compiler* compiler, Lowering* lowering, LIR::Range& range) { assert(compiler != nullptr); - DecomposeLongs decomposer(compiler); + DecomposeLongs decomposer(compiler, lowering); decomposer.m_range = ⦥ decomposer.DecomposeRangeHelper(); @@ -90,7 +90,7 @@ void DecomposeLongs::DecomposeRange(Compiler* compiler, LIR::Range& range) //------------------------------------------------------------------------ // DecomposeLongs::DecomposeRangeHelper: -// Decompiose each node in the current range. +// Decompose each node in the current range. // // Decomposition is done as an execution-order walk. Decomposition of // a particular node can create new nodes that need to be further @@ -122,44 +122,76 @@ void DecomposeLongs::DecomposeRangeHelper() GenTree* DecomposeLongs::DecomposeNode(GenTree* tree) { // Handle the case where we are implicitly using the lower half of a long lclVar. - if ((tree->TypeGet() == TYP_INT) && tree->OperIsLocal()) + if (tree->TypeIs(TYP_INT) && tree->OperIsLocal()) { LclVarDsc* varDsc = m_compiler->lvaGetDesc(tree->AsLclVarCommon()); if (varTypeIsLong(varDsc) && varDsc->lvPromoted) { -#ifdef DEBUG - if (m_compiler->verbose) - { - printf("Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted " - "half:\n"); - m_compiler->gtDispTreeRange(Range(), tree); - } -#endif // DEBUG + JITDUMP("Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted " + "half:\n"); + DISPTREERANGE(Range(), tree); + unsigned loVarNum = varDsc->lvFieldLclStart; tree->AsLclVarCommon()->SetLclNum(loVarNum); return tree->gtNext; } } - if (tree->TypeGet() != TYP_LONG) + if (!tree->TypeIs(TYP_LONG)) { return tree->gtNext; } -#ifdef DEBUG - if (m_compiler->verbose) - { - printf("Decomposing TYP_LONG tree. BEFORE:\n"); - m_compiler->gtDispTreeRange(Range(), tree); - } -#endif // DEBUG - LIR::Use use; if (!Range().TryGetUse(tree, &use)) { LIR::Use::MakeDummyUse(Range(), tree, &use); } +#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86) + if (!use.IsDummyUse()) + { + // HWIntrinsics can consume/produce a long directly, provided its source/target is memory. + // Here we do a conservative check for specific cases where it is certain the load/store + // can be contained. In those cases, we can skip decomposition. 
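            // Illustrative sketch (annotation, not part of the patch) of the two shapes the
            // check below is meant to recognize on x86; node spellings are approximate:
            //
            //   t1 = IND long [addr]                                 ; contained 64-bit load
            //   t2 = HWINTRINSIC simd16 long CreateScalarUnsafe(t1)  ; emitted as movq xmm, qword ptr [addr]
            //
            //   t3 = HWINTRINSIC long ToScalar(v)
            //        STOREIND long [addr], t3                        ; emitted as movq qword ptr [addr], xmm
            //
            // In both cases the TYP_LONG value never needs to materialize in integer registers,
            // so splitting it into a lo/hi GT_LONG pair would only pessimize the code.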
+ + GenTree* user = use.User(); + + if (user->OperIsHWIntrinsic()) + { + if (tree->OperIs(GT_CNS_LNG) || + (tree->OperIs(GT_IND, GT_LCL_FLD) && m_lowering->IsSafeToContainMem(user, tree))) + { + NamedIntrinsic intrinsicId = user->AsHWIntrinsic()->GetHWIntrinsicId(); + assert(HWIntrinsicInfo::IsVectorCreate(intrinsicId) || + HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId) || + HWIntrinsicInfo::IsVectorCreateScalarUnsafe(intrinsicId)); + + return tree->gtNext; + } + } + else if (user->OperIs(GT_STOREIND) && tree->OperIsHWIntrinsic() && m_compiler->opts.OptimizationEnabled()) + { + NamedIntrinsic intrinsicId = tree->AsHWIntrinsic()->GetHWIntrinsicId(); + if (HWIntrinsicInfo::IsVectorToScalar(intrinsicId) && m_lowering->IsSafeToContainMem(user, tree)) + { + return tree->gtNext; + } + } + } + + if (tree->OperIs(GT_STOREIND) && tree->AsStoreInd()->Data()->OperIsHWIntrinsic()) + { + // We should only get here if we matched the second pattern above. + assert(HWIntrinsicInfo::IsVectorToScalar(tree->AsStoreInd()->Data()->AsHWIntrinsic()->GetHWIntrinsicId())); + + return tree->gtNext; + } +#endif // FEATURE_HW_INTRINSICS && TARGET_X86 + + JITDUMP("Decomposing TYP_LONG tree. BEFORE:\n"); + DISPTREERANGE(Range(), tree); + GenTree* nextNode = nullptr; switch (tree->OperGet()) { @@ -270,19 +302,14 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree) // If we replaced the argument to a GT_FIELD_LIST element with a GT_LONG node, split that field list // element into two elements: one for each half of the GT_LONG. - if ((use.Def()->OperGet() == GT_LONG) && !use.IsDummyUse() && (use.User()->OperGet() == GT_FIELD_LIST)) + if (use.Def()->OperIs(GT_LONG) && !use.IsDummyUse() && use.User()->OperIs(GT_FIELD_LIST)) { DecomposeFieldList(use.User()->AsFieldList(), use.Def()->AsOp()); } -#ifdef DEBUG - if (m_compiler->verbose) - { - // NOTE: st_lcl_var doesn't dump properly afterwards. - printf("Decomposing TYP_LONG tree. AFTER:\n"); - m_compiler->gtDispTreeRange(Range(), use.Def()); - } -#endif + // NOTE: st_lcl_var doesn't dump properly afterwards. + JITDUMP("Decomposing TYP_LONG tree. AFTER:\n"); + DISPTREERANGE(Range(), use.Def()); // When casting from a decomposed long to a smaller integer we can discard the high part. if (m_compiler->opts.OptimizationEnabled() && !use.IsDummyUse() && use.User()->OperIs(GT_CAST) && @@ -1707,6 +1734,13 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsic(LIR::Use& use) return DecomposeHWIntrinsicGetElement(use, hwintrinsicTree); } + case NI_Vector128_ToScalar: + case NI_Vector256_ToScalar: + case NI_Vector512_ToScalar: + { + return DecomposeHWIntrinsicToScalar(use, hwintrinsicTree); + } + case NI_EVEX_MoveMask: { return DecomposeHWIntrinsicMoveMask(use, hwintrinsicTree); @@ -1751,9 +1785,7 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW { assert(node == use.Def()); assert(varTypeIsLong(node)); - assert((node->GetHWIntrinsicId() == NI_Vector128_GetElement) || - (node->GetHWIntrinsicId() == NI_Vector256_GetElement) || - (node->GetHWIntrinsicId() == NI_Vector512_GetElement)); + assert(HWIntrinsicInfo::IsVectorGetElement(node->GetHWIntrinsicId())); GenTree* op1 = node->Op(1); GenTree* op2 = node->Op(2); @@ -1835,6 +1867,75 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW return FinalizeDecomposition(use, loResult, hiResult, hiResult); } +//------------------------------------------------------------------------ +// DecomposeHWIntrinsicToScalar: Decompose GT_HWINTRINSIC -- NI_Vector*_ToScalar. 
+// +// create: +// +// tmp_simd_var = simd_var +// lo_result = GT_HWINTRINSIC{ToScalar}[int](tmp_simd_var) +// hi_result = GT_HWINTRINSIC{GetElement}[int](tmp_simd_var, 1) +// - or - +// GT_HWINTRINSIC{ToScalar}[int](GT_RSZ(tmp_simd_var, 32)) +// return: GT_LONG(lo_result, hi_result) +// +// Arguments: +// use - the LIR::Use object for the def that needs to be decomposed. +// node - the hwintrinsic node to decompose +// +// Return Value: +// The GT_LONG node wrapping the upper and lower halves. +// +GenTree* DecomposeLongs::DecomposeHWIntrinsicToScalar(LIR::Use& use, GenTreeHWIntrinsic* node) +{ + assert(node == use.Def()); + assert(varTypeIsLong(node)); + assert(HWIntrinsicInfo::IsVectorToScalar(node->GetHWIntrinsicId())); + + GenTree* op1 = node->Op(1); + NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); + + assert(varTypeIsLong(simdBaseType)); + assert(varTypeIsSIMD(op1)); + + GenTree* simdTmpVar = RepresentOpAsLocalVar(op1, node, &node->Op(1)); + unsigned simdTmpVarNum = simdTmpVar->AsLclVarCommon()->GetLclNum(); + JITDUMP("[DecomposeHWIntrinsicToScalar]: Saving op1 tree to a temp var:\n"); + DISPTREERANGE(Range(), simdTmpVar); + + GenTree* loResult = m_compiler->gtNewSimdToScalarNode(TYP_INT, simdTmpVar, CORINFO_TYPE_INT, simdSize); + Range().InsertAfter(simdTmpVar, loResult); + + simdTmpVar = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTmpVar->TypeGet()); + Range().InsertAfter(loResult, simdTmpVar); + + GenTree* hiResult; + if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + GenTree* one = m_compiler->gtNewIconNode(1); + hiResult = m_compiler->gtNewSimdGetElementNode(TYP_INT, simdTmpVar, one, CORINFO_TYPE_INT, simdSize); + + Range().InsertAfter(simdTmpVar, one, hiResult); + } + else + { + assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + GenTree* thirtyTwo = m_compiler->gtNewIconNode(32); + GenTree* shift = m_compiler->gtNewSimdBinOpNode(GT_RSZ, op1->TypeGet(), simdTmpVar, thirtyTwo, + node->GetSimdBaseJitType(), simdSize); + hiResult = m_compiler->gtNewSimdToScalarNode(TYP_INT, shift, CORINFO_TYPE_INT, simdSize); + + Range().InsertAfter(simdTmpVar, thirtyTwo, shift, hiResult); + } + + Range().Remove(node); + + return FinalizeDecomposition(use, loResult, hiResult, hiResult); +} + //------------------------------------------------------------------------ // DecomposeHWIntrinsicMoveMask: Decompose GT_HWINTRINSIC -- NI_EVEX_MoveMask // @@ -2262,6 +2363,13 @@ void DecomposeLongs::TryPromoteLongVar(unsigned lclNum) { return; } +#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86) + if (varDsc->lvIsParam) + { + // Promotion blocks combined read optimizations for SIMD loads of long params + return; + } +#endif // FEATURE_HW_INTRINSICS && TARGET_X86 varDsc->lvFieldCnt = 2; varDsc->lvFieldLclStart = m_compiler->lvaCount; diff --git a/src/coreclr/jit/decomposelongs.h b/src/coreclr/jit/decomposelongs.h index 02681322a552e9..e879292abf4996 100644 --- a/src/coreclr/jit/decomposelongs.h +++ b/src/coreclr/jit/decomposelongs.h @@ -14,19 +14,21 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #define _DECOMPOSELONGS_H_ #include "compiler.h" +#include "lower.h" class DecomposeLongs { public: - DecomposeLongs(Compiler* compiler) + DecomposeLongs(Compiler* compiler, Lowering* lowering) : m_compiler(compiler) + , m_lowering(lowering) { } void PrepareForDecomposition(); void DecomposeBlock(BasicBlock* block); - static 
void DecomposeRange(Compiler* compiler, LIR::Range& range); + static void DecomposeRange(Compiler* compiler, Lowering* lowering, LIR::Range& range); private: inline LIR::Range& Range() const @@ -64,6 +66,7 @@ class DecomposeLongs #ifdef FEATURE_HW_INTRINSICS GenTree* DecomposeHWIntrinsic(LIR::Use& use); GenTree* DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHWIntrinsic* node); + GenTree* DecomposeHWIntrinsicToScalar(LIR::Use& use, GenTreeHWIntrinsic* node); GenTree* DecomposeHWIntrinsicMoveMask(LIR::Use& use, GenTreeHWIntrinsic* node); #endif // FEATURE_HW_INTRINSICS @@ -80,6 +83,7 @@ class DecomposeLongs // Data Compiler* m_compiler; + Lowering* m_lowering; LIR::Range* m_range; }; diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index ff4d19fee3fa46..178d1b0232b20e 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -20772,22 +20772,31 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; case INS_movd: - case INS_movq: // only MOVQ xmm, xmm is different (emitted by Sse2.MoveScalar, should use MOVDQU instead) + case INS_movq: if (memAccessKind == PERFSCORE_MEMORY_NONE) { - // movd r32, xmm or xmm, r32 - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency = PERFSCORE_LATENCY_3C; + if (isFloatReg(id->idReg1()) && isFloatReg(id->idReg2())) + { + // movq xmm, xmm + result.insThroughput = PERFSCORE_THROUGHPUT_3X; + result.insLatency = PERFSCORE_LATENCY_1C; + } + else + { + // movd r32/64, xmm or xmm, r32/64 + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + } } else if (memAccessKind == PERFSCORE_MEMORY_READ) { - // movd xmm, m32 + // ins xmm, m32/64 result.insThroughput = PERFSCORE_THROUGHPUT_2X; result.insLatency += PERFSCORE_LATENCY_2C; } else { - // movd m32, xmm + // ins m32/64, xmm assert(memAccessKind == PERFSCORE_MEMORY_WRITE); result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += PERFSCORE_LATENCY_2C; diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index ff2c9583ea8af1..c2cd287c306248 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -20260,7 +20260,7 @@ var_types GenTreeJitIntrinsic::GetSimdBaseType() const // isCommutativeHWIntrinsic: Checks if the intrinsic is commutative // // Return Value: -// true if the intrisic is commutative +// true if the intrinsic is commutative // bool GenTree::isCommutativeHWIntrinsic() const { @@ -20422,6 +20422,9 @@ bool GenTree::isContainableHWIntrinsic() const return true; } + case NI_Vector128_CreateScalar: + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: case NI_Vector128_CreateScalarUnsafe: case NI_Vector256_CreateScalarUnsafe: case NI_Vector512_CreateScalarUnsafe: @@ -21568,7 +21571,7 @@ GenTree* Compiler::gtNewSimdCeilNode(var_types type, GenTree* op1, CorInfoType s #if defined(FEATURE_MASKED_HW_INTRINSICS) //------------------------------------------------------------------------ -// gtNewSimdCvtMaskToVectorNode: Convert a HW instrinsic mask node to a vector +// gtNewSimdCvtMaskToVectorNode: Convert a HW intrinsic mask node to a vector // // Arguments: // type -- The type of the node to convert to @@ -21993,7 +21996,7 @@ GenTree* Compiler::gtNewSimdCvtNativeNode(var_types type, #if defined(FEATURE_MASKED_HW_INTRINSICS) //------------------------------------------------------------------------ -// gtNewSimdCvtVectorToMaskNode: Convert a HW instrinsic vector node to a mask +// 
gtNewSimdCvtVectorToMaskNode: Convert a HW intrinsic vector node to a mask // // Arguments: // type -- The type of the mask to produce. @@ -22747,16 +22750,6 @@ GenTree* Compiler::gtNewSimdCreateBroadcastNode(var_types type, } #if defined(TARGET_XARCH) -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !op1->IsIntegralConst()) - { - // TODO-XARCH-CQ: It may be beneficial to emit the movq - // instruction, which takes a 64-bit memory address and - // works on 32-bit x86 systems. - unreached(); - } -#endif // TARGET_X86 - if (simdSize == 64) { hwIntrinsicID = NI_Vector512_Create; @@ -22860,16 +22853,6 @@ GenTree* Compiler::gtNewSimdCreateScalarNode(var_types type, } #if defined(TARGET_XARCH) -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !op1->IsIntegralConst()) - { - // TODO-XARCH-CQ: It may be beneficial to emit the movq - // instruction, which takes a 64-bit memory address and - // works on 32-bit x86 systems. - unreached(); - } -#endif // TARGET_X86 - if (simdSize == 32) { hwIntrinsicID = NI_Vector256_CreateScalar; @@ -23005,16 +22988,6 @@ GenTree* Compiler::gtNewSimdCreateScalarUnsafeNode(var_types type, } #if defined(TARGET_XARCH) -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !op1->IsIntegralConst()) - { - // TODO-XARCH-CQ: It may be beneficial to emit the movq - // instruction, which takes a 64-bit memory address and - // works on 32-bit x86 systems. - unreached(); - } -#endif // TARGET_X86 - if (simdSize == 32) { hwIntrinsicID = NI_Vector256_CreateScalarUnsafe; @@ -23051,7 +23024,7 @@ GenTree* Compiler::gtNewSimdCreateScalarUnsafeNode(var_types type, GenTree* Compiler::gtNewSimdCreateSequenceNode( var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize) { - // This effectively doees: (Indices * op2) + Create(op1) + // This effectively does: (Indices * op2) + Create(op1) // // When both op2 and op1 are constant we can fully fold this to a constant. Additionally, // if only op2 is a constant we can simplify the computation by a lot. However, if only op1 @@ -23383,14 +23356,7 @@ GenTree* Compiler::gtNewSimdGetElementNode( assert(varTypeIsArithmetic(simdBaseType)); #if defined(TARGET_XARCH) - bool useToScalar = op2->IsIntegralConst(0); - -#if defined(TARGET_X86) - // We handle decomposition via GetElement for simplicity - useToScalar &= !varTypeIsLong(simdBaseType); -#endif // TARGET_X86 - - if (useToScalar) + if (op2->IsIntegralConst(0)) { return gtNewSimdToScalarNode(type, op1, simdBaseJitType, simdSize); } @@ -26368,18 +26334,6 @@ GenTree* Compiler::gtNewSimdToScalarNode(var_types type, GenTree* op1, CorInfoTy NamedIntrinsic intrinsic = NI_Illegal; #ifdef TARGET_XARCH -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType)) - { - // We need SSE41 to handle long, use software fallback - assert(compIsaSupportedDebugOnly(InstructionSet_SSE41)); - - // Create a GetElement node which handles decomposition - GenTree* op2 = gtNewIconNode(0); - return gtNewSimdGetElementNode(type, op1, op2, simdBaseJitType, simdSize); - } -#endif // TARGET_X86 - if (simdSize == 64) { assert(IsBaselineVector512IsaSupportedDebugOnly()); @@ -27539,7 +27493,7 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const } //------------------------------------------------------------------------ -// OperIsMemoryLoad: Does this HWI node have memory store semantics? +// OperIsMemoryStore: Does this HWI node have memory store semantics? 
// // Arguments: // pAddr - optional [out] parameter for the address @@ -27674,7 +27628,7 @@ bool GenTreeHWIntrinsic::OperIsMemoryStoreOrBarrier() const } //------------------------------------------------------------------------ -// OperIsEmbBroadcastCompatible: Checks if the intrinsic is a embedded broadcast compatible inintrsic. +// OperIsEmbBroadcastCompatible: Checks if the intrinsic is a embedded broadcast compatible intrinsic. // // Return Value: // true if the intrinsic node lowering instruction is embedded broadcast compatible. @@ -27740,37 +27694,6 @@ bool GenTreeHWIntrinsic::OperIsBroadcastScalar() const #endif } -//------------------------------------------------------------------------ -// OperIsCreateScalarUnsafe: Is this HWIntrinsic a CreateScalarUnsafe node. -// -// Return Value: -// Whether "this" is a CreateScalarUnsafe node. -// -bool GenTreeHWIntrinsic::OperIsCreateScalarUnsafe() const -{ - NamedIntrinsic intrinsicId = GetHWIntrinsicId(); - - switch (intrinsicId) - { -#if defined(TARGET_ARM64) - case NI_Vector64_CreateScalarUnsafe: -#endif // TARGET_ARM64 - case NI_Vector128_CreateScalarUnsafe: -#if defined(TARGET_XARCH) - case NI_Vector256_CreateScalarUnsafe: - case NI_Vector512_CreateScalarUnsafe: -#endif // TARGET_XARCH - { - return true; - } - - default: - { - return false; - } - } -} - //------------------------------------------------------------------------ // OperIsBitwiseHWIntrinsic: Is the operation a bitwise logic operation. // diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 6af651b6860882..9f22ca88372380 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -6434,7 +6434,6 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic bool OperIsMemoryStoreOrBarrier() const; bool OperIsEmbBroadcastCompatible() const; bool OperIsBroadcastScalar() const; - bool OperIsCreateScalarUnsafe() const; bool OperIsBitwiseHWIntrinsic() const; bool OperIsEmbRoundingEnabled() const; @@ -6788,26 +6787,25 @@ struct GenTreeVecCon : public GenTree case TYP_LONG: case TYP_ULONG: { -#if defined(TARGET_64BIT) - if (arg->IsCnsIntOrI()) + if (arg->IsIntegralConst()) { - simdVal.i64[argIdx] = static_cast(arg->AsIntCon()->gtIconVal); + simdVal.i64[argIdx] = arg->AsIntConCommon()->IntegralValue(); return true; } -#else - if (arg->OperIsLong() && arg->AsOp()->gtOp1->IsCnsIntOrI() && arg->AsOp()->gtOp2->IsCnsIntOrI()) +#if !defined(TARGET_64BIT) + else if (arg->OperIsLong() && arg->gtGetOp1()->IsCnsIntOrI() && arg->gtGetOp2()->IsCnsIntOrI()) { - // 32-bit targets will decompose GT_CNS_LNG into two GT_CNS_INT + // 32-bit targets may decompose GT_CNS_LNG into two GT_CNS_INT // We need to reconstruct the 64-bit value in order to handle this - INT64 gtLconVal = arg->AsOp()->gtOp2->AsIntCon()->gtIconVal; + INT64 gtLconVal = arg->gtGetOp2()->AsIntCon()->gtIconVal; gtLconVal <<= 32; - gtLconVal |= arg->AsOp()->gtOp1->AsIntCon()->gtIconVal; + gtLconVal |= static_cast(arg->gtGetOp1()->AsIntCon()->gtIconVal); simdVal.i64[argIdx] = gtLconVal; return true; } -#endif // TARGET_64BIT +#endif // !TARGET_64BIT else { // We expect the constant to have been already zeroed diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index d8bf386eb6009d..d936d579d8e25a 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -923,6 +923,96 @@ struct HWIntrinsicInfo return false; } + static bool IsVectorCreate(NamedIntrinsic id) + { + switch (id) + { +#if defined(TARGET_ARM64) + case NI_Vector64_Create: +#endif // TARGET_ARM64 
+ case NI_Vector128_Create: +#if defined(TARGET_XARCH) + case NI_Vector256_Create: + case NI_Vector512_Create: +#endif // TARGET_XARCH + return true; + default: + return false; + } + } + + static bool IsVectorCreateScalar(NamedIntrinsic id) + { + switch (id) + { +#if defined(TARGET_ARM64) + case NI_Vector64_CreateScalar: +#endif // TARGET_ARM64 + case NI_Vector128_CreateScalar: +#if defined(TARGET_XARCH) + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: +#endif // TARGET_XARCH + return true; + default: + return false; + } + } + + static bool IsVectorCreateScalarUnsafe(NamedIntrinsic id) + { + switch (id) + { +#if defined(TARGET_ARM64) + case NI_Vector64_CreateScalarUnsafe: +#endif // TARGET_ARM64 + case NI_Vector128_CreateScalarUnsafe: +#if defined(TARGET_XARCH) + case NI_Vector256_CreateScalarUnsafe: + case NI_Vector512_CreateScalarUnsafe: +#endif // TARGET_XARCH + return true; + default: + return false; + } + } + + static bool IsVectorGetElement(NamedIntrinsic id) + { + switch (id) + { +#if defined(TARGET_ARM64) + case NI_Vector64_GetElement: +#endif // TARGET_ARM64 + case NI_Vector128_GetElement: +#if defined(TARGET_XARCH) + case NI_Vector256_GetElement: + case NI_Vector512_GetElement: +#endif // TARGET_XARCH + return true; + default: + return false; + } + } + + static bool IsVectorToScalar(NamedIntrinsic id) + { + switch (id) + { +#if defined(TARGET_ARM64) + case NI_Vector64_ToScalar: +#endif // TARGET_ARM64 + case NI_Vector128_ToScalar: +#if defined(TARGET_XARCH) + case NI_Vector256_ToScalar: + case NI_Vector512_ToScalar: +#endif // TARGET_XARCH + return true; + default: + return false; + } + } + static bool HasImmediateOperand(NamedIntrinsic id) { #if defined(TARGET_ARM64) diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index e96f6c0675b8f3..19792a61c4083e 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1080,7 +1080,7 @@ void CodeGen::genHWIntrinsic_R_RM( if (((instOptions & INS_OPTS_EVEX_b_MASK) != 0) && (rmOpDesc.GetKind() == OperandKind::Reg)) { - // As embedded rounding only appies in R_R case, we can skip other checks for different paths. + // As embedded rounding only applies in R_R case, we can skip other checks for different paths. regNumber op1Reg = rmOp->GetRegNum(); assert(op1Reg != REG_NA); @@ -1171,7 +1171,7 @@ void CodeGen::genHWIntrinsic_R_RM( // that failed and we either didn't get marked regOptional or we did and didn't get spilled // // As such, we need to emulate the removed CreateScalarUnsafe to ensure that op1 is in a - // SIMD register so the broadcast instruction can execute succesfully. We'll just move + // SIMD register so the broadcast instruction can execute successfully. We'll just move // the value into the target register and then broadcast it out from that. emitAttr movdAttr = emitActualTypeSize(node->GetSimdBaseType()); @@ -1435,7 +1435,7 @@ void CodeGen::genHWIntrinsic_R_R_R_RM(instruction ins, if (((instOptions & INS_OPTS_EVEX_b_MASK) != 0) && (op3Desc.GetKind() == OperandKind::Reg)) { - // As embedded rounding only appies in R_R case, we can skip other checks for different paths. + // As embedded rounding only applies in R_R case, we can skip other checks for different paths. 
regNumber op3Reg = op3->GetRegNum(); assert(op3Reg != REG_NA); @@ -1616,7 +1616,7 @@ void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsi { assert(nonConstImmReg != REG_NA); // AVX2 Gather intrinsics use managed non-const fallback since they have discrete imm8 value range - // that does work with the current compiler generated jump-table fallback + // that does not work with the current compiler generated jump-table fallback assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic)); emitter* emit = GetEmitter(); @@ -1843,13 +1843,67 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) switch (intrinsicId) { + case NI_Vector128_CreateScalar: + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: case NI_Vector128_CreateScalarUnsafe: case NI_Vector256_CreateScalarUnsafe: case NI_Vector512_CreateScalarUnsafe: { if (varTypeIsIntegral(baseType)) { - genHWIntrinsic_R_RM(node, ins, emitActualTypeSize(baseType), targetReg, op1, instOptions); + emitAttr baseAttr = emitActualTypeSize(baseType); + +#if defined(TARGET_X86) + if (varTypeIsLong(baseType)) + { + assert(op1->isContained()); + + if (op1->OperIsLong()) + { + node->SetSimdBaseJitType(CORINFO_TYPE_INT); + + bool canCombineLoad = false; + GenTree* loPart = op1->gtGetOp1(); + GenTree* hiPart = op1->gtGetOp2(); + + if ((loPart->isContained() && hiPart->isContained()) && + (loPart->OperIs(GT_LCL_FLD) && hiPart->OperIs(GT_LCL_FLD))) + { + GenTreeLclFld* loFld = loPart->AsLclFld(); + GenTreeLclFld* hiFld = hiPart->AsLclFld(); + + canCombineLoad = (hiFld->GetLclNum() == loFld->GetLclNum()) && + (hiFld->GetLclOffs() == (loFld->GetLclOffs() + 4)); + } + + if (!canCombineLoad) + { + if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + genHWIntrinsic_R_RM(node, ins, baseAttr, targetReg, loPart, instOptions); + inst_RV_RV_TT_IV(INS_pinsrd, EA_16BYTE, targetReg, targetReg, hiPart, 0x01, + !compiler->canUseVexEncoding(), instOptions); + } + else + { + regNumber tmpReg = internalRegisters.GetSingle(node); + genHWIntrinsic_R_RM(node, ins, baseAttr, targetReg, loPart, instOptions); + genHWIntrinsic_R_RM(node, ins, baseAttr, tmpReg, hiPart, instOptions); + emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, targetReg, tmpReg, instOptions); + } + break; + } + + op1 = loPart; + } + + ins = INS_movq; + baseAttr = EA_8BYTE; + } +#endif // TARGET_X86 + + genHWIntrinsic_R_RM(node, ins, baseAttr, targetReg, op1, instOptions); } else { @@ -1864,6 +1918,45 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) else { assert(instOptions == INS_OPTS_NONE); + + if (HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId)) + { + // If this is CreateScalar, we need to ensure the upper elements are zeroed. + // Scalar integer loads and loads from memory always zero the upper elements, + // so reg to reg copies of floating types are the only place we need to + // do anything different. 
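                        // Expected reg-to-reg sequences for the cases handled below (illustrative
                        // sketch only; the exact instructions depend on the available ISAs):
                        //
                        //   float  + SSE4.1 : insertps xmm1, xmm2, 0x0E   ; imm8 0x0E = zmask 0b1110, count_d 0, count_s 0
                        //   float  (SSE2)   : xorps    xmm1, xmm1
                        //                     movss    xmm1, xmm2
                        //   double          : movq     xmm1, xmm2         ; movq already zeroes the upper 64 bits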
+ + if (baseType == TYP_FLOAT) + { + if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + // insertps imm8 is: + // * Bits 0-3: zmask + // * Bits 4-5: count_d + // * Bits 6-7: count_s (register form only) + // + // We want zmask 0b1110 (0xE) to zero elements 1/2/3 + // We want count_d 0b00 (0x0) to insert the value to element 0 + // We want count_s 0b00 (0x0) as we're just taking element 0 of the source + + emit->emitIns_SIMD_R_R_R_I(INS_insertps, attr, targetReg, targetReg, op1Reg, 0x0E, + instOptions); + } + else + { + assert(targetReg != op1Reg); + emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg, instOptions); + emit->emitIns_Mov(INS_movss, attr, targetReg, op1Reg, /* canSkip */ false); + } + } + else + { + // `movq xmm xmm` zeroes the upper 64 bits. + genHWIntrinsic_R_RM(node, INS_movq, attr, targetReg, op1, instOptions); + } + break; + } + // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs emit->emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true); } @@ -2045,6 +2138,20 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) } genHWIntrinsic_R_RM(node, ins, attr, targetReg, op1, instOptions); } + else if (varTypeIsIntegral(baseType)) + { + assert(!varTypeIsLong(baseType) || TargetArchitecture::Is64Bit); + assert(HWIntrinsicInfo::IsVectorToScalar(intrinsicId)); + + attr = emitActualTypeSize(baseType); + genHWIntrinsic_R_RM(node, ins, attr, targetReg, op1, instOptions); + + if (varTypeIsSmall(baseType)) + { + emit->emitIns_Mov(ins_Move_Extend(baseType, /* srcInReg */ true), emitTypeSize(baseType), targetReg, + targetReg, /* canSkip */ false); + } + } else { assert(varTypeIsFloating(baseType)); diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 7d572587bbaab4..b684fd2b946cbb 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -62,7 +62,7 @@ HARDWARE_INTRINSIC(Vector128, ConvertToUInt32Native, HARDWARE_INTRINSIC(Vector128, ConvertToUInt64, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, ConvertToUInt64Native, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, Create, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, CreateScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, CreateScalar, 16, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, CreateScalarUnsafe, 16, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) 
HARDWARE_INTRINSIC(Vector128, CreateSequence, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector128, Dot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) @@ -114,7 +114,7 @@ HARDWARE_INTRINSIC(Vector128, StoreAligned, HARDWARE_INTRINSIC(Vector128, StoreAlignedNonTemporal, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, StoreUnsafe, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, Sum, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, ToVector256, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, ToVector256Unsafe, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, ToVector512, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) @@ -179,7 +179,7 @@ HARDWARE_INTRINSIC(Vector256, ConvertToUInt32Native, HARDWARE_INTRINSIC(Vector256, ConvertToUInt64, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, ConvertToUInt64Native, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, Create, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, 
HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) -HARDWARE_INTRINSIC(Vector256, CreateScalar, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector256, CreateScalar, 32, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, CreateScalarUnsafe, 32, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, CreateSequence, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, Dot, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) @@ -233,7 +233,7 @@ HARDWARE_INTRINSIC(Vector256, StoreAligned, HARDWARE_INTRINSIC(Vector256, StoreAlignedNonTemporal, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, StoreUnsafe, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, Sum, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, ToVector512, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, ToVector512Unsafe, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, Truncate, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, 
HW_Category_Helper, HW_Flag_InvalidNodeId) @@ -299,7 +299,7 @@ HARDWARE_INTRINSIC(Vector512, ConvertToUInt32Native, HARDWARE_INTRINSIC(Vector512, ConvertToUInt64, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector512, ConvertToUInt64Native, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector512, Create, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector512, CreateScalar, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, CreateScalar, 64, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Vector512, CreateScalarUnsafe, 64, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Vector512, CreateSequence, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector512, Dot, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) @@ -354,7 +354,7 @@ HARDWARE_INTRINSIC(Vector512, StoreAligned, HARDWARE_INTRINSIC(Vector512, StoreAlignedNonTemporal, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector512, StoreUnsafe, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector512, Sum, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(Vector512, ToScalar, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector512, ToScalar, 64, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector512, Truncate, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector512, WidenLower, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector512, WidenUpper, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 33278184d02d29..c1b04a3ddf9ba3 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -2125,16 +2125,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { if (sig->numArgs == 1) { -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !impStackTop(0).val->IsIntegralConst()) - { - // TODO-XARCH-CQ: It may be beneficial to emit the movq - // instruction, which takes a 64-bit memory address and - // works on 32-bit x86 systems. - break; - } -#endif // TARGET_X86 - op1 = impPopStack().val; retNode = gtNewSimdCreateBroadcastNode(retType, op1, simdBaseJitType, simdSize); break; @@ -2266,16 +2256,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType)) - { - // TODO-XARCH-CQ: It may be beneficial to emit the movq - // instruction, which takes a 64-bit memory address and - // works on 32-bit x86 systems. - break; - } -#endif // TARGET_X86 - IntrinsicNodeBuilder nodeBuilder(getAllocator(CMK_ASTNode), sig->numArgs); // TODO-CQ: We don't handle contiguous args for anything except TYP_FLOAT today @@ -2321,16 +2301,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 1); -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !impStackTop(0).val->IsIntegralConst()) - { - // TODO-XARCH-CQ: It may be beneficial to emit the movq - // instruction, which takes a 64-bit memory address and - // works on 32-bit x86 systems. - break; - } -#endif // TARGET_X86 - op1 = impPopStack().val; retNode = gtNewSimdCreateScalarNode(retType, op1, simdBaseJitType, simdSize); break; @@ -2342,16 +2312,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 1); -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !impStackTop(0).val->IsIntegralConst()) - { - // TODO-XARCH-CQ: It may be beneficial to emit the movq - // instruction, which takes a 64-bit memory address and - // works on 32-bit x86 systems. - break; - } -#endif // TARGET_X86 - op1 = impPopStack().val; retNode = gtNewSimdCreateScalarUnsafeNode(retType, op1, simdBaseJitType, simdSize); break; @@ -2376,27 +2336,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } } - - if (varTypeIsLong(simdBaseType)) - { - if (!impStackTop(0).val->OperIsConst()) - { - // When op2 is a constant, we can skip the multiplication allowing us to always - // generate better code. However, if it isn't then we need to fallback in the - // cases where multiplication isn't supported. 
- - if ((simdSize != 64) && !canUseEvexEncoding()) - { - // TODO-XARCH-CQ: We should support long/ulong multiplication - break; - } - } - -#if defined(TARGET_X86) - // TODO-XARCH-CQ: We need to support 64-bit CreateBroadcast - break; -#endif // TARGET_X86 - } } impSpillSideEffect(true, stackState.esStackDepth - @@ -2462,14 +2401,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_SSE41)) - { - // We need SSE41 to handle long, use software fallback - break; - } -#endif // TARGET_X86 - op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); @@ -2765,13 +2696,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case TYP_LONG: case TYP_ULONG: { - bool useToScalar = op2->IsIntegralConst(0); - -#if defined(TARGET_X86) - useToScalar &= !varTypeIsLong(simdBaseType); -#endif // TARGET_X86 - - if (!useToScalar && !compOpportunisticallyDependsOn(InstructionSet_SSE41)) + if (!op2->IsIntegralConst(0) && !compOpportunisticallyDependsOn(InstructionSet_SSE41)) { // Using software fallback if simdBaseType is not supported by hardware return nullptr; @@ -3349,15 +3274,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType)) - { - // TODO-XARCH-CQ: We can't handle long here, only because one of the args might - // be scalar, and gtNewSimdCreateBroadcastNode doesn't handle long on x86. - break; - } -#endif // TARGET_X86 - CORINFO_ARG_LIST_HANDLE arg1 = sig->args; CORINFO_ARG_LIST_HANDLE arg2 = info.compCompHnd->getArgNext(arg1); var_types argType = TYP_UNKNOWN; @@ -3514,18 +3430,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 2); -#if defined(TARGET_X86) - if ((simdBaseType == TYP_LONG) || (simdBaseType == TYP_DOUBLE)) - { - if (!compOpportunisticallyDependsOn(InstructionSet_EVEX) && !impStackTop(0).val->IsCnsIntOrI()) - { - // If vpsraq is available, we can use that. We can also trivially emulate arithmetic shift by const - // amount. Otherwise, more work is required for long types, so we fall back to managed for now. - break; - } - } -#endif // TARGET_X86 - if ((simdSize != 32) || compOpportunisticallyDependsOn(InstructionSet_AVX2)) { genTreeOps op = varTypeIsUnsigned(simdBaseType) ? 
GT_RSZ : GT_RSH; @@ -3781,14 +3685,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 1); -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_SSE41)) - { - // We need SSE41 to handle long, use software fallback - break; - } -#endif // TARGET_X86 - op1 = impSIMDPopStack(); retNode = gtNewSimdSumNode(retType, op1, simdBaseJitType, simdSize); break; @@ -3800,14 +3696,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 1); -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_SSE41)) - { - // We need SSE41 to handle long, use software fallback - break; - } -#endif // TARGET_X86 - op1 = impSIMDPopStack(); retNode = gtNewSimdToScalarNode(retType, op1, simdBaseJitType, simdSize); break; diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index fe3da0a63eb904..bd614016bdb24d 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -920,6 +920,9 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) break; } + case NI_Vector128_CreateScalar: + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: case NI_Vector128_CreateScalarUnsafe: case NI_Vector256_CreateScalarUnsafe: case NI_Vector512_CreateScalarUnsafe: @@ -927,7 +930,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) // The hwintrinsic should be contained and its // op1 should be either contained or spilled. This // allows us to transparently "look through" the - // CreateScalarUnsafe and treat it directly like + // CreateScalar/Unsafe and treat it directly like // a load from memory. assert(hwintrinsic->isContained()); diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 965493f39f346a..8b1dc5d10b1e06 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -7968,7 +7968,7 @@ PhaseStatus Lowering::DoPhase() } #if !defined(TARGET_64BIT) - DecomposeLongs decomp(comp); // Initialize the long decomposition class. + DecomposeLongs decomp(comp, this); // Initialize the long decomposition class. 
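    // The decomposer now carries the active Lowering instance so that DecomposeNode can call
    // IsSafeToContainMem when deciding whether a long value can stay un-decomposed. Callers of
    // the static helper pass it the same way, e.g. (sketch of a call site inside Lowering):
    //   DecomposeLongs::DecomposeRange(comp, this, range);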
if (comp->compLongUsed) { decomp.PrepareForDecomposition(); diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 611ceb09339233..d44880bd947554 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -464,61 +464,6 @@ class Lowering final : public Phase unsigned simdSize); #endif // FEATURE_HW_INTRINSICS - //---------------------------------------------------------------------------------------------- - // TryRemoveCastIfPresent: Removes op it is a cast operation and the size of its input is at - // least the size of expectedType - // - // Arguments: - // expectedType - The expected type of the cast operation input if it is to be removed - // op - The tree to remove if it is a cast op whose input is at least the size of expectedType - // - // Returns: - // op if it was not a cast node or if its input is not at least the size of expected type; - // Otherwise, it returns the underlying operation that was being casted - GenTree* TryRemoveCastIfPresent(var_types expectedType, GenTree* op) - { - if (!op->OperIs(GT_CAST) || !comp->opts.OptimizationEnabled()) - { - return op; - } - - GenTreeCast* cast = op->AsCast(); - GenTree* castOp = cast->CastOp(); - - // FP <-> INT casts should be kept - if (varTypeIsFloating(castOp) ^ varTypeIsFloating(expectedType)) - { - return op; - } - - // Keep casts which can overflow - if (cast->gtOverflow()) - { - return op; - } - - // Keep casts with operands usable from memory. - if (castOp->isContained() || castOp->IsRegOptional()) - { - return op; - } - - if (genTypeSize(cast->CastToType()) >= genTypeSize(expectedType)) - { -#ifndef TARGET_64BIT - // Don't expose TYP_LONG on 32bit - if (castOp->TypeIs(TYP_LONG)) - { - return op; - } -#endif - BlockRange().Remove(op); - return castOp; - } - - return op; - } - // Utility functions public: static bool IndirsAreEquivalent(GenTree* pTreeA, GenTree* pTreeB); @@ -568,6 +513,13 @@ class Lowering final : public Phase bool IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTree* childNode, bool* supportsRegOptional); #endif // FEATURE_HW_INTRINSICS + // Checks for memory conflicts in the instructions between childNode and parentNode, and returns true if childNode + // can be contained. + bool IsSafeToContainMem(GenTree* parentNode, GenTree* childNode) const; + + // Similar to above, but allows bypassing a "transparent" parent. + bool IsSafeToContainMem(GenTree* grandparentNode, GenTree* parentNode, GenTree* childNode) const; + static void TransformUnusedIndirection(GenTreeIndir* ind, Compiler* comp, BasicBlock* block); private: @@ -599,13 +551,6 @@ class Lowering final : public Phase GenTree* endExclusive, GenTree* ignoreNode) const; - // Checks for memory conflicts in the instructions between childNode and parentNode, and returns true if childNode - // can be contained. - bool IsSafeToContainMem(GenTree* parentNode, GenTree* childNode) const; - - // Similar to above, but allows bypassing a "transparent" parent. - bool IsSafeToContainMem(GenTree* grandparentNode, GenTree* parentNode, GenTree* childNode) const; - // Check if marking an operand of a node as reg-optional is safe. 
bool IsSafeToMarkRegOptional(GenTree* parentNode, GenTree* node) const; diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 963d0c4d1fbb7a..8c2528de2c2e65 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -2015,11 +2015,9 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) // bool Lowering::IsValidConstForMovImm(GenTreeHWIntrinsic* node) { - assert((node->GetHWIntrinsicId() == NI_Vector64_Create) || (node->GetHWIntrinsicId() == NI_Vector128_Create) || - (node->GetHWIntrinsicId() == NI_Vector64_CreateScalar) || - (node->GetHWIntrinsicId() == NI_Vector128_CreateScalar) || - (node->GetHWIntrinsicId() == NI_Vector64_CreateScalarUnsafe) || - (node->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) || + assert(HWIntrinsicInfo::IsVectorCreate(node->GetHWIntrinsicId()) || + HWIntrinsicInfo::IsVectorCreateScalar(node->GetHWIntrinsicId()) || + HWIntrinsicInfo::IsVectorCreateScalarUnsafe(node->GetHWIntrinsicId()) || (node->GetHWIntrinsicId() == NI_AdvSimd_DuplicateToVector64) || (node->GetHWIntrinsicId() == NI_AdvSimd_DuplicateToVector128) || (node->GetHWIntrinsicId() == NI_AdvSimd_Arm64_DuplicateToVector64) || @@ -2278,7 +2276,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) assert(simdSize != 0); bool isConstant = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simdVal); - bool isCreateScalar = (intrinsicId == NI_Vector64_CreateScalar) || (intrinsicId == NI_Vector128_CreateScalar); + bool isCreateScalar = HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId); size_t argCnt = node->GetOperandCount(); // Check if we have a cast that we can remove. Note that "IsValidConstForMovImm" diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index aa6e18f682781e..743218ecede33b 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2044,26 +2044,14 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) break; } - case NI_SSE2_Insert: case NI_SSE41_Insert: - case NI_SSE41_X64_Insert: { assert(node->GetOperandCount() == 3); - var_types simdBaseType = node->GetSimdBaseType(); - - // Insert takes either a 32-bit register or a memory operand. - // In either case, only SimdBaseType bits are read and so - // widening or narrowing the operand may be unnecessary and it - // can just be used directly. - - node->Op(2) = TryRemoveCastIfPresent(simdBaseType, node->Op(2)); - - if (simdBaseType != TYP_FLOAT) + if (node->GetSimdBaseType() != TYP_FLOAT) { break; } - assert(intrinsicId == NI_SSE41_Insert); // We have Sse41.Insert in which case we can specially handle // a couple of interesting scenarios involving chains of Inserts @@ -2272,19 +2260,6 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) break; } - case NI_SSE42_Crc32: - { - assert(node->GetOperandCount() == 2); - - // Crc32 takes either a bit register or a memory operand. - // In either case, only gtType bits are read and so widening - // or narrowing the operand may be unnecessary and it can - // just be used directly. 
- - node->Op(2) = TryRemoveCastIfPresent(node->TypeGet(), node->Op(2)); - break; - } - case NI_SSE2_CompareGreaterThan: { if (node->GetSimdBaseType() != TYP_DOUBLE) @@ -3082,6 +3057,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm comp->gtNewSimdCreateBroadcastNode(simdType, broadcastOp, op1Intrinsic->GetSimdBaseJitType(), simdSize); + assert(vecCns->IsCnsVec()); BlockRange().InsertAfter(broadcastOp, vecCns); nestedOp2 = vecCns; @@ -4032,10 +4008,9 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) GenTree* tmp2 = nullptr; GenTree* tmp3 = nullptr; - bool isConstant = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simdVal); - bool isCreateScalar = (intrinsicId == NI_Vector128_CreateScalar) || (intrinsicId == NI_Vector256_CreateScalar) || - (intrinsicId == NI_Vector512_CreateScalar); - size_t argCnt = node->GetOperandCount(); + bool isConstant = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simdVal); + bool isCreateScalar = HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId); + size_t argCnt = node->GetOperandCount(); if (isConstant) { @@ -4046,8 +4021,8 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) #if !defined(TARGET_64BIT) if (arg->OperIsLong()) { - BlockRange().Remove(arg->AsOp()->gtGetOp1()); - BlockRange().Remove(arg->AsOp()->gtGetOp2()); + BlockRange().Remove(arg->gtGetOp1()); + BlockRange().Remove(arg->gtGetOp2()); } #endif // !TARGET_64BIT BlockRange().Remove(arg); @@ -4075,165 +4050,61 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) { if (isCreateScalar) { - node->gtType = TYP_SIMD16; - node->SetSimdSize(16); - switch (simdBaseType) { case TYP_BYTE: case TYP_UBYTE: - { - // Types need to be explicitly zero-extended to ensure upper-bits are zero - // - // We need to explicitly use TYP_UBYTE since unsigned is ignored for small types - // Explicitly handle both BYTE and UBYTE to account for reinterpret casts and the like - // - // The from type is INT since that is the input type tracked by IR, where-as the target - // type needs to be UBYTE so it implicitly zero-extends back to TYP_INT - - tmp1 = comp->gtNewCastNode(TYP_INT, op1, /* unsigned */ true, TYP_UBYTE); - BlockRange().InsertAfter(op1, tmp1); - LowerNode(tmp1); - - node->ChangeHWIntrinsicId(NI_SSE2_ConvertScalarToVector128Int32, tmp1); - node->SetSimdBaseJitType(CORINFO_TYPE_INT); - break; - } - case TYP_SHORT: case TYP_USHORT: { - // Types need to be explicitly zero-extended to ensure upper-bits are zero + // The smallest scalar SIMD load that zeroes upper elements is 32 bits, so for CreateScalar, + // we must ensure that the upper bits of that 32-bit value are zero if the base type is small. // - // We need to explicitly use TYP_USHORT since unsigned is ignored for small types - // Explicitly handle both SHORT and USHORT to account for reinterpret casts and the like + // The most likely case is that op1 is a cast from int/long to the base type: + // * CAST int <- short <- int/long + // If the base type is signed, that cast will be sign-extending, but we need zero extension, + // so we can simply retype the cast to the unsigned type of the same size. 
// - // The from type is INT since that is the input type tracked by IR, where-as the target - // type needs to be USHORT so it implicitly zero-extends back to TYP_INT - - tmp1 = comp->gtNewCastNode(TYP_INT, op1, /* unsigned */ true, TYP_USHORT); - BlockRange().InsertAfter(op1, tmp1); - LowerNode(tmp1); + // It's also possible we have a memory load of the base type: + // * IND short + // We can likewise change the type of the indir to force zero extension on load. + // + // If we can't safely retype one of the above patterns and don't already have a cast to the + // correct unsigned type, we will insert our own cast. - node->ChangeHWIntrinsicId(NI_SSE2_ConvertScalarToVector128Int32, tmp1); node->SetSimdBaseJitType(CORINFO_TYPE_INT); - break; - } - - case TYP_INT: - { - node->ChangeHWIntrinsicId(NI_SSE2_ConvertScalarToVector128Int32); - break; - } - case TYP_UINT: - { - node->ChangeHWIntrinsicId(NI_SSE2_ConvertScalarToVector128UInt32); - break; - } + var_types unsignedType = varTypeToUnsigned(simdBaseType); -#if defined(TARGET_AMD64) - case TYP_LONG: - { - node->ChangeHWIntrinsicId(NI_SSE2_X64_ConvertScalarToVector128Int64); - break; - } - - case TYP_ULONG: - { - node->ChangeHWIntrinsicId(NI_SSE2_X64_ConvertScalarToVector128UInt64); - break; - } -#endif // TARGET_AMD64 - - case TYP_FLOAT: - { - tmp1 = comp->gtNewZeroConNode(simdType); - BlockRange().InsertBefore(op1, tmp1); - LowerNode(tmp1); - - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + if (op1->OperIs(GT_CAST) && !op1->gtOverflow()) { - // Sse41.Insert has: - // * Bits 0-3: zmask - // * Bits 4-5: count_d - // * Bits 6-7: count_s (register form only) - // - // We want zmask 0b1110 (0xE) to zero elements 1/2/3 - // We want count_d 0b00 (0x0) to insert the value to element 0 - // We want count_s 0b00 (0x0) as we're just taking element 0 of the source - - idx = comp->gtNewIconNode(0x0E); - BlockRange().InsertAfter(op1, idx); - LowerNode(idx); - - node->ResetHWIntrinsicId(NI_SSE41_Insert, comp, tmp1, op1, idx); + assert(op1->TypeIs(TYP_INT) && (genTypeSize(op1->CastToType()) == genTypeSize(simdBaseType))); + op1->AsCast()->gtCastType = unsignedType; } - else + else if (op1->OperIs(GT_IND, GT_LCL_FLD)) { - node->ResetHWIntrinsicId(NI_SSE_MoveScalar, comp, tmp1, op1); + assert(genTypeSize(op1) == genTypeSize(simdBaseType)); + op1->gtType = unsignedType; + } + else if (!op1->OperIs(GT_CAST) || (op1->AsCast()->CastToType() != unsignedType)) + { + tmp1 = comp->gtNewCastNode(TYP_INT, op1, /* fromUnsigned */ false, unsignedType); + node->Op(1) = tmp1; + BlockRange().InsertAfter(op1, tmp1); + LowerNode(tmp1); } - break; - } - - case TYP_DOUBLE: - { - tmp1 = comp->gtNewZeroConNode(simdType); - BlockRange().InsertBefore(op1, tmp1); - LowerNode(tmp1); - node->ResetHWIntrinsicId(NI_SSE2_MoveScalar, comp, tmp1, op1); break; } default: { - unreached(); - } - } - - if (simdSize > 16) - { - assert((simdSize == 32) || (simdSize == 64)); - - // We're creating a Vector256/512 scalar so we need to treat the original op as Vector128, - // we need to unsafely extend up to Vector256/512 (which is actually safe since the 128-bit - // op will zero extend up to 256/512-bits), and then we need to replace the original use - // with the new TYP_SIMD32/64 node. 
- - node->ChangeType(TYP_SIMD16); - node->SetSimdSize(16); - LowerNode(node); - - LIR::Use use; - bool foundUse = BlockRange().TryGetUse(node, &use); - - tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, node, NI_Vector128_ToVector256Unsafe, simdBaseJitType, - 16); - BlockRange().InsertAfter(node, tmp2); - - if (simdSize == 64) - { - tmp3 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, tmp2, NI_Vector256_ToVector512Unsafe, - simdBaseJitType, 32); - BlockRange().InsertAfter(tmp2, tmp3); - tmp2 = tmp3; - } - - if (foundUse) - { - use.ReplaceWith(tmp2); - } - else - { - node->ClearUnusedValue(); - tmp2->SetUnusedValue(); + break; } - - node = tmp2->AsHWIntrinsic(); } - return LowerNode(node); + ContainCheckHWIntrinsic(node); + return node->gtNext; } // We have the following (where simd is simd16, simd32 or simd64): @@ -4509,40 +4380,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) break; } -#if defined(TARGET_AMD64) - case TYP_LONG: - case TYP_ULONG: - { - // We will be constructing the following parts: - // ... - // /--* tmp1 simd16 - // * STORE_LCL_VAR simd16 - // tmp1 = LCL_VAR simd16 - // tmp2 = LCL_VAR simd16 - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 ulong UnpackLow - - // This is roughly the following managed code: - // ... - // var tmp2 = tmp1; - // return Sse2.UnpackLow(tmp1, tmp2); - - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - - node->Op(1) = tmp1; - LIR::Use tmp1Use(BlockRange(), &node->Op(1), node); - ReplaceWithLclVar(tmp1Use); - tmp1 = node->Op(1); - - tmp2 = comp->gtClone(tmp1); - BlockRange().InsertAfter(tmp1, tmp2); - - node->ResetHWIntrinsicId(NI_SSE2_UnpackLow, tmp1, tmp2); - break; - } -#endif // TARGET_AMD64 - case TYP_FLOAT: { if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX)) @@ -4599,9 +4436,12 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) break; } + case TYP_LONG: + case TYP_ULONG: case TYP_DOUBLE: { - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) + if ((IsContainableMemoryOp(op1) || simdBaseType == TYP_DOUBLE) && + comp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) { // We will be constructing the following parts: // ... @@ -4613,6 +4453,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // return Sse3.MoveAndDuplicate(tmp1); node->ChangeHWIntrinsicId(NI_SSE3_MoveAndDuplicate, tmp1); + node->SetSimdBaseJitType(CORINFO_TYPE_DOUBLE); break; } @@ -4626,12 +4467,12 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // tmp2 = LCL_VAR simd16 // /--* tmp1 simd16 // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 float MoveLowToHigh + // node = * HWINTRINSIC simd16 T UnpackLow // This is roughly the following managed code: // ... // var tmp2 = tmp1; - // return Sse.MoveLowToHigh(tmp1, tmp2); + // return Sse2.UnpackLow(tmp1, tmp2); node->Op(1) = tmp1; LIR::Use tmp1Use(BlockRange(), &node->Op(1), node); @@ -4641,8 +4482,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) tmp2 = comp->gtClone(tmp1); BlockRange().InsertAfter(tmp1, tmp2); - node->ResetHWIntrinsicId(NI_SSE_MoveLowToHigh, tmp1, tmp2); - node->SetSimdBaseJitType(CORINFO_TYPE_FLOAT); + node->ResetHWIntrinsicId(NI_SSE2_UnpackLow, tmp1, tmp2); break; } @@ -4655,19 +4495,16 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) return LowerNode(node); } - GenTree* op2 = node->Op(2); - - // TODO-XArch-AVX512 : Merge the NI_Vector512_Create and NI_Vector256_Create paths below. 
- // We have the following (where simd is simd16 or simd32): - // /--* op1 T - // +--* ... T - // +--* opN T - // node = * HWINTRINSIC simd T Create - if (intrinsicId == NI_Vector512_Create) + if (intrinsicId == NI_Vector512_Create || intrinsicId == NI_Vector256_Create) { - assert(comp->IsBaselineVector512IsaSupportedDebugOnly()); + assert(argCnt >= (simdSize / genTypeSize(TYP_LONG))); + assert(((simdSize == 64) && comp->IsBaselineVector512IsaSupportedDebugOnly()) || + ((simdSize == 32) && comp->IsBaselineVector256IsaSupportedDebugOnly())); - // We will be constructing the following parts: + // The larger vector implementation is simplified by splitting the + // job in half and delegating to the next smaller vector size. + // + // For example, for Vector512, we construct the following: // /--* op1 T // +--* ... T // lo = * HWINTRINSIC simd32 T Create @@ -4697,86 +4534,35 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // lo = Vector256.Create(op1, ..., op16); // hi = Vector256.Create(op17, ..., op32); + var_types halfType = comp->getSIMDTypeForSize(simdSize / 2); + NamedIntrinsic halfCreate = (simdSize == 64) ? NI_Vector256_Create : NI_Vector128_Create; + NamedIntrinsic withUpper = (simdSize == 64) ? NI_Vector512_WithUpper : NI_Vector256_WithUpper; + size_t halfArgCnt = argCnt / 2; assert((halfArgCnt * 2) == argCnt); GenTree* loInsertionPoint = LIR::LastNode(node->GetOperandArray(), halfArgCnt); - - GenTree* lo = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, node->GetOperandArray(), halfArgCnt, - NI_Vector256_Create, simdBaseJitType, 32); - BlockRange().InsertAfter(loInsertionPoint, lo); - GenTree* hiInsertionPoint = LIR::LastNode(node->GetOperandArray(halfArgCnt), halfArgCnt); - GenTree* hi = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, node->GetOperandArray(halfArgCnt), halfArgCnt, - NI_Vector256_Create, simdBaseJitType, 32); - BlockRange().InsertAfter(hiInsertionPoint, hi); - - assert(argCnt >= 7); - node->ResetHWIntrinsicId(NI_Vector512_WithUpper, comp, lo, hi); - - LowerNode(lo); - LowerNode(hi); - return LowerNode(node); - } - else if (intrinsicId == NI_Vector256_Create) - { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); + GenTree* lo = comp->gtNewSimdHWIntrinsicNode(halfType, node->GetOperandArray(), halfArgCnt, halfCreate, + simdBaseJitType, simdSize / 2); - // We will be constructing the following parts: - // /--* op1 T - // +--* ... T - // lo = * HWINTRINSIC simd16 T Create - // /--* ... T - // +--* opN T - // hi = * HWINTRINSIC simd16 T Create - // /--* lo simd32 - // +--* hi simd16 - // node = * HWINTRINSIC simd32 T WithUpper + GenTree* hi = comp->gtNewSimdHWIntrinsicNode(halfType, node->GetOperandArray(halfArgCnt), halfArgCnt, + halfCreate, simdBaseJitType, simdSize / 2); - // This is roughly the following managed code: - // ... - // var lo = Vector128.Create(op1, ...); - // var hi = Vector128.Create(..., opN); - // return lo.WithUpper(hi); - - // Each Vector128.Create call gets half the operands. 
That is: - // lo = Vector128.Create(op1, op2); - // hi = Vector128.Create(op3, op4); - // -or- - // lo = Vector128.Create(op1, ..., op4); - // hi = Vector128.Create(op5, ..., op8); - // -or- - // lo = Vector128.Create(op1, ..., op8); - // hi = Vector128.Create(op9, ..., op16); - // -or- - // lo = Vector128.Create(op1, ..., op16); - // hi = Vector128.Create(op17, ..., op32); + node->ResetHWIntrinsicId(withUpper, comp, lo, hi); - size_t halfArgCnt = argCnt / 2; - assert((halfArgCnt * 2) == argCnt); - - GenTree* loInsertionPoint = LIR::LastNode(node->GetOperandArray(), halfArgCnt); - - GenTree* lo = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, node->GetOperandArray(), halfArgCnt, - NI_Vector128_Create, simdBaseJitType, 16); BlockRange().InsertAfter(loInsertionPoint, lo); - - GenTree* hiInsertionPoint = LIR::LastNode(node->GetOperandArray(halfArgCnt), halfArgCnt); - - GenTree* hi = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, node->GetOperandArray(halfArgCnt), halfArgCnt, - NI_Vector128_Create, simdBaseJitType, 16); BlockRange().InsertAfter(hiInsertionPoint, hi); - assert(argCnt >= 3); - node->ResetHWIntrinsicId(NI_Vector256_WithUpper, comp, lo, hi); - LowerNode(lo); LowerNode(hi); return LowerNode(node); } + assert(intrinsicId == NI_Vector128_Create); + // We will be constructing the following parts: // /--* op1 T // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe @@ -4975,54 +4761,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) break; } -#if defined(TARGET_AMD64) - case TYP_LONG: - case TYP_ULONG: - { - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41_X64)) - { - // We will be constructing the following parts: - // ... - // idx = CNS_INT int 1 - // /--* tmp1 simd16 - // +--* op2 T - // +--* idx int - // node = * HWINTRINSIC simd16 T Insert - - // This is roughly the following managed code: - // ... - // return Sse41.X64.Insert(tmp1, op2, 0x01); - - idx = comp->gtNewIconNode(0x01, TYP_INT); - BlockRange().InsertBefore(node, idx); - - node->ResetHWIntrinsicId(NI_SSE41_X64_Insert, comp, tmp1, op2, idx); - break; - } - - // We will be constructing the following parts: - // ... - // /--* op2 T - // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 T UnpackLow - - // This is roughly the following managed code: - // ... - // var tmp2 = Vector128.CreateScalarUnsafe(op2); - // return Sse2.UnpackLow(tmp1, tmp2); - - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - - tmp2 = InsertNewSimdCreateScalarUnsafeNode(TYP_SIMD16, op2, simdBaseJitType, 16); - LowerNode(tmp2); - - node->ResetHWIntrinsicId(NI_SSE2_UnpackLow, tmp1, tmp2); - break; - } -#endif // TARGET_AMD64 - case TYP_FLOAT: { unsigned N = 0; @@ -5162,28 +4900,52 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) break; } + case TYP_LONG: + case TYP_ULONG: case TYP_DOUBLE: { + GenTree* op2 = node->Op(2); + + if (varTypeIsLong(simdBaseType) && comp->compOpportunisticallyDependsOn(InstructionSet_SSE41_X64)) + { + // We will be constructing the following parts: + // ... + // idx = CNS_INT int 1 + // /--* tmp1 simd16 + // +--* op2 T + // +--* idx int + // node = * HWINTRINSIC simd16 T Insert + + // This is roughly the following managed code: + // ... 
+ // return Sse41.X64.Insert(tmp1, op2, 0x01); + + idx = comp->gtNewIconNode(0x01, TYP_INT); + BlockRange().InsertBefore(node, idx); + + node->ResetHWIntrinsicId(NI_SSE41_X64_Insert, comp, tmp1, op2, idx); + break; + } + // We will be constructing the following parts: // ... // /--* op2 T // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe // /--* tmp1 simd16 // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 T MoveLowToHigh + // node = * HWINTRINSIC simd16 T UnpackLow // This is roughly the following managed code: // ... // var tmp2 = Vector128.CreateScalarUnsafe(op2); - // return Sse.MoveLowToHigh(tmp1, tmp2); + // return Sse2.UnpackLow(tmp1, tmp2); assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); tmp2 = InsertNewSimdCreateScalarUnsafeNode(TYP_SIMD16, op2, simdBaseJitType, 16); LowerNode(tmp2); - node->ResetHWIntrinsicId(NI_SSE_MoveLowToHigh, tmp1, tmp2); - node->SetSimdBaseJitType(CORINFO_TYPE_FLOAT); + node->ResetHWIntrinsicId(NI_SSE2_UnpackLow, tmp1, tmp2); break; } @@ -5210,9 +4972,7 @@ GenTree* Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node) var_types simdBaseType = node->GetSimdBaseType(); unsigned simdSize = node->GetSimdSize(); - assert((intrinsicId == NI_Vector128_GetElement) || (intrinsicId == NI_Vector256_GetElement) || - (intrinsicId == NI_Vector512_GetElement)); - + assert(HWIntrinsicInfo::IsVectorGetElement(intrinsicId)); assert(!varTypeIsSIMD(simdType)); assert(varTypeIsArithmetic(simdBaseType)); assert(simdSize != 0); @@ -6835,24 +6595,25 @@ GenTree* Lowering::LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node) unsigned simdSize = node->GetSimdSize(); var_types simdType = Compiler::getSIMDTypeForSize(simdSize); - assert((intrinsicId == NI_Vector128_ToScalar) || (intrinsicId == NI_Vector256_ToScalar) || - (intrinsicId == NI_Vector512_ToScalar)); - + assert(HWIntrinsicInfo::IsVectorToScalar(intrinsicId)); assert(varTypeIsSIMD(simdType)); assert(varTypeIsArithmetic(simdBaseType)); assert(simdSize != 0); GenTree* op1 = node->Op(1); - if (IsContainableMemoryOp(op1)) + if (IsContainableMemoryOp(op1) && (!varTypeIsLong(simdBaseType) || TargetArchitecture::Is64Bit)) { - // We will specially handle ToScalar when op1 is already in memory + // If op1 is already in memory, we'd like the consumer of ToScalar to be able to look + // through to the memory directly. Early folding is preferable, as it unlocks additional + // containment opportunities for the consuming nodes. If we can't fold away ToScalar, + // we will still contain op1 if possible, and let codegen try to peek through to it. + // + // However, we specifically need to avoid doing this for long on 32-bit because we are + // already past DecomposeLongs, and codegen wouldn't be able to handle it.
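As a rough standalone illustration of the fold described above (plain C++ with compiler intrinsics rather than JIT IR; the helper names are invented), ToScalar of a long vector that already lives in memory reduces to a direct 64-bit read of element 0, which is only straightforward when a 64-bit GPR or a contained movq store is available:

#include <immintrin.h>
#include <cstdint>
#include <cstring>

// Folded shape: element 0 of the vector is simply the first 8 bytes at the address.
int64_t to_scalar_folded(const void* vec)
{
    int64_t result;
    std::memcpy(&result, vec, sizeof(result)); // a single 64-bit load on x64
    return result;
}

// Unfolded shape: the value round-trips through an XMM register.
int64_t to_scalar_unfolded(const __m128i* vec)
{
    __m128i v = _mm_loadu_si128(vec);
#if defined(_M_X64) || defined(__x86_64__)
    return _mm_cvtsi128_si64(v);               // movq r64, xmm (64-bit targets only)
#else
    // 32-bit x86 has no 64-bit GPR, so the long must either stay decomposed or
    // flow between memory operands (e.g. movq xmm -> [mem]).
    int64_t result;
    _mm_storel_epi64(reinterpret_cast<__m128i*>(&result), v);
    return result;
#endif
}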
if (op1->OperIs(GT_IND)) { - // We want to optimize ToScalar down to an Indir where possible as - // this unlocks additional containment opportunities for various nodes - GenTreeIndir* indir = op1->AsIndir(); GenTreeIndir* newIndir = @@ -6879,9 +6640,6 @@ GenTree* Lowering::LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node) { uint32_t elemSize = genTypeSize(simdBaseType); - // We want to optimize ToScalar down to a LclFld where possible as - // this unlocks additional containment opportunities for various nodes - GenTreeLclVarCommon* lclVar = op1->AsLclVarCommon(); uint32_t lclOffs = lclVar->GetLclOffs() + (0 * elemSize); LclVarDsc* lclDsc = comp->lvaGetDesc(lclVar); @@ -6908,92 +6666,10 @@ GenTree* Lowering::LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node) return LowerNode(lclFld); } } - - if (IsSafeToContainMem(node, op1)) - { - // Handle other cases in codegen - ContainCheckHWIntrinsic(node); - return node->gtNext; - } } - switch (simdBaseType) - { - case TYP_BYTE: - case TYP_SHORT: - case TYP_INT: - { - node->gtType = TYP_INT; - node->SetSimdBaseJitType(CORINFO_TYPE_INT); - node->ChangeHWIntrinsicId(NI_SSE2_ConvertToInt32); - break; - } - - case TYP_UBYTE: - case TYP_USHORT: - case TYP_UINT: - { - node->gtType = TYP_INT; - node->SetSimdBaseJitType(CORINFO_TYPE_UINT); - node->ChangeHWIntrinsicId(NI_SSE2_ConvertToUInt32); - break; - } - -#if defined(TARGET_AMD64) - case TYP_LONG: - { - node->ChangeHWIntrinsicId(NI_SSE2_X64_ConvertToInt64); - break; - } - - case TYP_ULONG: - { - node->ChangeHWIntrinsicId(NI_SSE2_X64_ConvertToUInt64); - break; - } -#endif // TARGET_AMD64 - - case TYP_FLOAT: - case TYP_DOUBLE: - { - ContainCheckHWIntrinsic(node); - return node->gtNext; - } - - default: - { - unreached(); - } - } - - GenTree* next = LowerNode(node); - - if (genTypeSize(simdBaseType) < 4) - { - // The move intrinsics do not touch the upper bits, so we need an explicit - // cast to ensure the result is properly sign extended - - LIR::Use use; - - bool foundUse = BlockRange().TryGetUse(node, &use); - bool fromUnsigned = varTypeIsUnsigned(simdBaseType); - - GenTreeCast* cast = comp->gtNewCastNode(TYP_INT, node, fromUnsigned, simdBaseType); - BlockRange().InsertAfter(node, cast); - - if (foundUse) - { - use.ReplaceWith(cast); - } - else - { - node->ClearUnusedValue(); - cast->SetUnusedValue(); - } - next = LowerNode(cast); - } - - return next; + ContainCheckHWIntrinsic(node); + return node->gtNext; } //---------------------------------------------------------------------------------------------- @@ -8016,20 +7692,38 @@ void Lowering::ContainCheckStoreIndir(GenTreeStoreInd* node) case NI_Vector256_ToScalar: case NI_Vector512_ToScalar: { - if (varTypeIsFloating(simdBaseType)) + // These intrinsics are "ins reg/mem, xmm" or "ins xmm, reg/mem" + // + // In the case we are coming from and going to memory, we want to + // preserve the original containment as we'll end up emitting a pair + // of scalar moves. e.g. for float: + // movss xmm0, [addr1] ; Size: 4, Latency: 4-7, TP: 0.5 + // movss [addr2], xmm0 ; Size: 4, Latency: 4-10, TP: 1 + // + // However, we want to prefer containing the store over allowing the + // input to be regOptional, so track and clear containment if required. 
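As a rough standalone sketch (ordinary intrinsic C++, not JIT code; the helper names are invented), these are the store shapes the containment above and the small-integer rewrite that follows are aiming for: a scalar move straight from the vector register, or an element-0 extract when the base type is a small integer:

#include <immintrin.h>
#include <cstdint>

void store_scalar_float(float* dst, __m128 v)
{
    _mm_store_ss(dst, v);                     // movss [mem], xmm
}

void store_scalar_u16(uint16_t* dst, __m128i v)
{
    *dst = (uint16_t)_mm_extract_epi16(v, 0); // pextrw, element 0 (the memory-destination form requires SSE4.1)
}

void store_scalar_u8(uint8_t* dst, __m128i v)
{
    *dst = (uint8_t)_mm_extract_epi8(v, 0);   // pextrb, element 0 (SSE4.1)
}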
+ + clearContainedNode = hwintrinsic->Op(1); + isContainable = !clearContainedNode->isContained(); + + if (isContainable && varTypeIsIntegral(simdBaseType)) { - // These intrinsics are "ins reg/mem, xmm" or "ins xmm, reg/mem" - // - // In the case we are coming from and going to memory, we want to - // preserve the original containment as we'll end up emitting: - // movss xmm0, [addr1] ; Size: 4, Latency: 4-7, TP: 0.5 - // movss [addr2], xmm0 ; Size: 4, Latency: 4-10, TP: 1 - // - // However, we want to prefer containing the store over allowing the - // input to be regOptional, so track and clear containment if required. + isContainable = (genTypeSize(simdBaseType) == genTypeSize(node)) && + (!varTypeIsSmall(simdBaseType) || + comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)); - clearContainedNode = hwintrinsic->Op(1); - isContainable = !clearContainedNode->isContained(); + if (isContainable && varTypeIsSmall(simdBaseType)) + { + CorInfoType baseJitType = varTypeIsByte(node) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_USHORT; + intrinsicId = varTypeIsByte(node) ? NI_SSE41_Extract : NI_SSE2_Extract; + + GenTree* zero = comp->gtNewZeroConNode(TYP_INT); + BlockRange().InsertBefore(hwintrinsic, zero); + + hwintrinsic->SetSimdBaseJitType(baseJitType); + hwintrinsic->ResetHWIntrinsicId(intrinsicId, hwintrinsic->Op(1), zero); + zero->SetContained(); + } } break; } @@ -9310,6 +9004,9 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre switch (parentIntrinsicId) { + case NI_Vector128_CreateScalar: + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: case NI_Vector128_CreateScalarUnsafe: case NI_Vector256_CreateScalarUnsafe: case NI_Vector512_CreateScalarUnsafe: @@ -9414,6 +9111,9 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre switch (intrinsicId) { + case NI_Vector128_CreateScalar: + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: case NI_Vector128_CreateScalarUnsafe: case NI_Vector256_CreateScalarUnsafe: case NI_Vector512_CreateScalarUnsafe: @@ -9509,7 +9209,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre { GenTreeHWIntrinsic* hwintrinsicOperand = broadcastOperand->AsHWIntrinsic(); - if (hwintrinsicOperand->OperIsCreateScalarUnsafe()) + if (HWIntrinsicInfo::IsVectorCreateScalarUnsafe(hwintrinsicOperand->GetHWIntrinsicId())) { // CreateScalarUnsafe can contain non-memory operands such as enregistered // locals, so we want to check if its operand is containable instead. 
This @@ -9856,9 +9556,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if ((simdSize == 8) || (simdSize == 12)) { // We want to handle GetElement/ToScalar still for Vector2/3 - if ((intrinsicId != NI_Vector128_GetElement) && (intrinsicId != NI_Vector128_ToScalar) && - (intrinsicId != NI_Vector256_GetElement) && (intrinsicId != NI_Vector256_ToScalar) && - (intrinsicId != NI_Vector512_GetElement) && (intrinsicId != NI_Vector512_ToScalar)) + if (!HWIntrinsicInfo::IsVectorToScalar(intrinsicId) && !HWIntrinsicInfo::IsVectorGetElement(intrinsicId)) { // TODO-XArch-CQ: Ideally we would key this off of the size the containing node // expects vs the size node actually is or would be if spilled to the stack @@ -9967,7 +9665,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) { GenTreeHWIntrinsic* childNode = op1->AsHWIntrinsic(); - if (childNode->OperIsCreateScalarUnsafe()) + if (HWIntrinsicInfo::IsVectorCreateScalarUnsafe(childNode->GetHWIntrinsicId())) { // We have a very special case of BroadcastScalarToVector(CreateScalarUnsafe(op1)) // @@ -9979,6 +9677,12 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) // op1 directly, we'll then special case the codegen to materialize the value into a // SIMD register in the case it is marked optional and doesn't get spilled. + if (childNode->Op(1)->OperIsLong()) + { + // Decomposed longs require special codegen + return; + } + node->Op(1) = childNode->Op(1); BlockRange().Remove(op1); @@ -10083,6 +9787,50 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) return; } +#ifdef TARGET_X86 + case NI_Vector128_CreateScalar: + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: + case NI_Vector128_CreateScalarUnsafe: + case NI_Vector256_CreateScalarUnsafe: + case NI_Vector512_CreateScalarUnsafe: + { + if (op1->OperIsLong()) + { + // Contain decomposed longs and handle them in codegen + assert(varTypeIsLong(simdBaseType)); + + for (GenTree* longOp : op1->Operands()) + { + if (IsContainableMemoryOp(longOp) && IsSafeToContainMem(node, longOp)) + { + MakeSrcContained(node, longOp); + } + else if (IsSafeToMarkRegOptional(node, longOp)) + { + MakeSrcRegOptional(node, longOp); + } + } + + MakeSrcContained(node, op1); + return; + } + break; + } + + case NI_Vector128_ToScalar: + case NI_Vector256_ToScalar: + case NI_Vector512_ToScalar: + { + // These will be contained by a STOREIND + if (varTypeIsLong(simdBaseType)) + { + return; + } + break; + } +#endif + default: { break; diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index eb427bea5ab886..3ef6952fb8aec5 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2045,39 +2045,47 @@ int LinearScan::BuildIntrinsic(GenTree* tree) #ifdef FEATURE_HW_INTRINSICS //------------------------------------------------------------------------ -// SkipContainedCreateScalarUnsafe: Skips a contained CreateScalarUnsafe node +// SkipContainedUnaryOp: Skips a contained non-memory or const node // and gets the underlying op1 instead // // Arguments: // node - The node to handle // // Return Value: -// If node is a contained CreateScalarUnsafe, it's op1 is returned; +// If node is a contained non-memory or const unary op, its op1 is returned; // otherwise node is returned unchanged. 
-static GenTree* SkipContainedCreateScalarUnsafe(GenTree* node) +static GenTree* SkipContainedUnaryOp(GenTree* node) { - if (!node->OperIsHWIntrinsic() || !node->isContained()) + if (!node->isContained()) { return node; } - GenTreeHWIntrinsic* hwintrinsic = node->AsHWIntrinsic(); - NamedIntrinsic intrinsicId = hwintrinsic->GetHWIntrinsicId(); - - switch (intrinsicId) + if (node->OperIsHWIntrinsic()) { - case NI_Vector128_CreateScalarUnsafe: - case NI_Vector256_CreateScalarUnsafe: - case NI_Vector512_CreateScalarUnsafe: - { - return hwintrinsic->Op(1); - } + GenTreeHWIntrinsic* hwintrinsic = node->AsHWIntrinsic(); + NamedIntrinsic intrinsicId = hwintrinsic->GetHWIntrinsicId(); - default: + switch (intrinsicId) { - return node; + case NI_Vector128_CreateScalar: + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: + case NI_Vector128_CreateScalarUnsafe: + case NI_Vector256_CreateScalarUnsafe: + case NI_Vector512_CreateScalarUnsafe: + { + return hwintrinsic->Op(1); + } + + default: + { + break; + } } } + + return node; } //------------------------------------------------------------------------ @@ -2134,8 +2142,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou } else { - // A contained CreateScalarUnsafe is special in that we're not containing it to load from - // memory and it isn't a constant. Instead, its essentially a "transparent" node we're ignoring + // In a few cases, we contain an operand that isn't a load from memory or a constant. Instead, + // it is essentially a "transparent" node we're ignoring or handling specially in codegen // to simplify the overall IR handling. As such, we need to "skip" such nodes when present and // get the underlying op1 so that delayFreeUse and other preferencing remains correct. @@ -2144,37 +2152,37 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou GenTree* op3 = nullptr; GenTree* op4 = nullptr; GenTree* op5 = nullptr; - GenTree* lastOp = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(numArgs)); + GenTree* lastOp = SkipContainedUnaryOp(intrinsicTree->Op(numArgs)); switch (numArgs) { case 5: { - op5 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(5)); + op5 = SkipContainedUnaryOp(intrinsicTree->Op(5)); FALLTHROUGH; } case 4: { - op4 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(4)); + op4 = SkipContainedUnaryOp(intrinsicTree->Op(4)); FALLTHROUGH; } case 3: { - op3 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(3)); + op3 = SkipContainedUnaryOp(intrinsicTree->Op(3)); FALLTHROUGH; } case 2: { - op2 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(2)); + op2 = SkipContainedUnaryOp(intrinsicTree->Op(2)); FALLTHROUGH; } case 1: { - op1 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(1)); + op1 = SkipContainedUnaryOp(intrinsicTree->Op(1)); break; } @@ -2223,11 +2231,14 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // must be handled within the case. 
switch (intrinsicId) { + case NI_Vector128_CreateScalar: + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: case NI_Vector128_CreateScalarUnsafe: - case NI_Vector128_ToScalar: case NI_Vector256_CreateScalarUnsafe: - case NI_Vector256_ToScalar: case NI_Vector512_CreateScalarUnsafe: + case NI_Vector128_ToScalar: + case NI_Vector256_ToScalar: case NI_Vector512_ToScalar: { assert(numArgs == 1); @@ -2241,17 +2252,38 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou } else { - // We will either be in memory and need to be moved - // into a register of the appropriate size or we - // are already in an XMM/YMM/ZMM register and can stay - // where we are. + // CreateScalarUnsafe and ToScalar are essentially no-ops for floating point types and can reuse + // the op1 register. CreateScalar needs to clear the upper elements, so if we have a float and + // can't use insertps to zero the upper elements in-place, we'll need a different target reg. - tgtPrefUse = BuildUse(op1); + RefPosition* op1Use = BuildUse(op1); srcCount += 1; + + if ((baseType == TYP_FLOAT) && HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId) && + !compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + setDelayFree(op1Use); + } + else + { + tgtPrefUse = op1Use; + } } buildUses = false; } +#if TARGET_X86 + else if (varTypeIsByte(baseType) && HWIntrinsicInfo::IsVectorToScalar(intrinsicId)) + { + dstCandidates = allByteRegs(); + } + else if (varTypeIsLong(baseType) && !compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + // For SSE2 fallbacks, we will need a temp register to insert the upper half of a long + buildInternalFloatRegisterDefForNode(intrinsicTree); + setInternalRegsDelayFree = true; + } +#endif // TARGET_X86 break; }
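To make the register constraints above concrete, here is a standalone sketch (intrinsic C++, not JIT code; function names are invented and the exact instruction selection in codegen may differ): with SSE4.1, insertps lets CreateScalar(float) zero the upper elements in place; the SSE2 fallback merges into a separately zeroed register, which is why the source is marked delay-free; and the 32-bit SSE2 fallback for a long builds the value from two halves through a temporary register.

#include <immintrin.h>
#include <cstdint>

// CreateScalar(float) with SSE4.1: imm 0x0E zeroes elements 1-3 in place,
// so the source and destination can share a register.
__m128 create_scalar_f32_sse41(__m128 src)
{
    return _mm_insert_ps(src, src, 0x0E);
}

// Without SSE4.1: merge the scalar into a zeroed register with movss; the
// destination must differ from the source register.
__m128 create_scalar_f32_sse2(__m128 src)
{
    return _mm_move_ss(_mm_setzero_ps(), src);
}

// CreateScalar(long) on 32-bit x86 without SSE4.1: move each 32-bit half into
// its own XMM register (the temp) and interleave them into element 0.
__m128i create_scalar_i64_sse2_x86(uint32_t lo, uint32_t hi)
{
    __m128i vlo = _mm_cvtsi32_si128((int)lo);   // movd, upper lanes zeroed
    __m128i vhi = _mm_cvtsi32_si128((int)hi);   // temp register holding the upper half
    return _mm_unpacklo_epi32(vlo, vhi);        // punpckldq -> [lo, hi, 0, 0]
}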