diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index e0cbd9777da41a..0f41ad622f8e82 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -5685,6 +5685,13 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree) // These intrinsics are "ins reg/mem, xmm" ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); attr = emitActualTypeSize(baseType); +#if defined(TARGET_X86) + if (varTypeIsLong(baseType)) + { + ins = INS_movq; + attr = EA_8BYTE; + } +#endif // TARGET_X86 break; } diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp index 844263d400cfdb..0461a12d181d3a 100644 --- a/src/coreclr/jit/decomposelongs.cpp +++ b/src/coreclr/jit/decomposelongs.cpp @@ -78,11 +78,11 @@ void DecomposeLongs::DecomposeBlock(BasicBlock* block) // Return Value: // None. // -void DecomposeLongs::DecomposeRange(Compiler* compiler, LIR::Range& range) +void DecomposeLongs::DecomposeRange(Compiler* compiler, Lowering* lowering, LIR::Range& range) { assert(compiler != nullptr); - DecomposeLongs decomposer(compiler); + DecomposeLongs decomposer(compiler, lowering); decomposer.m_range = ⦥ decomposer.DecomposeRangeHelper(); @@ -90,7 +90,7 @@ void DecomposeLongs::DecomposeRange(Compiler* compiler, LIR::Range& range) //------------------------------------------------------------------------ // DecomposeLongs::DecomposeRangeHelper: -// Decompiose each node in the current range. +// Decompose each node in the current range. // // Decomposition is done as an execution-order walk. Decomposition of // a particular node can create new nodes that need to be further @@ -122,44 +122,76 @@ void DecomposeLongs::DecomposeRangeHelper() GenTree* DecomposeLongs::DecomposeNode(GenTree* tree) { // Handle the case where we are implicitly using the lower half of a long lclVar. - if ((tree->TypeGet() == TYP_INT) && tree->OperIsLocal()) + if (tree->TypeIs(TYP_INT) && tree->OperIsLocal()) { LclVarDsc* varDsc = m_compiler->lvaGetDesc(tree->AsLclVarCommon()); if (varTypeIsLong(varDsc) && varDsc->lvPromoted) { -#ifdef DEBUG - if (m_compiler->verbose) - { - printf("Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted " - "half:\n"); - m_compiler->gtDispTreeRange(Range(), tree); - } -#endif // DEBUG + JITDUMP("Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted " + "half:\n"); + DISPTREERANGE(Range(), tree); + unsigned loVarNum = varDsc->lvFieldLclStart; tree->AsLclVarCommon()->SetLclNum(loVarNum); return tree->gtNext; } } - if (tree->TypeGet() != TYP_LONG) + if (!tree->TypeIs(TYP_LONG)) { return tree->gtNext; } -#ifdef DEBUG - if (m_compiler->verbose) - { - printf("Decomposing TYP_LONG tree. BEFORE:\n"); - m_compiler->gtDispTreeRange(Range(), tree); - } -#endif // DEBUG - LIR::Use use; if (!Range().TryGetUse(tree, &use)) { LIR::Use::MakeDummyUse(Range(), tree, &use); } +#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86) + if (!use.IsDummyUse()) + { + // HWIntrinsics can consume/produce a long directly, provided its source/target is memory. + // Here we do a conservative check for specific cases where it is certain the load/store + // can be contained. In those cases, we can skip decomposition. 
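            // Illustrative sketch (annotation, not part of the patch) of the two shapes the
            // check below is meant to recognize on x86; node spellings are approximate:
            //
            //   t1 = IND long [addr]                                 ; contained 64-bit load
            //   t2 = HWINTRINSIC simd16 long CreateScalarUnsafe(t1)  ; emitted as movq xmm, qword ptr [addr]
            //
            //   t3 = HWINTRINSIC long ToScalar(v)
            //        STOREIND long [addr], t3                        ; emitted as movq qword ptr [addr], xmm
            //
            // In both cases the TYP_LONG value never needs to materialize in integer registers,
            // so splitting it into a lo/hi GT_LONG pair would only pessimize the code.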
+ + GenTree* user = use.User(); + + if (user->OperIsHWIntrinsic()) + { + if (tree->OperIs(GT_CNS_LNG) || + (tree->OperIs(GT_IND, GT_LCL_FLD) && m_lowering->IsSafeToContainMem(user, tree))) + { + NamedIntrinsic intrinsicId = user->AsHWIntrinsic()->GetHWIntrinsicId(); + assert(HWIntrinsicInfo::IsVectorCreate(intrinsicId) || + HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId) || + HWIntrinsicInfo::IsVectorCreateScalarUnsafe(intrinsicId)); + + return tree->gtNext; + } + } + else if (user->OperIs(GT_STOREIND) && tree->OperIsHWIntrinsic() && m_compiler->opts.OptimizationEnabled()) + { + NamedIntrinsic intrinsicId = tree->AsHWIntrinsic()->GetHWIntrinsicId(); + if (HWIntrinsicInfo::IsVectorToScalar(intrinsicId) && m_lowering->IsSafeToContainMem(user, tree)) + { + return tree->gtNext; + } + } + } + + if (tree->OperIs(GT_STOREIND) && tree->AsStoreInd()->Data()->OperIsHWIntrinsic()) + { + // We should only get here if we matched the second pattern above. + assert(HWIntrinsicInfo::IsVectorToScalar(tree->AsStoreInd()->Data()->AsHWIntrinsic()->GetHWIntrinsicId())); + + return tree->gtNext; + } +#endif // FEATURE_HW_INTRINSICS && TARGET_X86 + + JITDUMP("Decomposing TYP_LONG tree. BEFORE:\n"); + DISPTREERANGE(Range(), tree); + GenTree* nextNode = nullptr; switch (tree->OperGet()) { @@ -270,19 +302,14 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree) // If we replaced the argument to a GT_FIELD_LIST element with a GT_LONG node, split that field list // element into two elements: one for each half of the GT_LONG. - if ((use.Def()->OperGet() == GT_LONG) && !use.IsDummyUse() && (use.User()->OperGet() == GT_FIELD_LIST)) + if (use.Def()->OperIs(GT_LONG) && !use.IsDummyUse() && use.User()->OperIs(GT_FIELD_LIST)) { DecomposeFieldList(use.User()->AsFieldList(), use.Def()->AsOp()); } -#ifdef DEBUG - if (m_compiler->verbose) - { - // NOTE: st_lcl_var doesn't dump properly afterwards. - printf("Decomposing TYP_LONG tree. AFTER:\n"); - m_compiler->gtDispTreeRange(Range(), use.Def()); - } -#endif + // NOTE: st_lcl_var doesn't dump properly afterwards. + JITDUMP("Decomposing TYP_LONG tree. AFTER:\n"); + DISPTREERANGE(Range(), use.Def()); // When casting from a decomposed long to a smaller integer we can discard the high part. if (m_compiler->opts.OptimizationEnabled() && !use.IsDummyUse() && use.User()->OperIs(GT_CAST) && @@ -1707,6 +1734,13 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsic(LIR::Use& use) return DecomposeHWIntrinsicGetElement(use, hwintrinsicTree); } + case NI_Vector128_ToScalar: + case NI_Vector256_ToScalar: + case NI_Vector512_ToScalar: + { + return DecomposeHWIntrinsicToScalar(use, hwintrinsicTree); + } + case NI_EVEX_MoveMask: { return DecomposeHWIntrinsicMoveMask(use, hwintrinsicTree); @@ -1751,9 +1785,7 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW { assert(node == use.Def()); assert(varTypeIsLong(node)); - assert((node->GetHWIntrinsicId() == NI_Vector128_GetElement) || - (node->GetHWIntrinsicId() == NI_Vector256_GetElement) || - (node->GetHWIntrinsicId() == NI_Vector512_GetElement)); + assert(HWIntrinsicInfo::IsVectorGetElement(node->GetHWIntrinsicId())); GenTree* op1 = node->Op(1); GenTree* op2 = node->Op(2); @@ -1835,6 +1867,75 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW return FinalizeDecomposition(use, loResult, hiResult, hiResult); } +//------------------------------------------------------------------------ +// DecomposeHWIntrinsicToScalar: Decompose GT_HWINTRINSIC -- NI_Vector*_ToScalar. 
+// +// create: +// +// tmp_simd_var = simd_var +// lo_result = GT_HWINTRINSIC{ToScalar}[int](tmp_simd_var) +// hi_result = GT_HWINTRINSIC{GetElement}[int](tmp_simd_var, 1) +// - or - +// GT_HWINTRINSIC{ToScalar}[int](GT_RSZ(tmp_simd_var, 32)) +// return: GT_LONG(lo_result, hi_result) +// +// Arguments: +// use - the LIR::Use object for the def that needs to be decomposed. +// node - the hwintrinsic node to decompose +// +// Return Value: +// The GT_LONG node wrapping the upper and lower halves. +// +GenTree* DecomposeLongs::DecomposeHWIntrinsicToScalar(LIR::Use& use, GenTreeHWIntrinsic* node) +{ + assert(node == use.Def()); + assert(varTypeIsLong(node)); + assert(HWIntrinsicInfo::IsVectorToScalar(node->GetHWIntrinsicId())); + + GenTree* op1 = node->Op(1); + NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); + + assert(varTypeIsLong(simdBaseType)); + assert(varTypeIsSIMD(op1)); + + GenTree* simdTmpVar = RepresentOpAsLocalVar(op1, node, &node->Op(1)); + unsigned simdTmpVarNum = simdTmpVar->AsLclVarCommon()->GetLclNum(); + JITDUMP("[DecomposeHWIntrinsicToScalar]: Saving op1 tree to a temp var:\n"); + DISPTREERANGE(Range(), simdTmpVar); + + GenTree* loResult = m_compiler->gtNewSimdToScalarNode(TYP_INT, simdTmpVar, CORINFO_TYPE_INT, simdSize); + Range().InsertAfter(simdTmpVar, loResult); + + simdTmpVar = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTmpVar->TypeGet()); + Range().InsertAfter(loResult, simdTmpVar); + + GenTree* hiResult; + if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + GenTree* one = m_compiler->gtNewIconNode(1); + hiResult = m_compiler->gtNewSimdGetElementNode(TYP_INT, simdTmpVar, one, CORINFO_TYPE_INT, simdSize); + + Range().InsertAfter(simdTmpVar, one, hiResult); + } + else + { + assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + GenTree* thirtyTwo = m_compiler->gtNewIconNode(32); + GenTree* shift = m_compiler->gtNewSimdBinOpNode(GT_RSZ, op1->TypeGet(), simdTmpVar, thirtyTwo, + node->GetSimdBaseJitType(), simdSize); + hiResult = m_compiler->gtNewSimdToScalarNode(TYP_INT, shift, CORINFO_TYPE_INT, simdSize); + + Range().InsertAfter(simdTmpVar, thirtyTwo, shift, hiResult); + } + + Range().Remove(node); + + return FinalizeDecomposition(use, loResult, hiResult, hiResult); +} + //------------------------------------------------------------------------ // DecomposeHWIntrinsicMoveMask: Decompose GT_HWINTRINSIC -- NI_EVEX_MoveMask // @@ -2262,6 +2363,13 @@ void DecomposeLongs::TryPromoteLongVar(unsigned lclNum) { return; } +#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86) + if (varDsc->lvIsParam) + { + // Promotion blocks combined read optimizations for SIMD loads of long params + return; + } +#endif // FEATURE_HW_INTRINSICS && TARGET_X86 varDsc->lvFieldCnt = 2; varDsc->lvFieldLclStart = m_compiler->lvaCount; diff --git a/src/coreclr/jit/decomposelongs.h b/src/coreclr/jit/decomposelongs.h index 02681322a552e9..e879292abf4996 100644 --- a/src/coreclr/jit/decomposelongs.h +++ b/src/coreclr/jit/decomposelongs.h @@ -14,19 +14,21 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #define _DECOMPOSELONGS_H_ #include "compiler.h" +#include "lower.h" class DecomposeLongs { public: - DecomposeLongs(Compiler* compiler) + DecomposeLongs(Compiler* compiler, Lowering* lowering) : m_compiler(compiler) + , m_lowering(lowering) { } void PrepareForDecomposition(); void DecomposeBlock(BasicBlock* block); - static 
void DecomposeRange(Compiler* compiler, LIR::Range& range); + static void DecomposeRange(Compiler* compiler, Lowering* lowering, LIR::Range& range); private: inline LIR::Range& Range() const @@ -64,6 +66,7 @@ class DecomposeLongs #ifdef FEATURE_HW_INTRINSICS GenTree* DecomposeHWIntrinsic(LIR::Use& use); GenTree* DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHWIntrinsic* node); + GenTree* DecomposeHWIntrinsicToScalar(LIR::Use& use, GenTreeHWIntrinsic* node); GenTree* DecomposeHWIntrinsicMoveMask(LIR::Use& use, GenTreeHWIntrinsic* node); #endif // FEATURE_HW_INTRINSICS @@ -80,6 +83,7 @@ class DecomposeLongs // Data Compiler* m_compiler; + Lowering* m_lowering; LIR::Range* m_range; }; diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index ff4d19fee3fa46..178d1b0232b20e 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -20772,22 +20772,31 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; case INS_movd: - case INS_movq: // only MOVQ xmm, xmm is different (emitted by Sse2.MoveScalar, should use MOVDQU instead) + case INS_movq: if (memAccessKind == PERFSCORE_MEMORY_NONE) { - // movd r32, xmm or xmm, r32 - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency = PERFSCORE_LATENCY_3C; + if (isFloatReg(id->idReg1()) && isFloatReg(id->idReg2())) + { + // movq xmm, xmm + result.insThroughput = PERFSCORE_THROUGHPUT_3X; + result.insLatency = PERFSCORE_LATENCY_1C; + } + else + { + // movd r32/64, xmm or xmm, r32/64 + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + } } else if (memAccessKind == PERFSCORE_MEMORY_READ) { - // movd xmm, m32 + // ins xmm, m32/64 result.insThroughput = PERFSCORE_THROUGHPUT_2X; result.insLatency += PERFSCORE_LATENCY_2C; } else { - // movd m32, xmm + // ins m32/64, xmm assert(memAccessKind == PERFSCORE_MEMORY_WRITE); result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += PERFSCORE_LATENCY_2C; diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index ff2c9583ea8af1..c2cd287c306248 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -20260,7 +20260,7 @@ var_types GenTreeJitIntrinsic::GetSimdBaseType() const // isCommutativeHWIntrinsic: Checks if the intrinsic is commutative // // Return Value: -// true if the intrisic is commutative +// true if the intrinsic is commutative // bool GenTree::isCommutativeHWIntrinsic() const { @@ -20422,6 +20422,9 @@ bool GenTree::isContainableHWIntrinsic() const return true; } + case NI_Vector128_CreateScalar: + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: case NI_Vector128_CreateScalarUnsafe: case NI_Vector256_CreateScalarUnsafe: case NI_Vector512_CreateScalarUnsafe: @@ -21568,7 +21571,7 @@ GenTree* Compiler::gtNewSimdCeilNode(var_types type, GenTree* op1, CorInfoType s #if defined(FEATURE_MASKED_HW_INTRINSICS) //------------------------------------------------------------------------ -// gtNewSimdCvtMaskToVectorNode: Convert a HW instrinsic mask node to a vector +// gtNewSimdCvtMaskToVectorNode: Convert a HW intrinsic mask node to a vector // // Arguments: // type -- The type of the node to convert to @@ -21993,7 +21996,7 @@ GenTree* Compiler::gtNewSimdCvtNativeNode(var_types type, #if defined(FEATURE_MASKED_HW_INTRINSICS) //------------------------------------------------------------------------ -// gtNewSimdCvtVectorToMaskNode: Convert a HW instrinsic vector node to a mask +// 
gtNewSimdCvtVectorToMaskNode: Convert a HW intrinsic vector node to a mask // // Arguments: // type -- The type of the mask to produce. @@ -22747,16 +22750,6 @@ GenTree* Compiler::gtNewSimdCreateBroadcastNode(var_types type, } #if defined(TARGET_XARCH) -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !op1->IsIntegralConst()) - { - // TODO-XARCH-CQ: It may be beneficial to emit the movq - // instruction, which takes a 64-bit memory address and - // works on 32-bit x86 systems. - unreached(); - } -#endif // TARGET_X86 - if (simdSize == 64) { hwIntrinsicID = NI_Vector512_Create; @@ -22860,16 +22853,6 @@ GenTree* Compiler::gtNewSimdCreateScalarNode(var_types type, } #if defined(TARGET_XARCH) -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !op1->IsIntegralConst()) - { - // TODO-XARCH-CQ: It may be beneficial to emit the movq - // instruction, which takes a 64-bit memory address and - // works on 32-bit x86 systems. - unreached(); - } -#endif // TARGET_X86 - if (simdSize == 32) { hwIntrinsicID = NI_Vector256_CreateScalar; @@ -23005,16 +22988,6 @@ GenTree* Compiler::gtNewSimdCreateScalarUnsafeNode(var_types type, } #if defined(TARGET_XARCH) -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !op1->IsIntegralConst()) - { - // TODO-XARCH-CQ: It may be beneficial to emit the movq - // instruction, which takes a 64-bit memory address and - // works on 32-bit x86 systems. - unreached(); - } -#endif // TARGET_X86 - if (simdSize == 32) { hwIntrinsicID = NI_Vector256_CreateScalarUnsafe; @@ -23051,7 +23024,7 @@ GenTree* Compiler::gtNewSimdCreateScalarUnsafeNode(var_types type, GenTree* Compiler::gtNewSimdCreateSequenceNode( var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize) { - // This effectively doees: (Indices * op2) + Create(op1) + // This effectively does: (Indices * op2) + Create(op1) // // When both op2 and op1 are constant we can fully fold this to a constant. Additionally, // if only op2 is a constant we can simplify the computation by a lot. However, if only op1 @@ -23383,14 +23356,7 @@ GenTree* Compiler::gtNewSimdGetElementNode( assert(varTypeIsArithmetic(simdBaseType)); #if defined(TARGET_XARCH) - bool useToScalar = op2->IsIntegralConst(0); - -#if defined(TARGET_X86) - // We handle decomposition via GetElement for simplicity - useToScalar &= !varTypeIsLong(simdBaseType); -#endif // TARGET_X86 - - if (useToScalar) + if (op2->IsIntegralConst(0)) { return gtNewSimdToScalarNode(type, op1, simdBaseJitType, simdSize); } @@ -26368,18 +26334,6 @@ GenTree* Compiler::gtNewSimdToScalarNode(var_types type, GenTree* op1, CorInfoTy NamedIntrinsic intrinsic = NI_Illegal; #ifdef TARGET_XARCH -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType)) - { - // We need SSE41 to handle long, use software fallback - assert(compIsaSupportedDebugOnly(InstructionSet_SSE41)); - - // Create a GetElement node which handles decomposition - GenTree* op2 = gtNewIconNode(0); - return gtNewSimdGetElementNode(type, op1, op2, simdBaseJitType, simdSize); - } -#endif // TARGET_X86 - if (simdSize == 64) { assert(IsBaselineVector512IsaSupportedDebugOnly()); @@ -27539,7 +27493,7 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const } //------------------------------------------------------------------------ -// OperIsMemoryLoad: Does this HWI node have memory store semantics? +// OperIsMemoryStore: Does this HWI node have memory store semantics? 
// // Arguments: // pAddr - optional [out] parameter for the address @@ -27674,7 +27628,7 @@ bool GenTreeHWIntrinsic::OperIsMemoryStoreOrBarrier() const } //------------------------------------------------------------------------ -// OperIsEmbBroadcastCompatible: Checks if the intrinsic is a embedded broadcast compatible inintrsic. +// OperIsEmbBroadcastCompatible: Checks if the intrinsic is a embedded broadcast compatible intrinsic. // // Return Value: // true if the intrinsic node lowering instruction is embedded broadcast compatible. @@ -27740,37 +27694,6 @@ bool GenTreeHWIntrinsic::OperIsBroadcastScalar() const #endif } -//------------------------------------------------------------------------ -// OperIsCreateScalarUnsafe: Is this HWIntrinsic a CreateScalarUnsafe node. -// -// Return Value: -// Whether "this" is a CreateScalarUnsafe node. -// -bool GenTreeHWIntrinsic::OperIsCreateScalarUnsafe() const -{ - NamedIntrinsic intrinsicId = GetHWIntrinsicId(); - - switch (intrinsicId) - { -#if defined(TARGET_ARM64) - case NI_Vector64_CreateScalarUnsafe: -#endif // TARGET_ARM64 - case NI_Vector128_CreateScalarUnsafe: -#if defined(TARGET_XARCH) - case NI_Vector256_CreateScalarUnsafe: - case NI_Vector512_CreateScalarUnsafe: -#endif // TARGET_XARCH - { - return true; - } - - default: - { - return false; - } - } -} - //------------------------------------------------------------------------ // OperIsBitwiseHWIntrinsic: Is the operation a bitwise logic operation. // diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 6af651b6860882..9f22ca88372380 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -6434,7 +6434,6 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic bool OperIsMemoryStoreOrBarrier() const; bool OperIsEmbBroadcastCompatible() const; bool OperIsBroadcastScalar() const; - bool OperIsCreateScalarUnsafe() const; bool OperIsBitwiseHWIntrinsic() const; bool OperIsEmbRoundingEnabled() const; @@ -6788,26 +6787,25 @@ struct GenTreeVecCon : public GenTree case TYP_LONG: case TYP_ULONG: { -#if defined(TARGET_64BIT) - if (arg->IsCnsIntOrI()) + if (arg->IsIntegralConst()) { - simdVal.i64[argIdx] = static_cast(arg->AsIntCon()->gtIconVal); + simdVal.i64[argIdx] = arg->AsIntConCommon()->IntegralValue(); return true; } -#else - if (arg->OperIsLong() && arg->AsOp()->gtOp1->IsCnsIntOrI() && arg->AsOp()->gtOp2->IsCnsIntOrI()) +#if !defined(TARGET_64BIT) + else if (arg->OperIsLong() && arg->gtGetOp1()->IsCnsIntOrI() && arg->gtGetOp2()->IsCnsIntOrI()) { - // 32-bit targets will decompose GT_CNS_LNG into two GT_CNS_INT + // 32-bit targets may decompose GT_CNS_LNG into two GT_CNS_INT // We need to reconstruct the 64-bit value in order to handle this - INT64 gtLconVal = arg->AsOp()->gtOp2->AsIntCon()->gtIconVal; + INT64 gtLconVal = arg->gtGetOp2()->AsIntCon()->gtIconVal; gtLconVal <<= 32; - gtLconVal |= arg->AsOp()->gtOp1->AsIntCon()->gtIconVal; + gtLconVal |= static_cast(arg->gtGetOp1()->AsIntCon()->gtIconVal); simdVal.i64[argIdx] = gtLconVal; return true; } -#endif // TARGET_64BIT +#endif // !TARGET_64BIT else { // We expect the constant to have been already zeroed diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index d8bf386eb6009d..d936d579d8e25a 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -923,6 +923,96 @@ struct HWIntrinsicInfo return false; } + static bool IsVectorCreate(NamedIntrinsic id) + { + switch (id) + { +#if defined(TARGET_ARM64) + case NI_Vector64_Create: +#endif // TARGET_ARM64 
+ case NI_Vector128_Create: +#if defined(TARGET_XARCH) + case NI_Vector256_Create: + case NI_Vector512_Create: +#endif // TARGET_XARCH + return true; + default: + return false; + } + } + + static bool IsVectorCreateScalar(NamedIntrinsic id) + { + switch (id) + { +#if defined(TARGET_ARM64) + case NI_Vector64_CreateScalar: +#endif // TARGET_ARM64 + case NI_Vector128_CreateScalar: +#if defined(TARGET_XARCH) + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: +#endif // TARGET_XARCH + return true; + default: + return false; + } + } + + static bool IsVectorCreateScalarUnsafe(NamedIntrinsic id) + { + switch (id) + { +#if defined(TARGET_ARM64) + case NI_Vector64_CreateScalarUnsafe: +#endif // TARGET_ARM64 + case NI_Vector128_CreateScalarUnsafe: +#if defined(TARGET_XARCH) + case NI_Vector256_CreateScalarUnsafe: + case NI_Vector512_CreateScalarUnsafe: +#endif // TARGET_XARCH + return true; + default: + return false; + } + } + + static bool IsVectorGetElement(NamedIntrinsic id) + { + switch (id) + { +#if defined(TARGET_ARM64) + case NI_Vector64_GetElement: +#endif // TARGET_ARM64 + case NI_Vector128_GetElement: +#if defined(TARGET_XARCH) + case NI_Vector256_GetElement: + case NI_Vector512_GetElement: +#endif // TARGET_XARCH + return true; + default: + return false; + } + } + + static bool IsVectorToScalar(NamedIntrinsic id) + { + switch (id) + { +#if defined(TARGET_ARM64) + case NI_Vector64_ToScalar: +#endif // TARGET_ARM64 + case NI_Vector128_ToScalar: +#if defined(TARGET_XARCH) + case NI_Vector256_ToScalar: + case NI_Vector512_ToScalar: +#endif // TARGET_XARCH + return true; + default: + return false; + } + } + static bool HasImmediateOperand(NamedIntrinsic id) { #if defined(TARGET_ARM64) diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index e96f6c0675b8f3..19792a61c4083e 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1080,7 +1080,7 @@ void CodeGen::genHWIntrinsic_R_RM( if (((instOptions & INS_OPTS_EVEX_b_MASK) != 0) && (rmOpDesc.GetKind() == OperandKind::Reg)) { - // As embedded rounding only appies in R_R case, we can skip other checks for different paths. + // As embedded rounding only applies in R_R case, we can skip other checks for different paths. regNumber op1Reg = rmOp->GetRegNum(); assert(op1Reg != REG_NA); @@ -1171,7 +1171,7 @@ void CodeGen::genHWIntrinsic_R_RM( // that failed and we either didn't get marked regOptional or we did and didn't get spilled // // As such, we need to emulate the removed CreateScalarUnsafe to ensure that op1 is in a - // SIMD register so the broadcast instruction can execute succesfully. We'll just move + // SIMD register so the broadcast instruction can execute successfully. We'll just move // the value into the target register and then broadcast it out from that. emitAttr movdAttr = emitActualTypeSize(node->GetSimdBaseType()); @@ -1435,7 +1435,7 @@ void CodeGen::genHWIntrinsic_R_R_R_RM(instruction ins, if (((instOptions & INS_OPTS_EVEX_b_MASK) != 0) && (op3Desc.GetKind() == OperandKind::Reg)) { - // As embedded rounding only appies in R_R case, we can skip other checks for different paths. + // As embedded rounding only applies in R_R case, we can skip other checks for different paths. 
regNumber op3Reg = op3->GetRegNum(); assert(op3Reg != REG_NA); @@ -1616,7 +1616,7 @@ void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsi { assert(nonConstImmReg != REG_NA); // AVX2 Gather intrinsics use managed non-const fallback since they have discrete imm8 value range - // that does work with the current compiler generated jump-table fallback + // that does not work with the current compiler generated jump-table fallback assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic)); emitter* emit = GetEmitter(); @@ -1843,13 +1843,67 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) switch (intrinsicId) { + case NI_Vector128_CreateScalar: + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: case NI_Vector128_CreateScalarUnsafe: case NI_Vector256_CreateScalarUnsafe: case NI_Vector512_CreateScalarUnsafe: { if (varTypeIsIntegral(baseType)) { - genHWIntrinsic_R_RM(node, ins, emitActualTypeSize(baseType), targetReg, op1, instOptions); + emitAttr baseAttr = emitActualTypeSize(baseType); + +#if defined(TARGET_X86) + if (varTypeIsLong(baseType)) + { + assert(op1->isContained()); + + if (op1->OperIsLong()) + { + node->SetSimdBaseJitType(CORINFO_TYPE_INT); + + bool canCombineLoad = false; + GenTree* loPart = op1->gtGetOp1(); + GenTree* hiPart = op1->gtGetOp2(); + + if ((loPart->isContained() && hiPart->isContained()) && + (loPart->OperIs(GT_LCL_FLD) && hiPart->OperIs(GT_LCL_FLD))) + { + GenTreeLclFld* loFld = loPart->AsLclFld(); + GenTreeLclFld* hiFld = hiPart->AsLclFld(); + + canCombineLoad = (hiFld->GetLclNum() == loFld->GetLclNum()) && + (hiFld->GetLclOffs() == (loFld->GetLclOffs() + 4)); + } + + if (!canCombineLoad) + { + if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + genHWIntrinsic_R_RM(node, ins, baseAttr, targetReg, loPart, instOptions); + inst_RV_RV_TT_IV(INS_pinsrd, EA_16BYTE, targetReg, targetReg, hiPart, 0x01, + !compiler->canUseVexEncoding(), instOptions); + } + else + { + regNumber tmpReg = internalRegisters.GetSingle(node); + genHWIntrinsic_R_RM(node, ins, baseAttr, targetReg, loPart, instOptions); + genHWIntrinsic_R_RM(node, ins, baseAttr, tmpReg, hiPart, instOptions); + emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, targetReg, tmpReg, instOptions); + } + break; + } + + op1 = loPart; + } + + ins = INS_movq; + baseAttr = EA_8BYTE; + } +#endif // TARGET_X86 + + genHWIntrinsic_R_RM(node, ins, baseAttr, targetReg, op1, instOptions); } else { @@ -1864,6 +1918,45 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) else { assert(instOptions == INS_OPTS_NONE); + + if (HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId)) + { + // If this is CreateScalar, we need to ensure the upper elements are zeroed. + // Scalar integer loads and loads from memory always zero the upper elements, + // so reg to reg copies of floating types are the only place we need to + // do anything different. 
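                        // Expected reg-to-reg sequences for the cases handled below (illustrative
                        // sketch only; the exact instructions depend on the available ISAs):
                        //
                        //   float  + SSE4.1 : insertps xmm1, xmm2, 0x0E   ; imm8 0x0E = zmask 0b1110, count_d 0, count_s 0
                        //   float  (SSE2)   : xorps    xmm1, xmm1
                        //                     movss    xmm1, xmm2
                        //   double          : movq     xmm1, xmm2         ; movq already zeroes the upper 64 bits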
+ + if (baseType == TYP_FLOAT) + { + if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + // insertps imm8 is: + // * Bits 0-3: zmask + // * Bits 4-5: count_d + // * Bits 6-7: count_s (register form only) + // + // We want zmask 0b1110 (0xE) to zero elements 1/2/3 + // We want count_d 0b00 (0x0) to insert the value to element 0 + // We want count_s 0b00 (0x0) as we're just taking element 0 of the source + + emit->emitIns_SIMD_R_R_R_I(INS_insertps, attr, targetReg, targetReg, op1Reg, 0x0E, + instOptions); + } + else + { + assert(targetReg != op1Reg); + emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg, instOptions); + emit->emitIns_Mov(INS_movss, attr, targetReg, op1Reg, /* canSkip */ false); + } + } + else + { + // `movq xmm xmm` zeroes the upper 64 bits. + genHWIntrinsic_R_RM(node, INS_movq, attr, targetReg, op1, instOptions); + } + break; + } + // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs emit->emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true); } @@ -2045,6 +2138,20 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) } genHWIntrinsic_R_RM(node, ins, attr, targetReg, op1, instOptions); } + else if (varTypeIsIntegral(baseType)) + { + assert(!varTypeIsLong(baseType) || TargetArchitecture::Is64Bit); + assert(HWIntrinsicInfo::IsVectorToScalar(intrinsicId)); + + attr = emitActualTypeSize(baseType); + genHWIntrinsic_R_RM(node, ins, attr, targetReg, op1, instOptions); + + if (varTypeIsSmall(baseType)) + { + emit->emitIns_Mov(ins_Move_Extend(baseType, /* srcInReg */ true), emitTypeSize(baseType), targetReg, + targetReg, /* canSkip */ false); + } + } else { assert(varTypeIsFloating(baseType)); diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 7d572587bbaab4..b684fd2b946cbb 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -62,7 +62,7 @@ HARDWARE_INTRINSIC(Vector128, ConvertToUInt32Native, HARDWARE_INTRINSIC(Vector128, ConvertToUInt64, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, ConvertToUInt64Native, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, Create, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, CreateScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, CreateScalar, 16, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, CreateScalarUnsafe, 16, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) 
HARDWARE_INTRINSIC(Vector128, CreateSequence, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector128, Dot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) @@ -114,7 +114,7 @@ HARDWARE_INTRINSIC(Vector128, StoreAligned, HARDWARE_INTRINSIC(Vector128, StoreAlignedNonTemporal, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, StoreUnsafe, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, Sum, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, ToVector256, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, ToVector256Unsafe, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, ToVector512, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) @@ -179,7 +179,7 @@ HARDWARE_INTRINSIC(Vector256, ConvertToUInt32Native, HARDWARE_INTRINSIC(Vector256, ConvertToUInt64, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, ConvertToUInt64Native, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, Create, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, 
HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) -HARDWARE_INTRINSIC(Vector256, CreateScalar, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector256, CreateScalar, 32, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, CreateScalarUnsafe, 32, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, CreateSequence, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, Dot, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) @@ -233,7 +233,7 @@ HARDWARE_INTRINSIC(Vector256, StoreAligned, HARDWARE_INTRINSIC(Vector256, StoreAlignedNonTemporal, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, StoreUnsafe, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, Sum, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, ToVector512, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, ToVector512Unsafe, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, Truncate, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, 
HW_Category_Helper, HW_Flag_InvalidNodeId) @@ -299,7 +299,7 @@ HARDWARE_INTRINSIC(Vector512, ConvertToUInt32Native, HARDWARE_INTRINSIC(Vector512, ConvertToUInt64, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector512, ConvertToUInt64Native, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector512, Create, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector512, CreateScalar, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, CreateScalar, 64, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Vector512, CreateScalarUnsafe, 64, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Vector512, CreateSequence, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector512, Dot, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) @@ -354,7 +354,7 @@ HARDWARE_INTRINSIC(Vector512, StoreAligned, HARDWARE_INTRINSIC(Vector512, StoreAlignedNonTemporal, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector512, StoreUnsafe, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector512, Sum, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(Vector512, ToScalar, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector512, ToScalar, 64, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector512, Truncate, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(Vector512, WidenLower, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector512, WidenUpper, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 33278184d02d29..c1b04a3ddf9ba3 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -2125,16 +2125,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { if (sig->numArgs == 1) { -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !impStackTop(0).val->IsIntegralConst()) - { - // TODO-XARCH-CQ: It may be beneficial to emit the movq - // instruction, which takes a 64-bit memory address and - // works on 32-bit x86 systems. - break; - } -#endif // TARGET_X86 - op1 = impPopStack().val; retNode = gtNewSimdCreateBroadcastNode(retType, op1, simdBaseJitType, simdSize); break; @@ -2266,16 +2256,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType)) - { - // TODO-XARCH-CQ: It may be beneficial to emit the movq - // instruction, which takes a 64-bit memory address and - // works on 32-bit x86 systems. - break; - } -#endif // TARGET_X86 - IntrinsicNodeBuilder nodeBuilder(getAllocator(CMK_ASTNode), sig->numArgs); // TODO-CQ: We don't handle contiguous args for anything except TYP_FLOAT today @@ -2321,16 +2301,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 1); -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !impStackTop(0).val->IsIntegralConst()) - { - // TODO-XARCH-CQ: It may be beneficial to emit the movq - // instruction, which takes a 64-bit memory address and - // works on 32-bit x86 systems. - break; - } -#endif // TARGET_X86 - op1 = impPopStack().val; retNode = gtNewSimdCreateScalarNode(retType, op1, simdBaseJitType, simdSize); break; @@ -2342,16 +2312,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 1); -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !impStackTop(0).val->IsIntegralConst()) - { - // TODO-XARCH-CQ: It may be beneficial to emit the movq - // instruction, which takes a 64-bit memory address and - // works on 32-bit x86 systems. - break; - } -#endif // TARGET_X86 - op1 = impPopStack().val; retNode = gtNewSimdCreateScalarUnsafeNode(retType, op1, simdBaseJitType, simdSize); break; @@ -2376,27 +2336,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } } - - if (varTypeIsLong(simdBaseType)) - { - if (!impStackTop(0).val->OperIsConst()) - { - // When op2 is a constant, we can skip the multiplication allowing us to always - // generate better code. However, if it isn't then we need to fallback in the - // cases where multiplication isn't supported. 
- - if ((simdSize != 64) && !canUseEvexEncoding()) - { - // TODO-XARCH-CQ: We should support long/ulong multiplication - break; - } - } - -#if defined(TARGET_X86) - // TODO-XARCH-CQ: We need to support 64-bit CreateBroadcast - break; -#endif // TARGET_X86 - } } impSpillSideEffect(true, stackState.esStackDepth - @@ -2462,14 +2401,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_SSE41)) - { - // We need SSE41 to handle long, use software fallback - break; - } -#endif // TARGET_X86 - op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); @@ -2765,13 +2696,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case TYP_LONG: case TYP_ULONG: { - bool useToScalar = op2->IsIntegralConst(0); - -#if defined(TARGET_X86) - useToScalar &= !varTypeIsLong(simdBaseType); -#endif // TARGET_X86 - - if (!useToScalar && !compOpportunisticallyDependsOn(InstructionSet_SSE41)) + if (!op2->IsIntegralConst(0) && !compOpportunisticallyDependsOn(InstructionSet_SSE41)) { // Using software fallback if simdBaseType is not supported by hardware return nullptr; @@ -3349,15 +3274,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType)) - { - // TODO-XARCH-CQ: We can't handle long here, only because one of the args might - // be scalar, and gtNewSimdCreateBroadcastNode doesn't handle long on x86. - break; - } -#endif // TARGET_X86 - CORINFO_ARG_LIST_HANDLE arg1 = sig->args; CORINFO_ARG_LIST_HANDLE arg2 = info.compCompHnd->getArgNext(arg1); var_types argType = TYP_UNKNOWN; @@ -3514,18 +3430,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 2); -#if defined(TARGET_X86) - if ((simdBaseType == TYP_LONG) || (simdBaseType == TYP_DOUBLE)) - { - if (!compOpportunisticallyDependsOn(InstructionSet_EVEX) && !impStackTop(0).val->IsCnsIntOrI()) - { - // If vpsraq is available, we can use that. We can also trivially emulate arithmetic shift by const - // amount. Otherwise, more work is required for long types, so we fall back to managed for now. - break; - } - } -#endif // TARGET_X86 - if ((simdSize != 32) || compOpportunisticallyDependsOn(InstructionSet_AVX2)) { genTreeOps op = varTypeIsUnsigned(simdBaseType) ? 
GT_RSZ : GT_RSH; @@ -3781,14 +3685,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 1); -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_SSE41)) - { - // We need SSE41 to handle long, use software fallback - break; - } -#endif // TARGET_X86 - op1 = impSIMDPopStack(); retNode = gtNewSimdSumNode(retType, op1, simdBaseJitType, simdSize); break; @@ -3800,14 +3696,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 1); -#if defined(TARGET_X86) - if (varTypeIsLong(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_SSE41)) - { - // We need SSE41 to handle long, use software fallback - break; - } -#endif // TARGET_X86 - op1 = impSIMDPopStack(); retNode = gtNewSimdToScalarNode(retType, op1, simdBaseJitType, simdSize); break; diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index fe3da0a63eb904..bd614016bdb24d 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -920,6 +920,9 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) break; } + case NI_Vector128_CreateScalar: + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: case NI_Vector128_CreateScalarUnsafe: case NI_Vector256_CreateScalarUnsafe: case NI_Vector512_CreateScalarUnsafe: @@ -927,7 +930,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) // The hwintrinsic should be contained and its // op1 should be either contained or spilled. This // allows us to transparently "look through" the - // CreateScalarUnsafe and treat it directly like + // CreateScalar/Unsafe and treat it directly like // a load from memory. assert(hwintrinsic->isContained()); diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 965493f39f346a..8b1dc5d10b1e06 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -7968,7 +7968,7 @@ PhaseStatus Lowering::DoPhase() } #if !defined(TARGET_64BIT) - DecomposeLongs decomp(comp); // Initialize the long decomposition class. + DecomposeLongs decomp(comp, this); // Initialize the long decomposition class. 
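    // The decomposer now carries the active Lowering instance so that DecomposeNode can call
    // IsSafeToContainMem when deciding whether a long value can stay un-decomposed. Callers of
    // the static helper pass it the same way, e.g. (sketch of a call site inside Lowering):
    //   DecomposeLongs::DecomposeRange(comp, this, range);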
if (comp->compLongUsed) { decomp.PrepareForDecomposition(); diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 611ceb09339233..d44880bd947554 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -464,61 +464,6 @@ class Lowering final : public Phase unsigned simdSize); #endif // FEATURE_HW_INTRINSICS - //---------------------------------------------------------------------------------------------- - // TryRemoveCastIfPresent: Removes op it is a cast operation and the size of its input is at - // least the size of expectedType - // - // Arguments: - // expectedType - The expected type of the cast operation input if it is to be removed - // op - The tree to remove if it is a cast op whose input is at least the size of expectedType - // - // Returns: - // op if it was not a cast node or if its input is not at least the size of expected type; - // Otherwise, it returns the underlying operation that was being casted - GenTree* TryRemoveCastIfPresent(var_types expectedType, GenTree* op) - { - if (!op->OperIs(GT_CAST) || !comp->opts.OptimizationEnabled()) - { - return op; - } - - GenTreeCast* cast = op->AsCast(); - GenTree* castOp = cast->CastOp(); - - // FP <-> INT casts should be kept - if (varTypeIsFloating(castOp) ^ varTypeIsFloating(expectedType)) - { - return op; - } - - // Keep casts which can overflow - if (cast->gtOverflow()) - { - return op; - } - - // Keep casts with operands usable from memory. - if (castOp->isContained() || castOp->IsRegOptional()) - { - return op; - } - - if (genTypeSize(cast->CastToType()) >= genTypeSize(expectedType)) - { -#ifndef TARGET_64BIT - // Don't expose TYP_LONG on 32bit - if (castOp->TypeIs(TYP_LONG)) - { - return op; - } -#endif - BlockRange().Remove(op); - return castOp; - } - - return op; - } - // Utility functions public: static bool IndirsAreEquivalent(GenTree* pTreeA, GenTree* pTreeB); @@ -568,6 +513,13 @@ class Lowering final : public Phase bool IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTree* childNode, bool* supportsRegOptional); #endif // FEATURE_HW_INTRINSICS + // Checks for memory conflicts in the instructions between childNode and parentNode, and returns true if childNode + // can be contained. + bool IsSafeToContainMem(GenTree* parentNode, GenTree* childNode) const; + + // Similar to above, but allows bypassing a "transparent" parent. + bool IsSafeToContainMem(GenTree* grandparentNode, GenTree* parentNode, GenTree* childNode) const; + static void TransformUnusedIndirection(GenTreeIndir* ind, Compiler* comp, BasicBlock* block); private: @@ -599,13 +551,6 @@ class Lowering final : public Phase GenTree* endExclusive, GenTree* ignoreNode) const; - // Checks for memory conflicts in the instructions between childNode and parentNode, and returns true if childNode - // can be contained. - bool IsSafeToContainMem(GenTree* parentNode, GenTree* childNode) const; - - // Similar to above, but allows bypassing a "transparent" parent. - bool IsSafeToContainMem(GenTree* grandparentNode, GenTree* parentNode, GenTree* childNode) const; - // Check if marking an operand of a node as reg-optional is safe. 
bool IsSafeToMarkRegOptional(GenTree* parentNode, GenTree* node) const; diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 963d0c4d1fbb7a..8c2528de2c2e65 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -2015,11 +2015,9 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) // bool Lowering::IsValidConstForMovImm(GenTreeHWIntrinsic* node) { - assert((node->GetHWIntrinsicId() == NI_Vector64_Create) || (node->GetHWIntrinsicId() == NI_Vector128_Create) || - (node->GetHWIntrinsicId() == NI_Vector64_CreateScalar) || - (node->GetHWIntrinsicId() == NI_Vector128_CreateScalar) || - (node->GetHWIntrinsicId() == NI_Vector64_CreateScalarUnsafe) || - (node->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) || + assert(HWIntrinsicInfo::IsVectorCreate(node->GetHWIntrinsicId()) || + HWIntrinsicInfo::IsVectorCreateScalar(node->GetHWIntrinsicId()) || + HWIntrinsicInfo::IsVectorCreateScalarUnsafe(node->GetHWIntrinsicId()) || (node->GetHWIntrinsicId() == NI_AdvSimd_DuplicateToVector64) || (node->GetHWIntrinsicId() == NI_AdvSimd_DuplicateToVector128) || (node->GetHWIntrinsicId() == NI_AdvSimd_Arm64_DuplicateToVector64) || @@ -2278,7 +2276,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) assert(simdSize != 0); bool isConstant = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simdVal); - bool isCreateScalar = (intrinsicId == NI_Vector64_CreateScalar) || (intrinsicId == NI_Vector128_CreateScalar); + bool isCreateScalar = HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId); size_t argCnt = node->GetOperandCount(); // Check if we have a cast that we can remove. Note that "IsValidConstForMovImm" diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index aa6e18f682781e..743218ecede33b 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2044,26 +2044,14 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) break; } - case NI_SSE2_Insert: case NI_SSE41_Insert: - case NI_SSE41_X64_Insert: { assert(node->GetOperandCount() == 3); - var_types simdBaseType = node->GetSimdBaseType(); - - // Insert takes either a 32-bit register or a memory operand. - // In either case, only SimdBaseType bits are read and so - // widening or narrowing the operand may be unnecessary and it - // can just be used directly. - - node->Op(2) = TryRemoveCastIfPresent(simdBaseType, node->Op(2)); - - if (simdBaseType != TYP_FLOAT) + if (node->GetSimdBaseType() != TYP_FLOAT) { break; } - assert(intrinsicId == NI_SSE41_Insert); // We have Sse41.Insert in which case we can specially handle // a couple of interesting scenarios involving chains of Inserts @@ -2272,19 +2260,6 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) break; } - case NI_SSE42_Crc32: - { - assert(node->GetOperandCount() == 2); - - // Crc32 takes either a bit register or a memory operand. - // In either case, only gtType bits are read and so widening - // or narrowing the operand may be unnecessary and it can - // just be used directly. 
- - node->Op(2) = TryRemoveCastIfPresent(node->TypeGet(), node->Op(2)); - break; - } - case NI_SSE2_CompareGreaterThan: { if (node->GetSimdBaseType() != TYP_DOUBLE) @@ -3082,6 +3057,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm comp->gtNewSimdCreateBroadcastNode(simdType, broadcastOp, op1Intrinsic->GetSimdBaseJitType(), simdSize); + assert(vecCns->IsCnsVec()); BlockRange().InsertAfter(broadcastOp, vecCns); nestedOp2 = vecCns; @@ -4032,10 +4008,9 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) GenTree* tmp2 = nullptr; GenTree* tmp3 = nullptr; - bool isConstant = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simdVal); - bool isCreateScalar = (intrinsicId == NI_Vector128_CreateScalar) || (intrinsicId == NI_Vector256_CreateScalar) || - (intrinsicId == NI_Vector512_CreateScalar); - size_t argCnt = node->GetOperandCount(); + bool isConstant = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simdVal); + bool isCreateScalar = HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId); + size_t argCnt = node->GetOperandCount(); if (isConstant) { @@ -4046,8 +4021,8 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) #if !defined(TARGET_64BIT) if (arg->OperIsLong()) { - BlockRange().Remove(arg->AsOp()->gtGetOp1()); - BlockRange().Remove(arg->AsOp()->gtGetOp2()); + BlockRange().Remove(arg->gtGetOp1()); + BlockRange().Remove(arg->gtGetOp2()); } #endif // !TARGET_64BIT BlockRange().Remove(arg); @@ -4075,165 +4050,61 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) { if (isCreateScalar) { - node->gtType = TYP_SIMD16; - node->SetSimdSize(16); - switch (simdBaseType) { case TYP_BYTE: case TYP_UBYTE: - { - // Types need to be explicitly zero-extended to ensure upper-bits are zero - // - // We need to explicitly use TYP_UBYTE since unsigned is ignored for small types - // Explicitly handle both BYTE and UBYTE to account for reinterpret casts and the like - // - // The from type is INT since that is the input type tracked by IR, where-as the target - // type needs to be UBYTE so it implicitly zero-extends back to TYP_INT - - tmp1 = comp->gtNewCastNode(TYP_INT, op1, /* unsigned */ true, TYP_UBYTE); - BlockRange().InsertAfter(op1, tmp1); - LowerNode(tmp1); - - node->ChangeHWIntrinsicId(NI_SSE2_ConvertScalarToVector128Int32, tmp1); - node->SetSimdBaseJitType(CORINFO_TYPE_INT); - break; - } - case TYP_SHORT: case TYP_USHORT: { - // Types need to be explicitly zero-extended to ensure upper-bits are zero + // The smallest scalar SIMD load that zeroes upper elements is 32 bits, so for CreateScalar, + // we must ensure that the upper bits of that 32-bit value are zero if the base type is small. // - // We need to explicitly use TYP_USHORT since unsigned is ignored for small types - // Explicitly handle both SHORT and USHORT to account for reinterpret casts and the like + // The most likely case is that op1 is a cast from int/long to the base type: + // * CAST int <- short <- int/long + // If the base type is signed, that cast will be sign-extending, but we need zero extension, + // so we can simply retype the cast to the unsigned type of the same size. 
// - // The from type is INT since that is the input type tracked by IR, where-as the target - // type needs to be USHORT so it implicitly zero-extends back to TYP_INT - - tmp1 = comp->gtNewCastNode(TYP_INT, op1, /* unsigned */ true, TYP_USHORT); - BlockRange().InsertAfter(op1, tmp1); - LowerNode(tmp1); + // It's also possible we have a memory load of the base type: + // * IND short + // We can likewise change the type of the indir to force zero extension on load. + // + // If we can't safely retype one of the above patterns and don't already have a cast to the + // correct unsigned type, we will insert our own cast. - node->ChangeHWIntrinsicId(NI_SSE2_ConvertScalarToVector128Int32, tmp1); node->SetSimdBaseJitType(CORINFO_TYPE_INT); - break; - } - - case TYP_INT: - { - node->ChangeHWIntrinsicId(NI_SSE2_ConvertScalarToVector128Int32); - break; - } - case TYP_UINT: - { - node->ChangeHWIntrinsicId(NI_SSE2_ConvertScalarToVector128UInt32); - break; - } + var_types unsignedType = varTypeToUnsigned(simdBaseType); -#if defined(TARGET_AMD64) - case TYP_LONG: - { - node->ChangeHWIntrinsicId(NI_SSE2_X64_ConvertScalarToVector128Int64); - break; - } - - case TYP_ULONG: - { - node->ChangeHWIntrinsicId(NI_SSE2_X64_ConvertScalarToVector128UInt64); - break; - } -#endif // TARGET_AMD64 - - case TYP_FLOAT: - { - tmp1 = comp->gtNewZeroConNode(simdType); - BlockRange().InsertBefore(op1, tmp1); - LowerNode(tmp1); - - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + if (op1->OperIs(GT_CAST) && !op1->gtOverflow()) { - // Sse41.Insert has: - // * Bits 0-3: zmask - // * Bits 4-5: count_d - // * Bits 6-7: count_s (register form only) - // - // We want zmask 0b1110 (0xE) to zero elements 1/2/3 - // We want count_d 0b00 (0x0) to insert the value to element 0 - // We want count_s 0b00 (0x0) as we're just taking element 0 of the source - - idx = comp->gtNewIconNode(0x0E); - BlockRange().InsertAfter(op1, idx); - LowerNode(idx); - - node->ResetHWIntrinsicId(NI_SSE41_Insert, comp, tmp1, op1, idx); + assert(op1->TypeIs(TYP_INT) && (genTypeSize(op1->CastToType()) == genTypeSize(simdBaseType))); + op1->AsCast()->gtCastType = unsignedType; } - else + else if (op1->OperIs(GT_IND, GT_LCL_FLD)) { - node->ResetHWIntrinsicId(NI_SSE_MoveScalar, comp, tmp1, op1); + assert(genTypeSize(op1) == genTypeSize(simdBaseType)); + op1->gtType = unsignedType; + } + else if (!op1->OperIs(GT_CAST) || (op1->AsCast()->CastToType() != unsignedType)) + { + tmp1 = comp->gtNewCastNode(TYP_INT, op1, /* fromUnsigned */ false, unsignedType); + node->Op(1) = tmp1; + BlockRange().InsertAfter(op1, tmp1); + LowerNode(tmp1); } - break; - } - - case TYP_DOUBLE: - { - tmp1 = comp->gtNewZeroConNode(simdType); - BlockRange().InsertBefore(op1, tmp1); - LowerNode(tmp1); - node->ResetHWIntrinsicId(NI_SSE2_MoveScalar, comp, tmp1, op1); break; } default: { - unreached(); - } - } - - if (simdSize > 16) - { - assert((simdSize == 32) || (simdSize == 64)); - - // We're creating a Vector256/512 scalar so we need to treat the original op as Vector128, - // we need to unsafely extend up to Vector256/512 (which is actually safe since the 128-bit - // op will zero extend up to 256/512-bits), and then we need to replace the original use - // with the new TYP_SIMD32/64 node. 
- - node->ChangeType(TYP_SIMD16); - node->SetSimdSize(16); - LowerNode(node); - - LIR::Use use; - bool foundUse = BlockRange().TryGetUse(node, &use); - - tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, node, NI_Vector128_ToVector256Unsafe, simdBaseJitType, - 16); - BlockRange().InsertAfter(node, tmp2); - - if (simdSize == 64) - { - tmp3 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, tmp2, NI_Vector256_ToVector512Unsafe, - simdBaseJitType, 32); - BlockRange().InsertAfter(tmp2, tmp3); - tmp2 = tmp3; - } - - if (foundUse) - { - use.ReplaceWith(tmp2); - } - else - { - node->ClearUnusedValue(); - tmp2->SetUnusedValue(); + break; } - - node = tmp2->AsHWIntrinsic(); } - return LowerNode(node); + ContainCheckHWIntrinsic(node); + return node->gtNext; } // We have the following (where simd is simd16, simd32 or simd64): @@ -4509,40 +4380,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) break; } -#if defined(TARGET_AMD64) - case TYP_LONG: - case TYP_ULONG: - { - // We will be constructing the following parts: - // ... - // /--* tmp1 simd16 - // * STORE_LCL_VAR simd16 - // tmp1 = LCL_VAR simd16 - // tmp2 = LCL_VAR simd16 - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 ulong UnpackLow - - // This is roughly the following managed code: - // ... - // var tmp2 = tmp1; - // return Sse2.UnpackLow(tmp1, tmp2); - - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - - node->Op(1) = tmp1; - LIR::Use tmp1Use(BlockRange(), &node->Op(1), node); - ReplaceWithLclVar(tmp1Use); - tmp1 = node->Op(1); - - tmp2 = comp->gtClone(tmp1); - BlockRange().InsertAfter(tmp1, tmp2); - - node->ResetHWIntrinsicId(NI_SSE2_UnpackLow, tmp1, tmp2); - break; - } -#endif // TARGET_AMD64 - case TYP_FLOAT: { if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX)) @@ -4599,9 +4436,12 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) break; } + case TYP_LONG: + case TYP_ULONG: case TYP_DOUBLE: { - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) + if ((IsContainableMemoryOp(op1) || simdBaseType == TYP_DOUBLE) && + comp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) { // We will be constructing the following parts: // ... @@ -4613,6 +4453,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // return Sse3.MoveAndDuplicate(tmp1); node->ChangeHWIntrinsicId(NI_SSE3_MoveAndDuplicate, tmp1); + node->SetSimdBaseJitType(CORINFO_TYPE_DOUBLE); break; } @@ -4626,12 +4467,12 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // tmp2 = LCL_VAR simd16 // /--* tmp1 simd16 // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 float MoveLowToHigh + // node = * HWINTRINSIC simd16 T UnpackLow // This is roughly the following managed code: // ... // var tmp2 = tmp1; - // return Sse.MoveLowToHigh(tmp1, tmp2); + // return Sse2.UnpackLow(tmp1, tmp2); node->Op(1) = tmp1; LIR::Use tmp1Use(BlockRange(), &node->Op(1), node); @@ -4641,8 +4482,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) tmp2 = comp->gtClone(tmp1); BlockRange().InsertAfter(tmp1, tmp2); - node->ResetHWIntrinsicId(NI_SSE_MoveLowToHigh, tmp1, tmp2); - node->SetSimdBaseJitType(CORINFO_TYPE_FLOAT); + node->ResetHWIntrinsicId(NI_SSE2_UnpackLow, tmp1, tmp2); break; } @@ -4655,19 +4495,16 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) return LowerNode(node); } - GenTree* op2 = node->Op(2); - - // TODO-XArch-AVX512 : Merge the NI_Vector512_Create and NI_Vector256_Create paths below. 
- // We have the following (where simd is simd16 or simd32): - // /--* op1 T - // +--* ... T - // +--* opN T - // node = * HWINTRINSIC simd T Create - if (intrinsicId == NI_Vector512_Create) + if (intrinsicId == NI_Vector512_Create || intrinsicId == NI_Vector256_Create) { - assert(comp->IsBaselineVector512IsaSupportedDebugOnly()); + assert(argCnt >= (simdSize / genTypeSize(TYP_LONG))); + assert(((simdSize == 64) && comp->IsBaselineVector512IsaSupportedDebugOnly()) || + ((simdSize == 32) && comp->IsBaselineVector256IsaSupportedDebugOnly())); - // We will be constructing the following parts: + // The larger vector implementation is simplified by splitting the + // job in half and delegating to the next smaller vector size. + // + // For example, for Vector512, we construct the following: // /--* op1 T // +--* ... T // lo = * HWINTRINSIC simd32 T Create @@ -4697,86 +4534,35 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // lo = Vector256.Create(op1, ..., op16); // hi = Vector256.Create(op17, ..., op32); + var_types halfType = comp->getSIMDTypeForSize(simdSize / 2); + NamedIntrinsic halfCreate = (simdSize == 64) ? NI_Vector256_Create : NI_Vector128_Create; + NamedIntrinsic withUpper = (simdSize == 64) ? NI_Vector512_WithUpper : NI_Vector256_WithUpper; + size_t halfArgCnt = argCnt / 2; assert((halfArgCnt * 2) == argCnt); GenTree* loInsertionPoint = LIR::LastNode(node->GetOperandArray(), halfArgCnt); - - GenTree* lo = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, node->GetOperandArray(), halfArgCnt, - NI_Vector256_Create, simdBaseJitType, 32); - BlockRange().InsertAfter(loInsertionPoint, lo); - GenTree* hiInsertionPoint = LIR::LastNode(node->GetOperandArray(halfArgCnt), halfArgCnt); - GenTree* hi = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, node->GetOperandArray(halfArgCnt), halfArgCnt, - NI_Vector256_Create, simdBaseJitType, 32); - BlockRange().InsertAfter(hiInsertionPoint, hi); - - assert(argCnt >= 7); - node->ResetHWIntrinsicId(NI_Vector512_WithUpper, comp, lo, hi); - - LowerNode(lo); - LowerNode(hi); - return LowerNode(node); - } - else if (intrinsicId == NI_Vector256_Create) - { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); + GenTree* lo = comp->gtNewSimdHWIntrinsicNode(halfType, node->GetOperandArray(), halfArgCnt, halfCreate, + simdBaseJitType, simdSize / 2); - // We will be constructing the following parts: - // /--* op1 T - // +--* ... T - // lo = * HWINTRINSIC simd16 T Create - // /--* ... T - // +--* opN T - // hi = * HWINTRINSIC simd16 T Create - // /--* lo simd32 - // +--* hi simd16 - // node = * HWINTRINSIC simd32 T WithUpper + GenTree* hi = comp->gtNewSimdHWIntrinsicNode(halfType, node->GetOperandArray(halfArgCnt), halfArgCnt, + halfCreate, simdBaseJitType, simdSize / 2); - // This is roughly the following managed code: - // ... - // var lo = Vector128.Create(op1, ...); - // var hi = Vector128.Create(..., opN); - // return lo.WithUpper(hi); - - // Each Vector128.Create call gets half the operands. 
That is: - // lo = Vector128.Create(op1, op2); - // hi = Vector128.Create(op3, op4); - // -or- - // lo = Vector128.Create(op1, ..., op4); - // hi = Vector128.Create(op5, ..., op8); - // -or- - // lo = Vector128.Create(op1, ..., op8); - // hi = Vector128.Create(op9, ..., op16); - // -or- - // lo = Vector128.Create(op1, ..., op16); - // hi = Vector128.Create(op17, ..., op32); + node->ResetHWIntrinsicId(withUpper, comp, lo, hi); - size_t halfArgCnt = argCnt / 2; - assert((halfArgCnt * 2) == argCnt); - - GenTree* loInsertionPoint = LIR::LastNode(node->GetOperandArray(), halfArgCnt); - - GenTree* lo = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, node->GetOperandArray(), halfArgCnt, - NI_Vector128_Create, simdBaseJitType, 16); BlockRange().InsertAfter(loInsertionPoint, lo); - - GenTree* hiInsertionPoint = LIR::LastNode(node->GetOperandArray(halfArgCnt), halfArgCnt); - - GenTree* hi = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, node->GetOperandArray(halfArgCnt), halfArgCnt, - NI_Vector128_Create, simdBaseJitType, 16); BlockRange().InsertAfter(hiInsertionPoint, hi); - assert(argCnt >= 3); - node->ResetHWIntrinsicId(NI_Vector256_WithUpper, comp, lo, hi); - LowerNode(lo); LowerNode(hi); return LowerNode(node); } + assert(intrinsicId == NI_Vector128_Create); + // We will be constructing the following parts: // /--* op1 T // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe @@ -4975,54 +4761,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) break; } -#if defined(TARGET_AMD64) - case TYP_LONG: - case TYP_ULONG: - { - if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41_X64)) - { - // We will be constructing the following parts: - // ... - // idx = CNS_INT int 1 - // /--* tmp1 simd16 - // +--* op2 T - // +--* idx int - // node = * HWINTRINSIC simd16 T Insert - - // This is roughly the following managed code: - // ... - // return Sse41.X64.Insert(tmp1, op2, 0x01); - - idx = comp->gtNewIconNode(0x01, TYP_INT); - BlockRange().InsertBefore(node, idx); - - node->ResetHWIntrinsicId(NI_SSE41_X64_Insert, comp, tmp1, op2, idx); - break; - } - - // We will be constructing the following parts: - // ... - // /--* op2 T - // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe - // /--* tmp1 simd16 - // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 T UnpackLow - - // This is roughly the following managed code: - // ... - // var tmp2 = Vector128.CreateScalarUnsafe(op2); - // return Sse2.UnpackLow(tmp1, tmp2); - - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); - - tmp2 = InsertNewSimdCreateScalarUnsafeNode(TYP_SIMD16, op2, simdBaseJitType, 16); - LowerNode(tmp2); - - node->ResetHWIntrinsicId(NI_SSE2_UnpackLow, tmp1, tmp2); - break; - } -#endif // TARGET_AMD64 - case TYP_FLOAT: { unsigned N = 0; @@ -5162,28 +4900,52 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) break; } + case TYP_LONG: + case TYP_ULONG: case TYP_DOUBLE: { + GenTree* op2 = node->Op(2); + + if (varTypeIsLong(simdBaseType) && comp->compOpportunisticallyDependsOn(InstructionSet_SSE41_X64)) + { + // We will be constructing the following parts: + // ... + // idx = CNS_INT int 1 + // /--* tmp1 simd16 + // +--* op2 T + // +--* idx int + // node = * HWINTRINSIC simd16 T Insert + + // This is roughly the following managed code: + // ... 
+ // return Sse41.X64.Insert(tmp1, op2, 0x01); + + idx = comp->gtNewIconNode(0x01, TYP_INT); + BlockRange().InsertBefore(node, idx); + + node->ResetHWIntrinsicId(NI_SSE41_X64_Insert, comp, tmp1, op2, idx); + break; + } + // We will be constructing the following parts: // ... // /--* op2 T // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe // /--* tmp1 simd16 // +--* tmp2 simd16 - // node = * HWINTRINSIC simd16 T MoveLowToHigh + // node = * HWINTRINSIC simd16 T UnpackLow // This is roughly the following managed code: // ... // var tmp2 = Vector128.CreateScalarUnsafe(op2); - // return Sse.MoveLowToHigh(tmp1, tmp2); + // return Sse2.UnpackLow(tmp1, tmp2); assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); tmp2 = InsertNewSimdCreateScalarUnsafeNode(TYP_SIMD16, op2, simdBaseJitType, 16); LowerNode(tmp2); - node->ResetHWIntrinsicId(NI_SSE_MoveLowToHigh, tmp1, tmp2); - node->SetSimdBaseJitType(CORINFO_TYPE_FLOAT); + node->ResetHWIntrinsicId(NI_SSE2_UnpackLow, tmp1, tmp2); break; } @@ -5210,9 +4972,7 @@ GenTree* Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node) var_types simdBaseType = node->GetSimdBaseType(); unsigned simdSize = node->GetSimdSize(); - assert((intrinsicId == NI_Vector128_GetElement) || (intrinsicId == NI_Vector256_GetElement) || - (intrinsicId == NI_Vector512_GetElement)); - + assert(HWIntrinsicInfo::IsVectorGetElement(intrinsicId)); assert(!varTypeIsSIMD(simdType)); assert(varTypeIsArithmetic(simdBaseType)); assert(simdSize != 0); @@ -6835,24 +6595,25 @@ GenTree* Lowering::LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node) unsigned simdSize = node->GetSimdSize(); var_types simdType = Compiler::getSIMDTypeForSize(simdSize); - assert((intrinsicId == NI_Vector128_ToScalar) || (intrinsicId == NI_Vector256_ToScalar) || - (intrinsicId == NI_Vector512_ToScalar)); - + assert(HWIntrinsicInfo::IsVectorToScalar(intrinsicId)); assert(varTypeIsSIMD(simdType)); assert(varTypeIsArithmetic(simdBaseType)); assert(simdSize != 0); GenTree* op1 = node->Op(1); - if (IsContainableMemoryOp(op1)) + if (IsContainableMemoryOp(op1) && (!varTypeIsLong(simdBaseType) || TargetArchitecture::Is64Bit)) { - // We will specially handle ToScalar when op1 is already in memory + // If op1 is already in memory, we'd like the consumer of ToScalar to be able to look + // through to the memory directly. Early folding is preferable, as it unlocks additional + // containment opportunities for the consuming nodes. If we can't fold away ToScalar, + // we will still contain op1 if possible, and let codegen try to peek through to it. + // + // However, we specifically need to avoid doing this for long on 32-bit because we are + // already past DecomposeLongs, and codegen wouldn't be able to handle it.
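As a rough standalone illustration of the fold described above (plain C++ with compiler intrinsics rather than JIT IR; the helper names are invented), ToScalar of a long vector that already lives in memory reduces to a direct 64-bit read of element 0, which is only straightforward when a 64-bit GPR or a contained movq store is available:

#include <immintrin.h>
#include <cstdint>
#include <cstring>

// Folded shape: element 0 of the vector is simply the first 8 bytes at the address.
int64_t to_scalar_folded(const void* vec)
{
    int64_t result;
    std::memcpy(&result, vec, sizeof(result)); // a single 64-bit load on x64
    return result;
}

// Unfolded shape: the value round-trips through an XMM register.
int64_t to_scalar_unfolded(const __m128i* vec)
{
    __m128i v = _mm_loadu_si128(vec);
#if defined(_M_X64) || defined(__x86_64__)
    return _mm_cvtsi128_si64(v);               // movq r64, xmm (64-bit targets only)
#else
    // 32-bit x86 has no 64-bit GPR, so the long must either stay decomposed or
    // flow between memory operands (e.g. movq xmm -> [mem]).
    int64_t result;
    _mm_storel_epi64(reinterpret_cast<__m128i*>(&result), v);
    return result;
#endif
}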
if (op1->OperIs(GT_IND)) { - // We want to optimize ToScalar down to an Indir where possible as - // this unlocks additional containment opportunities for various nodes - GenTreeIndir* indir = op1->AsIndir(); GenTreeIndir* newIndir = @@ -6879,9 +6640,6 @@ GenTree* Lowering::LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node) { uint32_t elemSize = genTypeSize(simdBaseType); - // We want to optimize ToScalar down to a LclFld where possible as - // this unlocks additional containment opportunities for various nodes - GenTreeLclVarCommon* lclVar = op1->AsLclVarCommon(); uint32_t lclOffs = lclVar->GetLclOffs() + (0 * elemSize); LclVarDsc* lclDsc = comp->lvaGetDesc(lclVar); @@ -6908,92 +6666,10 @@ GenTree* Lowering::LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node) return LowerNode(lclFld); } } - - if (IsSafeToContainMem(node, op1)) - { - // Handle other cases in codegen - ContainCheckHWIntrinsic(node); - return node->gtNext; - } } - switch (simdBaseType) - { - case TYP_BYTE: - case TYP_SHORT: - case TYP_INT: - { - node->gtType = TYP_INT; - node->SetSimdBaseJitType(CORINFO_TYPE_INT); - node->ChangeHWIntrinsicId(NI_SSE2_ConvertToInt32); - break; - } - - case TYP_UBYTE: - case TYP_USHORT: - case TYP_UINT: - { - node->gtType = TYP_INT; - node->SetSimdBaseJitType(CORINFO_TYPE_UINT); - node->ChangeHWIntrinsicId(NI_SSE2_ConvertToUInt32); - break; - } - -#if defined(TARGET_AMD64) - case TYP_LONG: - { - node->ChangeHWIntrinsicId(NI_SSE2_X64_ConvertToInt64); - break; - } - - case TYP_ULONG: - { - node->ChangeHWIntrinsicId(NI_SSE2_X64_ConvertToUInt64); - break; - } -#endif // TARGET_AMD64 - - case TYP_FLOAT: - case TYP_DOUBLE: - { - ContainCheckHWIntrinsic(node); - return node->gtNext; - } - - default: - { - unreached(); - } - } - - GenTree* next = LowerNode(node); - - if (genTypeSize(simdBaseType) < 4) - { - // The move intrinsics do not touch the upper bits, so we need an explicit - // cast to ensure the result is properly sign extended - - LIR::Use use; - - bool foundUse = BlockRange().TryGetUse(node, &use); - bool fromUnsigned = varTypeIsUnsigned(simdBaseType); - - GenTreeCast* cast = comp->gtNewCastNode(TYP_INT, node, fromUnsigned, simdBaseType); - BlockRange().InsertAfter(node, cast); - - if (foundUse) - { - use.ReplaceWith(cast); - } - else - { - node->ClearUnusedValue(); - cast->SetUnusedValue(); - } - next = LowerNode(cast); - } - - return next; + ContainCheckHWIntrinsic(node); + return node->gtNext; } //---------------------------------------------------------------------------------------------- @@ -8016,20 +7692,38 @@ void Lowering::ContainCheckStoreIndir(GenTreeStoreInd* node) case NI_Vector256_ToScalar: case NI_Vector512_ToScalar: { - if (varTypeIsFloating(simdBaseType)) + // These intrinsics are "ins reg/mem, xmm" or "ins xmm, reg/mem" + // + // In the case we are coming from and going to memory, we want to + // preserve the original containment as we'll end up emitting a pair + // of scalar moves. e.g. for float: + // movss xmm0, [addr1] ; Size: 4, Latency: 4-7, TP: 0.5 + // movss [addr2], xmm0 ; Size: 4, Latency: 4-10, TP: 1 + // + // However, we want to prefer containing the store over allowing the + // input to be regOptional, so track and clear containment if required. 
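As a rough standalone sketch (ordinary intrinsic C++, not JIT code; the helper names are invented), these are the store shapes the containment above and the small-integer rewrite that follows are aiming for: a scalar move straight from the vector register, or an element-0 extract when the base type is a small integer:

#include <immintrin.h>
#include <cstdint>

void store_scalar_float(float* dst, __m128 v)
{
    _mm_store_ss(dst, v);                     // movss [mem], xmm
}

void store_scalar_u16(uint16_t* dst, __m128i v)
{
    *dst = (uint16_t)_mm_extract_epi16(v, 0); // pextrw, element 0 (the memory-destination form requires SSE4.1)
}

void store_scalar_u8(uint8_t* dst, __m128i v)
{
    *dst = (uint8_t)_mm_extract_epi8(v, 0);   // pextrb, element 0 (SSE4.1)
}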
+ + clearContainedNode = hwintrinsic->Op(1); + isContainable = !clearContainedNode->isContained(); + + if (isContainable && varTypeIsIntegral(simdBaseType)) { - // These intrinsics are "ins reg/mem, xmm" or "ins xmm, reg/mem" - // - // In the case we are coming from and going to memory, we want to - // preserve the original containment as we'll end up emitting: - // movss xmm0, [addr1] ; Size: 4, Latency: 4-7, TP: 0.5 - // movss [addr2], xmm0 ; Size: 4, Latency: 4-10, TP: 1 - // - // However, we want to prefer containing the store over allowing the - // input to be regOptional, so track and clear containment if required. + isContainable = (genTypeSize(simdBaseType) == genTypeSize(node)) && + (!varTypeIsSmall(simdBaseType) || + comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)); - clearContainedNode = hwintrinsic->Op(1); - isContainable = !clearContainedNode->isContained(); + if (isContainable && varTypeIsSmall(simdBaseType)) + { + CorInfoType baseJitType = varTypeIsByte(node) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_USHORT; + intrinsicId = varTypeIsByte(node) ? NI_SSE41_Extract : NI_SSE2_Extract; + + GenTree* zero = comp->gtNewZeroConNode(TYP_INT); + BlockRange().InsertBefore(hwintrinsic, zero); + + hwintrinsic->SetSimdBaseJitType(baseJitType); + hwintrinsic->ResetHWIntrinsicId(intrinsicId, hwintrinsic->Op(1), zero); + zero->SetContained(); + } } break; } @@ -9310,6 +9004,9 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre switch (parentIntrinsicId) { + case NI_Vector128_CreateScalar: + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: case NI_Vector128_CreateScalarUnsafe: case NI_Vector256_CreateScalarUnsafe: case NI_Vector512_CreateScalarUnsafe: @@ -9414,6 +9111,9 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre switch (intrinsicId) { + case NI_Vector128_CreateScalar: + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: case NI_Vector128_CreateScalarUnsafe: case NI_Vector256_CreateScalarUnsafe: case NI_Vector512_CreateScalarUnsafe: @@ -9509,7 +9209,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre { GenTreeHWIntrinsic* hwintrinsicOperand = broadcastOperand->AsHWIntrinsic(); - if (hwintrinsicOperand->OperIsCreateScalarUnsafe()) + if (HWIntrinsicInfo::IsVectorCreateScalarUnsafe(hwintrinsicOperand->GetHWIntrinsicId())) { // CreateScalarUnsafe can contain non-memory operands such as enregistered // locals, so we want to check if its operand is containable instead. 
This @@ -9856,9 +9556,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if ((simdSize == 8) || (simdSize == 12)) { // We want to handle GetElement/ToScalar still for Vector2/3 - if ((intrinsicId != NI_Vector128_GetElement) && (intrinsicId != NI_Vector128_ToScalar) && - (intrinsicId != NI_Vector256_GetElement) && (intrinsicId != NI_Vector256_ToScalar) && - (intrinsicId != NI_Vector512_GetElement) && (intrinsicId != NI_Vector512_ToScalar)) + if (!HWIntrinsicInfo::IsVectorToScalar(intrinsicId) && !HWIntrinsicInfo::IsVectorGetElement(intrinsicId)) { // TODO-XArch-CQ: Ideally we would key this off of the size the containing node // expects vs the size node actually is or would be if spilled to the stack @@ -9967,7 +9665,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) { GenTreeHWIntrinsic* childNode = op1->AsHWIntrinsic(); - if (childNode->OperIsCreateScalarUnsafe()) + if (HWIntrinsicInfo::IsVectorCreateScalarUnsafe(childNode->GetHWIntrinsicId())) { // We have a very special case of BroadcastScalarToVector(CreateScalarUnsafe(op1)) // @@ -9979,6 +9677,12 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) // op1 directly, we'll then special case the codegen to materialize the value into a // SIMD register in the case it is marked optional and doesn't get spilled. + if (childNode->Op(1)->OperIsLong()) + { + // Decomposed longs require special codegen + return; + } + node->Op(1) = childNode->Op(1); BlockRange().Remove(op1); @@ -10083,6 +9787,50 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) return; } +#ifdef TARGET_X86 + case NI_Vector128_CreateScalar: + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: + case NI_Vector128_CreateScalarUnsafe: + case NI_Vector256_CreateScalarUnsafe: + case NI_Vector512_CreateScalarUnsafe: + { + if (op1->OperIsLong()) + { + // Contain decomposed longs and handle them in codegen + assert(varTypeIsLong(simdBaseType)); + + for (GenTree* longOp : op1->Operands()) + { + if (IsContainableMemoryOp(longOp) && IsSafeToContainMem(node, longOp)) + { + MakeSrcContained(node, longOp); + } + else if (IsSafeToMarkRegOptional(node, longOp)) + { + MakeSrcRegOptional(node, longOp); + } + } + + MakeSrcContained(node, op1); + return; + } + break; + } + + case NI_Vector128_ToScalar: + case NI_Vector256_ToScalar: + case NI_Vector512_ToScalar: + { + // These will be contained by a STOREIND + if (varTypeIsLong(simdBaseType)) + { + return; + } + break; + } +#endif + default: { break; diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index eb427bea5ab886..3ef6952fb8aec5 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2045,39 +2045,47 @@ int LinearScan::BuildIntrinsic(GenTree* tree) #ifdef FEATURE_HW_INTRINSICS //------------------------------------------------------------------------ -// SkipContainedCreateScalarUnsafe: Skips a contained CreateScalarUnsafe node +// SkipContainedUnaryOp: Skips a contained non-memory or const node // and gets the underlying op1 instead // // Arguments: // node - The node to handle // // Return Value: -// If node is a contained CreateScalarUnsafe, it's op1 is returned; +// If node is a contained non-memory or const unary op, its op1 is returned; // otherwise node is returned unchanged. 
-static GenTree* SkipContainedCreateScalarUnsafe(GenTree* node) +static GenTree* SkipContainedUnaryOp(GenTree* node) { - if (!node->OperIsHWIntrinsic() || !node->isContained()) + if (!node->isContained()) { return node; } - GenTreeHWIntrinsic* hwintrinsic = node->AsHWIntrinsic(); - NamedIntrinsic intrinsicId = hwintrinsic->GetHWIntrinsicId(); - - switch (intrinsicId) + if (node->OperIsHWIntrinsic()) { - case NI_Vector128_CreateScalarUnsafe: - case NI_Vector256_CreateScalarUnsafe: - case NI_Vector512_CreateScalarUnsafe: - { - return hwintrinsic->Op(1); - } + GenTreeHWIntrinsic* hwintrinsic = node->AsHWIntrinsic(); + NamedIntrinsic intrinsicId = hwintrinsic->GetHWIntrinsicId(); - default: + switch (intrinsicId) { - return node; + case NI_Vector128_CreateScalar: + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: + case NI_Vector128_CreateScalarUnsafe: + case NI_Vector256_CreateScalarUnsafe: + case NI_Vector512_CreateScalarUnsafe: + { + return hwintrinsic->Op(1); + } + + default: + { + break; + } } } + + return node; } //------------------------------------------------------------------------ @@ -2134,8 +2142,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou } else { - // A contained CreateScalarUnsafe is special in that we're not containing it to load from - // memory and it isn't a constant. Instead, its essentially a "transparent" node we're ignoring + // In a few cases, we contain an operand that isn't a load from memory or a constant. Instead, + // it is essentially a "transparent" node we're ignoring or handling specially in codegen // to simplify the overall IR handling. As such, we need to "skip" such nodes when present and // get the underlying op1 so that delayFreeUse and other preferencing remains correct. @@ -2144,37 +2152,37 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou GenTree* op3 = nullptr; GenTree* op4 = nullptr; GenTree* op5 = nullptr; - GenTree* lastOp = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(numArgs)); + GenTree* lastOp = SkipContainedUnaryOp(intrinsicTree->Op(numArgs)); switch (numArgs) { case 5: { - op5 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(5)); + op5 = SkipContainedUnaryOp(intrinsicTree->Op(5)); FALLTHROUGH; } case 4: { - op4 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(4)); + op4 = SkipContainedUnaryOp(intrinsicTree->Op(4)); FALLTHROUGH; } case 3: { - op3 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(3)); + op3 = SkipContainedUnaryOp(intrinsicTree->Op(3)); FALLTHROUGH; } case 2: { - op2 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(2)); + op2 = SkipContainedUnaryOp(intrinsicTree->Op(2)); FALLTHROUGH; } case 1: { - op1 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(1)); + op1 = SkipContainedUnaryOp(intrinsicTree->Op(1)); break; } @@ -2223,11 +2231,14 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // must be handled within the case. 
switch (intrinsicId) { + case NI_Vector128_CreateScalar: + case NI_Vector256_CreateScalar: + case NI_Vector512_CreateScalar: case NI_Vector128_CreateScalarUnsafe: - case NI_Vector128_ToScalar: case NI_Vector256_CreateScalarUnsafe: - case NI_Vector256_ToScalar: case NI_Vector512_CreateScalarUnsafe: + case NI_Vector128_ToScalar: + case NI_Vector256_ToScalar: case NI_Vector512_ToScalar: { assert(numArgs == 1); @@ -2241,17 +2252,38 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou } else { - // We will either be in memory and need to be moved - // into a register of the appropriate size or we - // are already in an XMM/YMM/ZMM register and can stay - // where we are. + // CreateScalarUnsafe and ToScalar are essentially no-ops for floating point types and can reuse + // the op1 register. CreateScalar needs to clear the upper elements, so if we have a float and + // can't use insertps to zero the upper elements in-place, we'll need a different target reg. - tgtPrefUse = BuildUse(op1); + RefPosition* op1Use = BuildUse(op1); srcCount += 1; + + if ((baseType == TYP_FLOAT) && HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId) && + !compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + setDelayFree(op1Use); + } + else + { + tgtPrefUse = op1Use; + } } buildUses = false; } +#if TARGET_X86 + else if (varTypeIsByte(baseType) && HWIntrinsicInfo::IsVectorToScalar(intrinsicId)) + { + dstCandidates = allByteRegs(); + } + else if (varTypeIsLong(baseType) && !compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + // For SSE2 fallbacks, we will need a temp register to insert the upper half of a long + buildInternalFloatRegisterDefForNode(intrinsicTree); + setInternalRegsDelayFree = true; + } +#endif // TARGET_X86 break; }
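To make the register constraints above concrete, here is a standalone sketch (intrinsic C++, not JIT code; function names are invented and the exact instruction selection in codegen may differ): with SSE4.1, insertps lets CreateScalar(float) zero the upper elements in place; the SSE2 fallback merges into a separately zeroed register, which is why the source is marked delay-free; and the 32-bit SSE2 fallback for a long builds the value from two halves through a temporary register.

#include <immintrin.h>
#include <cstdint>

// CreateScalar(float) with SSE4.1: imm 0x0E zeroes elements 1-3 in place,
// so the source and destination can share a register.
__m128 create_scalar_f32_sse41(__m128 src)
{
    return _mm_insert_ps(src, src, 0x0E);
}

// Without SSE4.1: merge the scalar into a zeroed register with movss; the
// destination must differ from the source register.
__m128 create_scalar_f32_sse2(__m128 src)
{
    return _mm_move_ss(_mm_setzero_ps(), src);
}

// CreateScalar(long) on 32-bit x86 without SSE4.1: move each 32-bit half into
// its own XMM register (the temp) and interleave them into element 0.
__m128i create_scalar_i64_sse2_x86(uint32_t lo, uint32_t hi)
{
    __m128i vlo = _mm_cvtsi32_si128((int)lo);   // movd, upper lanes zeroed
    __m128i vhi = _mm_cvtsi32_si128((int)hi);   // temp register holding the upper half
    return _mm_unpacklo_epi32(vlo, vhi);        // punpckldq -> [lo, hi, 0, 0]
}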