diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index d3ba9ae64ff77d..611f76d3a02691 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -486,6 +486,14 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_ArmBase_Arm64_MultiplyLongAdd: + ins = varTypeIsUnsigned(intrin.baseType) ? INS_umaddl : INS_smaddl; + break; + + case NI_ArmBase_Arm64_MultiplyLongSub: + ins = varTypeIsUnsigned(intrin.baseType) ? INS_umsubl : INS_smsubl; + break; + default: ins = HWIntrinsicInfo::lookupIns(intrin.id, intrin.baseType); break; @@ -1112,6 +1120,13 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) GetEmitter()->emitIns_R_R_R(ins, emitSize, targetReg, op2Reg, op3Reg, opt); break; } + + case NI_ArmBase_Arm64_MultiplyLongAdd: + case NI_ArmBase_Arm64_MultiplyLongSub: + assert(opt == INS_OPTS_NONE); + GetEmitter()->emitIns_R_R_R_R(ins, emitSize, targetReg, op1Reg, op2Reg, op3Reg); + break; + default: unreached(); } diff --git a/src/coreclr/jit/hwintrinsiclistarm64.h b/src/coreclr/jit/hwintrinsiclistarm64.h index 97c9b66ad149b6..3aebd051ecbb64 100644 --- a/src/coreclr/jit/hwintrinsiclistarm64.h +++ b/src/coreclr/jit/hwintrinsiclistarm64.h @@ -686,6 +686,9 @@ HARDWARE_INTRINSIC(ArmBase_Arm64, LeadingSignCount, HARDWARE_INTRINSIC(ArmBase_Arm64, LeadingZeroCount, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_clz, INS_clz, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoFloatingPointUsed) HARDWARE_INTRINSIC(ArmBase_Arm64, MultiplyHigh, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_smulh, INS_umulh, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed) HARDWARE_INTRINSIC(ArmBase_Arm64, ReverseElementBits, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_rbit, INS_rbit, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed) +HARDWARE_INTRINSIC(ArmBase_Arm64, MultiplyLongAdd, 0, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_smaddl, INS_umaddl, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_SpecialCodeGen|HW_Flag_NoFloatingPointUsed) +HARDWARE_INTRINSIC(ArmBase_Arm64, MultiplyLongSub, 0, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_smsubl, INS_umsubl, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_SpecialCodeGen|HW_Flag_NoFloatingPointUsed) +HARDWARE_INTRINSIC(ArmBase_Arm64, MultiplyLongNeg, 0, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_smnegl, INS_umnegl, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg EncodesExtraTypeArg Instructions Category Flags diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 2f0055cfd37dc0..e6afde89be04bb 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -498,9 +498,16 @@ GenTree* Lowering::LowerNode(GenTree* node) case GT_NEG: #ifdef TARGET_ARM64 + { + GenTree* next = TryLowerNegToMulLongOp(node->AsOp()); + if (next != nullptr) + { + return next; + } ContainCheckNeg(node->AsOp()); + } #endif - break; + break; case GT_SELECT: return LowerSelect(node->AsConditional()); @@ -6316,6 +6323,12 @@ GenTree* Lowering::LowerAdd(GenTreeOp* node) { return next; } + + next = 
TryLowerAddSubToMulLongOp(node); + if (next != nullptr) + { + return next; + } } #endif // TARGET_ARM64 diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 72f621dbe57409..3331d0a44975cc 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -90,6 +90,8 @@ class Lowering final : public Phase void ContainCheckNeg(GenTreeOp* neg); void TryLowerCnsIntCselToCinc(GenTreeOp* select, GenTree* cond); void TryLowerCselToCSOp(GenTreeOp* select, GenTree* cond); + GenTree* TryLowerAddSubToMulLongOp(GenTreeOp* op); + GenTree* TryLowerNegToMulLongOp(GenTreeOp* op); #endif void ContainCheckSelect(GenTreeOp* select); void ContainCheckBitCast(GenTree* node); diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 99adf319406979..2441f5170f082f 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -532,6 +532,16 @@ GenTree* Lowering::LowerBinaryArithmetic(GenTreeOp* binOp) return next; } } + + if (binOp->OperIs(GT_SUB)) + { + // Attempt to optimize for umsubl/smsubl. + GenTree* next = TryLowerAddSubToMulLongOp(binOp); + if (next != nullptr) + { + return next; + } + } #endif } @@ -2750,6 +2760,158 @@ void Lowering::TryLowerCnsIntCselToCinc(GenTreeOp* select, GenTree* cond) } } } + +//---------------------------------------------------------------------------------------------- +// TryLowerAddSubToMulLongOp: Attempt to convert ADD and SUB nodes to a combined multiply +// and add/sub operation. Conversion can only happen if the operands to the +// operation meet the following criteria: +// - One op is a MUL_LONG containing two integer operands, and the other is a long. +// +// Arguments: +// op - The ADD or SUB node to attempt an optimization on. +// +// Returns: +// A pointer to the next node to evaluate. On no operation, returns nullptr. 
+// +GenTree* Lowering::TryLowerAddSubToMulLongOp(GenTreeOp* op) +{ + assert(op->OperIs(GT_ADD, GT_SUB)); + + if (!comp->opts.OptimizationEnabled()) + return nullptr; + + if (!JitConfig.EnableHWIntrinsic()) + return nullptr; + + if (op->isContained()) + return nullptr; + + if (!varTypeIsIntegral(op)) + return nullptr; + + if (op->gtFlags & GTF_SET_FLAGS) + return nullptr; + + if (op->gtOverflow()) + return nullptr; + + GenTree* op1 = op->gtGetOp1(); + GenTree* op2 = op->gtGetOp2(); + + // Select which operation is the MUL_LONG and which is the add value. + GenTreeOp* mul; + GenTree* addVal; + if (op1->OperIs(GT_MUL_LONG)) + { + // For subtractions, the multiply must be second, as [u/s]msubl performs: + // addValue - (mulValue1 * mulValue2) + if (op->OperIs(GT_SUB)) + { + return nullptr; + } + + mul = op1->AsOp(); + addVal = op2; + } + else if (op2->OperIs(GT_MUL_LONG)) + { + mul = op2->AsOp(); + addVal = op1; + } + else + { + // Exit if neither operand is a GT_MUL_LONG. + return nullptr; + } + + // Additional value must be of long size. + if (!addVal->TypeIs(TYP_LONG)) + return nullptr; + + // Mul values must both be integers. + if (!genActualTypeIsInt(mul->gtOp1) || !genActualTypeIsInt(mul->gtOp2)) + return nullptr; + + // Create the new node and replace the original. + { + NamedIntrinsic intrinsicId = + op->OperIs(GT_ADD) ? NI_ArmBase_Arm64_MultiplyLongAdd : NI_ArmBase_Arm64_MultiplyLongSub; + GenTreeHWIntrinsic* outOp = + comp->gtNewScalarHWIntrinsicNode(TYP_LONG, mul->gtOp1, mul->gtOp2, addVal, intrinsicId); + outOp->SetSimdBaseJitType(mul->IsUnsigned() ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG); + op->ReplaceWith(outOp, comp); + } + + // Delete the hanging MUL. 
+ mul->gtOp1 = nullptr; + mul->gtOp2 = nullptr; + BlockRange().Remove(mul); + +#ifdef DEBUG + JITDUMP("Converted to HW_INTRINSIC 'NI_ArmBase_Arm64_MultiplyLong[Add/Sub]'.\n"); + if (comp->verbose) + comp->gtDispNodeName(op); + JITDUMP(":\n"); + DISPTREERANGE(BlockRange(), op); + JITDUMP("\n"); +#endif + + return op; +} + +//---------------------------------------------------------------------------------------------- +// TryLowerNegToMulLongOp: Attempt to convert NEG nodes to a combined multiply +// and negate operation. Conversion can only happen if the operands to the +// operation meet one of the following criteria: +// - op1 is a MUL_LONG containing two integer operands. +// +// Arguments: +// op - The NEG node to attempt an optimization on. +// +// Returns: +// A pointer to the next node to evaluate. On no operation, returns nullptr. +// +GenTree* Lowering::TryLowerNegToMulLongOp(GenTreeOp* op) +{ + assert(op->OperIs(GT_NEG)); + + if (!comp->opts.OptimizationEnabled()) + return nullptr; + + if (op->isContained()) + return nullptr; + + if (!varTypeIsIntegral(op)) + return nullptr; + + if (op->gtFlags & GTF_SET_FLAGS) + return nullptr; + + GenTree* op1 = op->gtGetOp1(); + + // Ensure the negated operand is a MUL_LONG. + if (!op1->OperIs(GT_MUL_LONG)) + return nullptr; + + // Ensure the MUL_LONG contains two integer parameters. + GenTreeOp* mul = op1->AsOp(); + if (!genActualTypeIsInt(mul->gtOp1) || !genActualTypeIsInt(mul->gtOp2)) + return nullptr; + + // Able to optimize, create the new node and replace the original. + { + GenTreeHWIntrinsic* outOp = + comp->gtNewScalarHWIntrinsicNode(TYP_LONG, mul->gtOp1, mul->gtOp2, NI_ArmBase_Arm64_MultiplyLongNeg); + op->ReplaceWith(outOp, comp); + } + + // Clean up hanging mul. 
+ mul->gtOp1 = nullptr; + mul->gtOp2 = nullptr; + BlockRange().Remove(mul); + + return op; +} #endif // TARGET_ARM64 //------------------------------------------------------------------------ diff --git a/src/tests/JIT/opt/Multiply/MultiplyLongOps.cs b/src/tests/JIT/opt/Multiply/MultiplyLongOps.cs new file mode 100644 index 00000000000000..6639dc170b1dd1 --- /dev/null +++ b/src/tests/JIT/opt/Multiply/MultiplyLongOps.cs @@ -0,0 +1,162 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +// Unit tests for long multiply [add/sub/neg]. + +using System; +using System.Runtime.CompilerServices; +using Xunit; + +public class MultiplyLongOpsTest +{ + + [Theory] + [InlineData(72, 6, 68L, 500L)] + [InlineData(32, 5, 40L, 200L)] + [MethodImpl(MethodImplOptions.NoInlining)] + public static void smaddl_single_cast(int op1, int op2, long op3, long expected) + { + //ARM64-FULL-LINE: smaddl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} + long result = ((long)op1 * op2) + op3; + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(72, 6, 68L, 500L)] + [InlineData(32, 5, 40L, 200L)] + [MethodImpl(MethodImplOptions.NoInlining)] + public static void smaddl_double_cast(int op1, int op2, long op3, long expected) + { + //ARM64-FULL-LINE: smaddl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} + long result = ((long)op1 * (long)op2) + op3; + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(2000000000, 5, 68L, 10000000068L)] + [MethodImpl(MethodImplOptions.NoInlining)] + public static void smaddl_no_overflow(int op1, int op2, long op3, long expected) + { + //ARM64-FULL-LINE: smaddl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} + long result = ((long)op1 * op2) + op3; + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(72, 6, 500L, 68L)] + [InlineData(32, 5, 200L, 40L)] + [MethodImpl(MethodImplOptions.NoInlining)] + public static void 
smsubl_single_cast(int op1, int op2, long op3, long expected) + { + //ARM64-FULL-LINE: smsubl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} + long result = op3 - ((long)op1 * op2); + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(72, 6, 500L, 68L)] + [InlineData(32, 5, 200L, 40L)] + [MethodImpl(MethodImplOptions.NoInlining)] + public static void smsubl_double_cast(int op1, int op2, long op3, long expected) + { + //ARM64-FULL-LINE: smsubl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} + long result = op3 - ((long)op1 * (long)op2); + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(2000000000, 5, 10000000068L, 68L)] + [MethodImpl(MethodImplOptions.NoInlining)] + public static void smsubl_no_overflow(int op1, int op2, long op3, long expected) + { + //ARM64-FULL-LINE: smsubl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} + long result = op3 - ((long)op1 * op2); + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(72, 6, 68UL, 500UL)] + [InlineData(32, 5, 40UL, 200UL)] + [MethodImpl(MethodImplOptions.NoInlining)] + public static void umaddl_single_cast(uint op1, uint op2, ulong op3, ulong expected) + { + //ARM64-FULL-LINE: umaddl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} + ulong result = ((ulong)op1 * op2) + op3; + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(72, 6, 68UL, 500UL)] + [InlineData(32, 5, 40UL, 200UL)] + [MethodImpl(MethodImplOptions.NoInlining)] + public static void umaddl_double_cast(uint op1, uint op2, ulong op3, ulong expected) + { + //ARM64-FULL-LINE: umaddl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} + ulong result = ((ulong)op1 * (ulong)op2) + op3; + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(2000000000, 5, 68UL, 10000000068UL)] + [MethodImpl(MethodImplOptions.NoInlining)] + public static void umaddl_no_overflow(uint op1, uint op2, ulong op3, ulong expected) + { + //ARM64-FULL-LINE: umaddl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} + 
ulong result = ((ulong)op1 * op2) + op3; + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(72, 6, 500UL, 68UL)] + [InlineData(32, 5, 200UL, 40UL)] + [MethodImpl(MethodImplOptions.NoInlining)] + public static void umsubl_single_cast(uint op1, uint op2, ulong op3, ulong expected) + { + //ARM64-FULL-LINE: umsubl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} + ulong result = op3 - ((ulong)op1 * op2); + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(72, 6, 500UL, 68UL)] + [InlineData(32, 5, 200UL, 40UL)] + [MethodImpl(MethodImplOptions.NoInlining)] + public static void umsubl_double_cast(uint op1, uint op2, ulong op3, ulong expected) + { + //ARM64-FULL-LINE: umsubl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} + ulong result = op3 - ((ulong)op1 * (ulong)op2); + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(2000000000, 5, 10000000068UL, 68UL)] + [MethodImpl(MethodImplOptions.NoInlining)] + public static void umsubl_no_overflow(uint op1, uint op2, ulong op3, ulong expected) + { + //ARM64-FULL-LINE: umsubl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} + ulong result = op3 - ((ulong)op1 * op2); + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(72, 6, -432L)] + [InlineData(32, 5, -160L)] + [MethodImpl(MethodImplOptions.NoInlining)] + public static void smnegl_single_cast(int op1, int op2, long expected) + { + //ARM64-FULL-LINE: smnegl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + long result = -((long)op1 * op2); + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(72, 6, -432L)] + [InlineData(32, 5, -160L)] + [MethodImpl(MethodImplOptions.NoInlining)] + public static void smnegl_double_cast(int op1, int op2, long expected) + { + //ARM64-FULL-LINE: smnegl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + long result = -((long)op1 * (long)op2); + Assert.Equal(expected, result); + } +} diff --git a/src/tests/JIT/opt/Multiply/MultiplyLongOps.csproj b/src/tests/JIT/opt/Multiply/MultiplyLongOps.csproj new file 
mode 100644 index 00000000000000..dbc3ab7f2f9596 --- /dev/null +++ b/src/tests/JIT/opt/Multiply/MultiplyLongOps.csproj @@ -0,0 +1,18 @@ +&lt;Project Sdk="Microsoft.NET.Sdk"&gt; + &lt;PropertyGroup&gt; + &lt;!-- Disasm checks require running the test in its own process --&gt; + &lt;RequiresProcessIsolation&gt;true&lt;/RequiresProcessIsolation&gt; + &lt;/PropertyGroup&gt; + &lt;PropertyGroup&gt; + &lt;DebugType&gt;None&lt;/DebugType&gt; + &lt;Optimize&gt;True&lt;/Optimize&gt; + &lt;/PropertyGroup&gt; + &lt;PropertyGroup&gt; + &lt;!-- Validates the ARM64-FULL-LINE patterns in MultiplyLongOps.cs --&gt; + &lt;HasDisasmCheck&gt;true&lt;/HasDisasmCheck&gt; + &lt;/PropertyGroup&gt; + &lt;ItemGroup&gt; + &lt;Compile Include="$(MSBuildProjectName).cs" /&gt; + &lt;/ItemGroup&gt; +&lt;/Project&gt; +