[AARCH64] Fold sve mul intrinsics using -1 to neg #156906
Conversation
Fold SVE mul and mul_u intrinsics (signed and unsigned) to SVE neg intrinsics if one of their operands is a splat value of -1. Do not perform this optimization for SVE mul intrinsics operating on floating-point values, similar to GCC; I copied this particular behaviour and it might need follow-up. This change inserts a new instruction and can therefore not be integrated into the changes introduced by c192737 and 1997073.
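For illustration, a minimal IR sketch of the fold, mirroring the pattern exercised by the new tests below (%pg and %a stand in for an arbitrary predicate and vector operand):

; Before the fold: multiply by a splat of -1.
%splat = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
%res = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %splat)

; After the fold: the multiply becomes a predicated negation of %a.
%res = call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %a)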
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-aarch64 Author: Martin Wehking (MartinWehking) Changes: Fold SVE mul and mul_u intrinsics (signed and unsigned) to SVE neg intrinsics if one of their operands is a splat value of -1. Do not perform this optimization for SVE mul intrinsics operating on floating-point values, similar to GCC. This change inserts a new instruction and can therefore not be integrated into the changes introduced by c192737 and 1997073. Full diff: https://github.com/llvm/llvm-project/pull/156906.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 66272988889a2..532247c8f3b40 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2277,6 +2277,39 @@ static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
}
}
+static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
+ IntrinsicInst &II) {
+ Value *PG = II.getOperand(0);
+ Value *Op1 = II.getOperand(1);
+ Value *Op2 = II.getOperand(2);
+
+ // Return true if a given instruction is a negative unit splat value, false
+ // otherwise.
+ auto IsNegUnitSplat = [](auto *I) {
+ auto *SplatValue = getSplatValue(I);
+ ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
+ if (!SplatConstantInt)
+ return false;
+ APInt SCIV = SplatConstantInt->getValue();
+ const int64_t IntValue = SCIV.getSExtValue();
+ return IntValue == -1;
+ };
+
+ if (IsNegUnitSplat(Op1)) {
+ auto *NEG = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
+ {II.getType()}, {Op2, PG, Op2});
+ return IC.replaceInstUsesWith(II, NEG);
+ }
+
+ if (IsNegUnitSplat(Op2)) {
+ auto *NEG = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
+ {II.getType()}, {Op1, PG, Op1});
+ return IC.replaceInstUsesWith(II, NEG);
+ }
+
+ return std::nullopt;
+}
+
static std::optional<Instruction *>
instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
// Bail due to missing support for ISD::STRICT_ scalable vector operations.
@@ -2852,6 +2885,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
Intrinsic::aarch64_sve_mla_u>(
IC, II, true);
+ case Intrinsic::aarch64_sve_mul:
+ case Intrinsic::aarch64_sve_mul_u:
+ return instCombineSVEVectorMul(IC, II);
case Intrinsic::aarch64_sve_sub:
return instCombineSVEVectorSub(IC, II);
case Intrinsic::aarch64_sve_sub_u:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-to-neg-fold.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-to-neg-fold.ll
new file mode 100644
index 0000000000000..a620fee7222ab
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-to-neg-fold.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Muls with (-1) as operand should fold to neg.
+define <vscale x 8 x i16> @mul_neg_fold_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @mul_neg_fold_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> [[A]], <vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]])
+; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+ ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @mul_neg_fold_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @mul_neg_fold_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]])
+; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
+;
+ %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
+ %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @mul_neg_fold_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+ ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 8 x i16> @mul_neg_fold_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+ ; Edge case -- make sure that the case where we're multiplying two dups
+ ; together is sane.
+; CHECK-LABEL: define <vscale x 8 x i16> @mul_neg_fold_two_dups(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = select <vscale x 8 x i1> [[PG]], <vscale x 8 x i16> splat (i16 1), <vscale x 8 x i16> splat (i16 -1)
+; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %3 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2)
+ ret <vscale x 8 x i16> %3
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @mul_neg_fold_different_argument_order(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+ ; Different argument order to the above tests.
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)
+ ret <vscale x 2 x i64> %2
+}
+
+; Non foldable muls -- we don't expect these to be optimised out.
+define <vscale x 8 x i16> @no_mul_neg_fold_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @no_mul_neg_fold_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> splat (i16 -2))
+; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -2)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+ ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @no_mul_neg_fold_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @no_mul_neg_fold_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> splat (i32 -2))
+; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+;
+ %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -2)
+ %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @no_mul_neg_fold_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @no_mul_neg_fold_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> splat (i64 -2))
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -2)
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+ ret <vscale x 2 x i64> %2
+}
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-to-neg-fold.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-to-neg-fold.ll
new file mode 100644
index 0000000000000..ee179a57a0cae
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-to-neg-fold.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Muls with (-1) as operand should fold to neg.
+define <vscale x 8 x i16> @mul_neg_fold_u_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @mul_neg_fold_u_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.neg.nxv8i16(<vscale x 8 x i16> [[A]], <vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]])
+; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+ ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @mul_neg_fold_u_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @mul_neg_fold_u_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.neg.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]])
+; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
+;
+ %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -1)
+ %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_u_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @mul_neg_fold_u_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+ ret <vscale x 2 x i64> %2
+}
+
+define <vscale x 8 x i16> @mul_neg_fold_u_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+ ; Edge case -- make sure that the case where we're multiplying two dups
+ ; together is sane.
+; CHECK-LABEL: define <vscale x 8 x i16> @mul_neg_fold_u_two_dups(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret <vscale x 8 x i16> splat (i16 1)
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -1)
+ %3 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2)
+ ret <vscale x 8 x i16> %3
+}
+
+define <vscale x 2 x i64> @mul_neg_fold_u_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @mul_neg_fold_u_different_argument_order(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.neg.nxv2i64(<vscale x 2 x i64> [[A]], <vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]])
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -1)
+ ; Different argument order to the above tests.
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %1, <vscale x 2 x i64> %a)
+ ret <vscale x 2 x i64> %2
+}
+
+; Non foldable muls -- we don't expect these to be optimised out.
+define <vscale x 8 x i16> @no_mul_neg_fold_u_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @no_mul_neg_fold_u_i16(
+; CHECK-SAME: <vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> [[PG]], <vscale x 8 x i16> [[A]], <vscale x 8 x i16> splat (i16 -2))
+; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP1]]
+;
+ %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 -2)
+ %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
+ ret <vscale x 8 x i16> %2
+}
+
+define <vscale x 4 x i32> @no_mul_neg_fold_u_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @no_mul_neg_fold_u_i32(
+; CHECK-SAME: <vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> [[PG]], <vscale x 4 x i32> [[A]], <vscale x 4 x i32> splat (i32 -2))
+; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]]
+;
+ %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 -2)
+ %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
+ ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 2 x i64> @no_mul_neg_fold_u_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @no_mul_neg_fold_u_i64(
+; CHECK-SAME: <vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> [[PG]], <vscale x 2 x i64> [[A]], <vscale x 2 x i64> splat (i64 -2))
+; CHECK-NEXT: ret <vscale x 2 x i64> [[TMP1]]
+;
+ %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 -2)
+ %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
+ ret <vscale x 2 x i64> %2
+}
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.u.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.u.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.u.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+attributes #0 = { "target-features"="+sve" }
@k-arrows - FYI this PR has been discussed offline, where I've asked for an isel-based solution that is currently being developed.
Thank you for letting me know. Just to clarify, I am not in a hurry. Please take your time.
@paulwalker-arm @k-arrows |