[GlobalIsel] Combine ADDO #82927

Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel

Author: Thorsten Schütt (tschuett)

Changes

Perform the requested arithmetic and produce a carry output in addition to the normal result. Clang has them as builtins (__builtin_add_overflow_p). The middle end has intrinsics for them (sadd_with_overflow).

AArch64: ADDS Add and set flags

On Neoverse V2, they run at half the throughput of basic arithmetic and have a limited set of pipelines.

Patch is 145.50 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/82927.diff

13 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 23728636498ba0..9e8fc5d635c50a 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -696,10 +696,6 @@ class CombinerHelper {
/// (G_*MULO x, 0) -> 0 + no carry out
bool matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo);
- /// Match:
- /// (G_*ADDO x, 0) -> x + no carry out
- bool matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo);
-
/// Match:
/// (G_*ADDE x, y, 0) -> (G_*ADDO x, y)
/// (G_*SUBE x, y, 0) -> (G_*SUBO x, y)
@@ -810,12 +806,15 @@ class CombinerHelper {
/// Combine selects.
bool matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo);
- /// Combine ands,
+ /// Combine ands.
bool matchAnd(MachineInstr &MI, BuildFnTy &MatchInfo);
- /// Combine ors,
+ /// Combine ors.
bool matchOr(MachineInstr &MI, BuildFnTy &MatchInfo);
+ /// Combine addos.
+ bool matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo);
+
private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
@@ -919,6 +918,7 @@ class CombinerHelper {
bool isZeroOrZeroSplat(Register Src, bool AllowUndefs);
bool isConstantSplatVector(Register Src, int64_t SplatValue,
bool AllowUndefs);
+ bool isConstantOrConstantVectorI(Register Src) const;
std::optional<APInt> getConstantOrConstantSplatVector(Register Src);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index f5a6528d10a973..6b03703192df91 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -359,6 +359,8 @@ class GBinOpCarryOut : public GenericMachineInstr {
Register getCarryOutReg() const { return getReg(1); }
MachineOperand &getLHS() { return getOperand(2); }
MachineOperand &getRHS() { return getOperand(3); }
+ Register getLHSReg() const { return getOperand(2).getReg(); }
+ Register getRHSReg() const { return getOperand(3).getReg(); }
static bool classof(const MachineInstr *MI) {
switch (MI->getOpcode()) {
@@ -429,6 +431,23 @@ class GAddSubCarryOut : public GBinOpCarryOut {
}
};
+/// Represents overflowing add operations.
+/// G_UADDO, G_SADDO
+class GAddCarryOut : public GBinOpCarryOut {
+public:
+ bool isSigned() const { return getOpcode() == TargetOpcode::G_SADDO; }
+
+ static bool classof(const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case TargetOpcode::G_UADDO:
+ case TargetOpcode::G_SADDO:
+ return true;
+ default:
+ return false;
+ }
+ }
+};
+
/// Represents overflowing add/sub operations that also consume a carry-in.
/// G_UADDE, G_SADDE, G_USUBE, G_SSUBE
class GAddSubCarryInOut : public GAddSubCarryOut {
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 17757ca3e41111..2ec19c68d20c42 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1059,12 +1059,6 @@ def mulo_by_0: GICombineRule<
[{ return Helper.matchMulOBy0(*${root}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
-def addo_by_0: GICombineRule<
- (defs root:$root, build_fn_matchinfo:$matchinfo),
- (match (wip_match_opcode G_UADDO, G_SADDO):$root,
- [{ return Helper.matchAddOBy0(*${root}, ${matchinfo}); }]),
- (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
-
// Transform (uadde x, y, 0) -> (uaddo x, y)
// (sadde x, y, 0) -> (saddo x, y)
// (usube x, y, 0) -> (usubo x, y)
@@ -1260,6 +1254,12 @@ def match_ors : GICombineRule<
[{ return Helper.matchOr(*${root}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+def match_addos : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_SADDO, G_UADDO):$root,
+ [{ return Helper.matchAddOverflow(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
// Combines concat operations
def concat_matchinfo : GIDefMatchData<"SmallVector<Register>">;
def combine_concat_vector : GICombineRule<
@@ -1295,7 +1295,7 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero,
def const_combines : GICombineGroup<[constant_fold_fp_ops, const_ptradd_to_i2p,
overlapping_and, mulo_by_2, mulo_by_0,
- addo_by_0, adde_to_addo,
+ adde_to_addo,
combine_minmax_nan]>;
def known_bits_simplifications : GICombineGroup<[
@@ -1343,7 +1343,7 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
and_or_disjoint_mask, fma_combines, fold_binop_into_select,
sub_add_reg, select_to_minmax, redundant_binop_in_equality,
fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
- combine_concat_vector]>;
+ combine_concat_vector, match_addos]>;
// A combine group used to for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index e8a5c6fedc395a..f5ac67a642a6ad 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -4936,24 +4936,6 @@ bool CombinerHelper::matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
return true;
}
-bool CombinerHelper::matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
- // (G_*ADDO x, 0) -> x + no carry out
- assert(MI.getOpcode() == TargetOpcode::G_UADDO ||
- MI.getOpcode() == TargetOpcode::G_SADDO);
- if (!mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICstOrSplat(0)))
- return false;
- Register Carry = MI.getOperand(1).getReg();
- if (!isConstantLegalOrBeforeLegalizer(MRI.getType(Carry)))
- return false;
- Register Dst = MI.getOperand(0).getReg();
- Register LHS = MI.getOperand(2).getReg();
- MatchInfo = [=](MachineIRBuilder &B) {
- B.buildCopy(Dst, LHS);
- B.buildConstant(Carry, 0);
- };
- return true;
-}
-
bool CombinerHelper::matchAddEToAddO(MachineInstr &MI, BuildFnTy &MatchInfo) {
// (G_*ADDE x, y, 0) -> (G_*ADDO x, y)
// (G_*SUBE x, y, 0) -> (G_*SUBO x, y)
@@ -6354,6 +6336,26 @@ CombinerHelper::getConstantOrConstantSplatVector(Register Src) {
return Value;
}
+// FIXME G_SPLAT_VECTOR
+bool CombinerHelper::isConstantOrConstantVectorI(Register Src) const {
+ auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI);
+ if (IConstant)
+ return true;
+
+ GBuildVector *BuildVector = getOpcodeDef<GBuildVector>(Src, MRI);
+ if (!BuildVector)
+ return false;
+
+ unsigned NumSources = BuildVector->getNumSources();
+ for (unsigned I = 0; I < NumSources; ++I) {
+ std::optional<ValueAndVReg> IConstant =
+ getIConstantVRegValWithLookThrough(BuildVector->getSourceReg(I), MRI);
+ if (!IConstant)
+ return false;
+ }
+ return true;
+}
+
// TODO: use knownbits to determine zeros
bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select,
BuildFnTy &MatchInfo) {
@@ -6918,3 +6920,199 @@ bool CombinerHelper::matchOr(MachineInstr &MI, BuildFnTy &MatchInfo) {
return false;
}
+
+bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
+ GAddCarryOut *Add = cast<GAddCarryOut>(&MI);
+
+ // Addo has no flags
+ Register Dst = Add->getReg(0);
+ Register Carry = Add->getReg(1);
+ Register LHS = Add->getLHSReg();
+ Register RHS = Add->getRHSReg();
+ bool IsSigned = Add->isSigned();
+ LLT DstTy = MRI.getType(Dst);
+ LLT CarryTy = MRI.getType(Carry);
+
+ // We want do fold the [u|s]addo.
+ if (!MRI.hasOneNonDBGUse(Dst))
+ return false;
+
+ // Fold addo, if the carry is dead -> add, undef.
+ if (MRI.use_nodbg_empty(Carry) &&
+ isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}})) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildAdd(Dst, LHS, RHS);
+ B.buildUndef(Carry);
+ };
+ return true;
+ }
+
+ // We want do fold the [u|s]addo.
+ if (!MRI.hasOneNonDBGUse(Carry))
+ return false;
+
+ // Canonicalize constant to RHS.
+ if (isConstantOrConstantVectorI(LHS) && !isConstantOrConstantVectorI(RHS)) {
+ if (IsSigned) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildSAddo(Dst, Carry, RHS, LHS);
+ };
+ return true;
+ } else {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildUAddo(Dst, Carry, RHS, LHS);
+ };
+ return true;
+ }
+ }
+
+ std::optional<APInt> MaybeLHS = getConstantOrConstantSplatVector(LHS);
+ std::optional<APInt> MaybeRHS = getConstantOrConstantSplatVector(RHS);
+
+ // Fold addo(c1, c2) -> c3, carry.
+ if (MaybeLHS && MaybeRHS && isConstantLegalOrBeforeLegalizer(DstTy) &&
+ isConstantLegalOrBeforeLegalizer(CarryTy)) {
+ // They must both have the same bitwidth. Otherwise APInt might
+ // assert. Pre legalization, they may have widely different bitwidths.
+ unsigned BitWidth =
+ std::max(MaybeLHS->getBitWidth(), MaybeRHS->getBitWidth());
+ bool Overflow;
+ APInt Result;
+ if (IsSigned) {
+ APInt LHS = MaybeLHS->sext(BitWidth);
+ APInt RHS = MaybeRHS->sext(BitWidth);
+ Result = LHS.sadd_ov(RHS, Overflow);
+ } else {
+ APInt LHS = MaybeLHS->zext(BitWidth);
+ APInt RHS = MaybeRHS->zext(BitWidth);
+ Result = LHS.uadd_ov(RHS, Overflow);
+ }
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildConstant(Dst, Result);
+ B.buildConstant(Carry, Overflow);
+ };
+ return true;
+ }
+
+ // Fold (addo x, 0) -> x, no borrow
+ if (MaybeRHS && *MaybeRHS == 0 && isConstantLegalOrBeforeLegalizer(CarryTy)) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildCopy(Dst, LHS);
+ B.buildConstant(Carry, 0);
+ };
+ return true;
+ }
+
+ // Given 2 constant operands whose sum does not overflow:
+ // uaddo (X +nuw C0), C1 -> uaddo X, C0 + C1
+ // saddo (X +nsw C0), C1 -> saddo X, C0 + C1
+ GAdd *AddLHS = getOpcodeDef<GAdd>(LHS, MRI);
+ if (MaybeRHS && AddLHS && MRI.hasOneNonDBGUse(Add->getReg(0)) &&
+ ((IsSigned && AddLHS->getFlag(MachineInstr::MIFlag::NoSWrap)) ||
+ (!IsSigned && AddLHS->getFlag(MachineInstr::MIFlag::NoUWrap)))) {
+ std::optional<APInt> MaybeAddRHS =
+ getConstantOrConstantSplatVector(AddLHS->getRHSReg());
+ if (MaybeAddRHS) {
+ unsigned BitWidth =
+ std::max(MaybeRHS->getBitWidth(), MaybeAddRHS->getBitWidth());
+ bool Overflow;
+ APInt NewC;
+ if (IsSigned) {
+ APInt LHS = MaybeRHS->sext(BitWidth);
+ APInt RHS = MaybeAddRHS->sext(BitWidth);
+ NewC = LHS.sadd_ov(RHS, Overflow);
+ } else {
+ APInt LHS = MaybeRHS->zext(BitWidth);
+ APInt RHS = MaybeAddRHS->zext(BitWidth);
+ NewC = LHS.uadd_ov(RHS, Overflow);
+ }
+ if (!Overflow && isConstantLegalOrBeforeLegalizer(DstTy)) {
+ if (IsSigned) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto ConstRHS = B.buildConstant(DstTy, NewC);
+ B.buildSAddo(Dst, Carry, AddLHS->getLHSReg(), ConstRHS);
+ };
+ return true;
+ } else {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto ConstRHS = B.buildConstant(DstTy, NewC);
+ B.buildUAddo(Dst, Carry, AddLHS->getLHSReg(), ConstRHS);
+ };
+ return true;
+ }
+ }
+ }
+ };
+
+ // We try to combine uaddo to non-overflowing add.
+ if (!IsSigned && isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}}) &&
+ isConstantLegalOrBeforeLegalizer(DstTy)) {
+ ConstantRange CRLHS =
+ ConstantRange::fromKnownBits(KB->getKnownBits(LHS), false /*IsSigned*/);
+ ConstantRange CRRHS =
+ ConstantRange::fromKnownBits(KB->getKnownBits(RHS), false /*IsSigned*/);
+
+ switch (CRLHS.unsignedAddMayOverflow(CRRHS)) {
+ case ConstantRange::OverflowResult::MayOverflow:
+ return false;
+ case ConstantRange::OverflowResult::NeverOverflows: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildAdd(Dst, LHS, RHS, MachineInstr::MIFlag::NoUWrap);
+ B.buildConstant(Carry, 0);
+ };
+ return true;
+ }
+ case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+ case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildAdd(Dst, LHS, RHS);
+ B.buildConstant(Carry, 1);
+ };
+ return true;
+ }
+ };
+ return false;
+ };
+
+ // We try to combine saddo to non-overflowing add.
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}}) ||
+ !isConstantLegalOrBeforeLegalizer(CarryTy))
+ return false;
+
+ // If LHS and RHS each have at least two sign bits, then there is no signed
+ // overflow.
+ if (KB->computeNumSignBits(LHS) > 1 && KB->computeNumSignBits(RHS) > 1) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildAdd(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap);
+ B.buildConstant(Carry, 0);
+ };
+ return true;
+ }
+
+ ConstantRange CRLHS =
+ ConstantRange::fromKnownBits(KB->getKnownBits(LHS), true /*IsSigned*/);
+ ConstantRange CRRHS =
+ ConstantRange::fromKnownBits(KB->getKnownBits(RHS), true /*IsSigned*/);
+
+ switch (CRLHS.signedAddMayOverflow(CRRHS)) {
+ case ConstantRange::OverflowResult::MayOverflow:
+ return false;
+ case ConstantRange::OverflowResult::NeverOverflows: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildAdd(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap);
+ B.buildConstant(Carry, 0);
+ };
+ return true;
+ }
+ case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+ case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildAdd(Dst, LHS, RHS);
+ B.buildConstant(Carry, 1);
+ };
+ return true;
+ }
+ };
+
+ return false;
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
new file mode 100644
index 00000000000000..2967230cea6174
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
@@ -0,0 +1,94 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s
+
+---
+name: add_unused
+body: |
+ bb.0:
+ liveins: $w0, $w1
+ ; CHECK-LABEL: name: add_unused
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+ ; CHECK-NEXT: %add:_(s32) = G_ADD [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $w0 = COPY %add(s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = COPY $w1
+ %add:_(s32), %o:_(s1) = G_SADDO %0, %1
+ $w0 = COPY %add(s32)
+ RET_ReallyLR implicit $w0
+...
+---
+name: add_canon
+body: |
+ bb.0:
+ liveins: $w0, $w1
+ ; CHECK-LABEL: name: add_canon
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w1
+ ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 10
+ ; CHECK-NEXT: %add:_(s32), %o:_(s1) = G_SADDO [[COPY]], %const
+ ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+ ; CHECK-NEXT: $w0 = COPY %add(s32)
+ ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = COPY $w1
+ %const:_(s32) = G_CONSTANT i32 10
+ %add:_(s32), %o:_(s1) = G_SADDO %const, %1
+ %o_wide:_(s32) = G_ZEXT %o(s1)
+ $w0 = COPY %add(s32)
+ $w1 = COPY %o_wide
+ RET_ReallyLR implicit $w0
+...
+---
+name: add_const_fold
+body: |
+ bb.0:
+ liveins: $w0, $w1
+ ; CHECK-LABEL: name: add_const_fold
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %add:_(s32) = G_CONSTANT i32 21
+ ; CHECK-NEXT: %o_wide:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: $w0 = COPY %add(s32)
+ ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = COPY $w1
+ %const:_(s32) = G_CONSTANT i32 10
+ %const1:_(s32) = G_CONSTANT i32 11
+ %add:_(s32), %o:_(s1) = G_UADDO %const, %const1
+ %o_wide:_(s32) = G_ZEXT %o(s1)
+ $w0 = COPY %add(s32)
+ $w1 = COPY %o_wide
+ RET_ReallyLR implicit $w0
+...
+---
+name: add_add_zero
+body: |
+ bb.0:
+ liveins: $w0, $w1
+ ; CHECK-LABEL: name: add_add_zero
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w2
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: $w0 = COPY [[COPY]](s32)
+ ; CHECK-NEXT: $w1 = COPY [[C]](s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = COPY $w1
+ %2:_(s32) = COPY $w2
+ %const:_(s32) = G_CONSTANT i32 10
+ %addl:_(s32) = nsw G_ADD %2, %const
+ %const1:_(s32) = G_CONSTANT i32 -10
+ %add:_(s32), %o:_(s1) = G_SADDO %addl, %const1
+ %o_wide:_(s32) = G_ZEXT %o(s1)
+ $w0 = COPY %add(s32)
+ $w1 = COPY %o_wide
+ RET_ReallyLR implicit $w0
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
index 94f56e5650b22f..9483cbf06f4057 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple aarch64 -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombiner-only-enable-rule="addo_by_0" -global-isel -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple aarch64 -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombiner-only-enable-rule="match_addos" -global-isel -verify-machineinstrs %s -o - | FileCheck %s
# REQUIRES: asserts
# (G_*ADDO x, 0) -> x + no carry
diff --git a/llvm/test/CodeGen/AArch64/overflow.ll b/llvm/test/CodeGen/AArch64/overflow.ll
index 444aaeb0f3fe75..1fd60c03097906 100644
--- a/llvm/test/CodeGen/AArch64/overflow.ll
+++ b/llvm/test/CodeGen/AArch64/overflow.ll
@@ -19,20 +19,12 @@ entry:
}
define zeroext i1 @saddo1.i32.fold(i32 %v1, i32 %v2, ptr %res) {
-; SDAG-LABEL: saddo1.i32.fold:
-; SDAG: // %bb.0: // %entry
-; SDAG-NEXT: mov w8, #20 // =0x14
-; SDAG-NEXT: mov w0, wzr
-; SDAG-NEXT: str w8, [x2]
-; SDAG-NEXT: ret
-;
-; GISEL-LABEL: saddo1.i32.fold:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: mov w8, #9 // =0x9
-; GISEL-NEXT: adds w8, w8, #11
-; GISEL-NEXT: cset w0, vs
-; GISEL-NEXT: str w8, [x2]
-; GISEL-NEXT: ret
+; CHECK-LABEL: saddo1.i32.fold:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #20 // =0x14
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: str w8, [x2]
+; CHECK-NEXT: ret
entry:
%t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 9, i32 11)
%val = extractvalue {i32, i1} %t, 0
@@ -123,18 +115,11 @@ entry:
}
define zeroext i1 @saddo.canon.i32(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) {
-; SDAG-LABEL: saddo.canon.i32:
-; SDAG: // %bb.0: // %entry
-; SDAG-NEXT: mov w0, wzr
-; SDAG-NEXT: str w4, [x5]
-; SDAG-NEXT: ret
-;
-; GISEL-LABEL: saddo.canon.i32:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: adds w8, wzr, w4
-; GISEL-NEXT: cset w0, vs
-; GISEL-NEXT: str w8, [x5]
-; GISEL-NEXT: ret
+; CHECK-LABEL: saddo.canon.i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: str w4, [x5]
+; CHECK-NEXT: ret
entry:
%t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 0, i32 %v5)
...
[truncated]
For @uaddo.select.i64, known bits seems to fail.
For AMDGPU, there seems to be a lot of noise and some size improvements.
For AArch64, when the condition register of a scalar select is def'd by an overflow op, we could select fewer instructions.
@@ -429,6 +431,23 @@ class GAddSubCarryOut : public GBinOpCarryOut {
}
};

/// Represents overflowing add operations.
/// G_UADDO, G_SADDO
class GAddCarryOut : public GBinOpCarryOut {
There is an odd interference with the more general GAddSubCarryOut. Do you really need this class?
The common pattern is to assert that only the expected opcode is in MI. I use cast<GAddCarryOut>. I don't want an unnoticed sub to come in.
GAddSubCarryOut has an isSub() method for that purpose.
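For illustration, a sketch of the alternative being discussed here: reuse the more general GAddSubCarryOut and reject subtraction explicitly instead of introducing GAddCarryOut. Names are taken from the patch; this is not the committed code.

// Hypothetical alternative: keep cast<GAddSubCarryOut> and bail out on the
// subtraction opcodes, relying on isSub()/isSigned() from the base class.
GAddSubCarryOut *AddSub = cast<GAddSubCarryOut>(&MI);
if (AddSub->isSub())
  return false; // only G_UADDO / G_SADDO are handled by this combine
bool IsSigned = AddSub->isSigned();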
}

// We want do fold the [u|s]addo.
if (!MRI.hasOneNonDBGUse(Carry))
It is not obvious why multiple uses prevent folding / make it unprofitable. Could you clarify the comment?
Same for Dst above.
Sure, will do. If we have more than one use, then we generate new instructions and keep the old ones.
(match (wip_match_opcode G_SADDO, G_UADDO):$root,
[{ return Helper.matchAddOverflow(*${root}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
I don't know about the current direction, but if we're going to tablegen as much as possible, then we would need separate rules for each of the transformations (constant folding / swapping constant to RHS / replacing with G_ADD etc.).
}

std::optional<APInt> MaybeLHS = getConstantOrConstantSplatVector(LHS);
std::optional<APInt> MaybeRHS = getConstantOrConstantSplatVector(RHS);
early exit after RHS, then LHS
We can still try the known bits optimizations, even when both are std::nullopt.
@@ -6918,3 +6920,200 @@ bool CombinerHelper::matchOr(MachineInstr &MI, BuildFnTy &MatchInfo) {
return false;
}

bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
GAddCarryOut *Add = cast<GAddCarryOut>(&MI);
Replacing this line with:
GAddSubCarryOut *Add = cast<GAddSubCarryOut>(&MI);
seems dangerous and defeats the purpose of the assert in the cast.
// Fold (addo x, 0) -> x, no borrow
if (MaybeRHS && *MaybeRHS == 0 && isConstantLegalOrBeforeLegalizer(CarryTy)) {
MatchInfo = [=](MachineIRBuilder &B) {
B.buildCopy(Dst, LHS);
B.buildConstant(Carry, 0);
};
return true;
} |
could be moved to pure tablegen as a separate pattern
I removed the bit-width handling and another else-after-return.
LGTM. I think it would be better to split these different combines into separate tablegen-defined roots.
%const:_(s32) = G_CONSTANT i32 10
%addl:_(s32) = nsw G_ADD %2, %const
%const1:_(s32) = G_CONSTANT i32 -10
%add:_(s32), %o:_(s1) = G_SADDO %addl, %const1 |
Maybe try some non-s1 typed cases
LLT DstTy = MRI.getType(Dst);
LLT CarryTy = MRI.getType(Carry);

// We want do fold the [u|s]addo.
Typo "want to", but the comment seems unrelated to the code. Why would multiple uses of the top level instruction prevent combining?
If the result Dst has multiple uses, then we cannot replace it.
Yes we can and we should. The combine will still be correct and beneficial. You can add tests for the multiple-use case.
As a general rule, if you are matching a pattern, you only need one-use checks for the inner nodes in the pattern, not the outer node. The reason for the checks is to ensure that when you remove the outer node, the inner nodes also get removed.
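To sketch how that rule would apply to this combine (illustrative only, reusing names from the patch; not the committed code):

// One-use check on the *inner* add: the (G_*ADDO (G_ADD x, C0), C1) fold is
// only a win if the feeding G_ADD dies once the outer addo is rewritten.
GAdd *AddLHS = getOpcodeDef<GAdd>(LHS, MRI);
if (AddLHS && MRI.hasOneNonDBGUse(AddLHS->getReg(0))) {
  // Safe to fold into (G_*ADDO x, C0 + C1); the inner G_ADD becomes dead.
}
// Extra uses of Dst or Carry on the outer G_*ADDO need no check: the rewrite
// redefines those registers, so every use sees the new value and the combine
// stays correct and profitable.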
return true;
}

// We want do fold the [u|s]addo.
Likewise.
B.buildConstant(Dst, Result);
B.buildConstant(Carry, Overflow);
In the vector-typed case don't you need to build new splat constants here? The patch is missing vector-typed tests for all these folds.
AArch64 does not support vectorized overflow ops. buildConstant builds under the hood scalars or build vectors. Support for G_SPLAT_VECTOR is still missing.
> AArch64 does not support vectorized overflow ops.

You should still add the tests. Your combine-overflow.mir test runs pre-legalization, so any MIR should be allowed there.

> buildConstant builds under the hood scalars or build vectors.

Ah! I didn't know that, thanks.
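A small sketch of the buildConstant behaviour mentioned above (my reading of MachineIRBuilder, not code from the patch; B is the MachineIRBuilder passed to the MatchInfo lambda):

LLT S32 = LLT::scalar(32);
LLT V4S32 = LLT::fixed_vector(4, S32);
auto ScalarC = B.buildConstant(S32, 42);   // %c:_(s32) = G_CONSTANT i32 42
auto SplatC = B.buildConstant(V4S32, 42);  // G_BUILD_VECTOR splat of G_CONSTANT i32 42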
return true;
}

// Fold (addo x, 0) -> x, no borrow
Nit: "carry" not "borrow"
((IsSigned && AddLHS->getFlag(MachineInstr::MIFlag::NoSWrap)) ||
(!IsSigned && AddLHS->getFlag(MachineInstr::MIFlag::NoUWrap)))) {
Suggested change:
- ((IsSigned && AddLHS->getFlag(MachineInstr::MIFlag::NoSWrap)) ||
-  (!IsSigned && AddLHS->getFlag(MachineInstr::MIFlag::NoUWrap)))) {
+ AddLHS->getFlag(IsSigned ? MachineInstr::MIFlag::NoSWrap : MachineInstr::MIFlag::NoUWrap)) {
bool Overflow;
APInt NewC = IsSigned ? MaybeAddRHS->sadd_ov(*MaybeRHS, Overflow)
                      : MaybeAddRHS->uadd_ov(*MaybeRHS, Overflow);
if (!Overflow && isConstantLegalOrBeforeLegalizer(DstTy)) { |
I don't think you need to check isConstantLegalOrBeforeLegalizer(DstTy) because it's the same type as MaybeRHS and MaybeAddRHS, which we already know are constants.
MatchInfo = [=](MachineIRBuilder &B) {
B.buildSAddo(Dst, Carry, RHS, LHS);
};
return true;
Nit: personally I think early return hurts symmetry here. I would prefer if/else followed by a single "return".
The anti-symmetry is a result of another else-after-return violation.
Ping @tschuett, please address the post-commit review comments above.
Perform the requested arithmetic and produce a carry output in addition to the normal result.
Clang has them as builtins (__builtin_add_overflow_p). The middle end has intrinsics for them (sadd_with_overflow).
AArch64: ADDS Add and set flags
On Neoverse V2, they run at half the throughput of basic arithmetic and have a limited set of pipelines.
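For readers unfamiliar with these operations, here is a minimal, self-contained C++ illustration (not part of the patch) of the builtin family the description refers to. __builtin_add_overflow returns true on overflow; for matching signed int operands Clang lowers it to llvm.sadd.with.overflow, which GlobalISel models as G_SADDO and which AArch64 can select to ADDS plus a flag read.

#include <climits>
#include <cstdio>

int main() {
  int Sum = 0;
  // Returns true because INT_MAX + 1 overflows a signed 32-bit int;
  // Sum still receives the wrapped result.
  bool Overflowed = __builtin_add_overflow(INT_MAX, 1, &Sum);
  std::printf("overflowed=%d sum=%d\n", Overflowed ? 1 : 0, Sum);
  return 0;
}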