
Conversation

antoniofrighetto (Contributor) commented Jul 21, 2025

For value-accumulating recurrences of the kind:

  %umax.acc = phi i8 [ %umax, %backedge ], [ %a, %entry ]
  %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b)

The binary intrinsic may be simplified into a single call on the init
value and the other operand, provided the latter is loop-invariant:

  %umax = call i8 @llvm.umax.i8(i8 %a, i8 %b)

Proofs: https://alive2.llvm.org/ce/z/ea2cVC.

Fixes: #145875.
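
As a quick soundness sketch (not part of the PR itself): the fold only needs the property $f(f(x, y), y) = f(x, y)$, which holds for every min/max intrinsic handled here. Writing the recurrence as

$$\mathrm{acc}_0 = a, \qquad \mathrm{acc}_{k+1} = f(\mathrm{acc}_k, b),$$

induction gives $\mathrm{acc}_1 = f(a, b)$ and $\mathrm{acc}_{k+1} = f(f(a, b), b) = f(a, b)$, so the accumulated value after any positive trip count is simply $f(a, b)$.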

antoniofrighetto requested a review from dtcxzyw July 21, 2025 17:36
antoniofrighetto requested a review from nikic as a code owner July 21, 2025 17:36
llvmbot added the labels llvm:instcombine (Covers the InstCombine, InstSimplify and AggressiveInstCombine passes) and llvm:transforms Jul 21, 2025
llvmbot (Member) commented Jul 21, 2025

@llvm/pr-subscribers-llvm-transforms

Author: Antonio Frighetto (antoniofrighetto)

Full diff: https://github.com/llvm/llvm-project/pull/149858.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp (+48)
  • (added) llvm/test/Transforms/InstCombine/recurrence-binary-intrinsic.ll (+125)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index d88bc2c4901c7..c0abad6552440 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1532,6 +1532,46 @@ static Instruction *foldBitOrderCrossLogicOp(Value *V,
   return nullptr;
 }
 
+static bool foldBinaryIntrinsicRecurrence(InstCombinerImpl &IC,
+                                          IntrinsicInst *II) {
+  PHINode *PN;
+  Value *Init, *OtherOp;
+
+  // A binary intrinsic recurrence with loop-invariant operands is equivalent to
+  // `call @llvm.binary.intrinsic(Init, OtherOp)`.
+  if (!matchSimpleBinaryIntrinsicRecurrence(II, PN, Init, OtherOp) ||
+      isa<Constant>(OtherOp) || !PN->hasOneUse() ||
+      !IC.getDominatorTree().dominates(OtherOp, PN))
+    return false;
+
+  auto IID = II->getIntrinsicID();
+  switch (IID) {
+  case Intrinsic::maxnum:
+  case Intrinsic::minnum:
+  case Intrinsic::maximum:
+  case Intrinsic::minimum:
+  case Intrinsic::maximumnum:
+  case Intrinsic::minimumnum:
+  case Intrinsic::smax:
+  case Intrinsic::smin:
+  case Intrinsic::umax:
+  case Intrinsic::umin:
+    break;
+  default:
+    return false;
+  }
+
+  IC.Builder.SetInsertPoint(&*PN->getParent()->getFirstInsertionPt());
+  auto *InvariantBinaryInst =
+      cast<IntrinsicInst>(IC.Builder.CreateBinaryIntrinsic(IID, Init, OtherOp));
+  if (isa<FPMathOperator>(II))
+    InvariantBinaryInst->copyFastMathFlags(II);
+  InvariantBinaryInst->takeName(II);
+  IC.eraseInstFromFunction(*IC.replaceInstUsesWith(*II, InvariantBinaryInst));
+  IC.eraseInstFromFunction(*PN);
+  return true;
+}
+
 static Value *simplifyReductionOperand(Value *Arg, bool CanReorderLanes) {
   if (!CanReorderLanes)
     return nullptr;
@@ -3906,6 +3946,14 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
   if (Value *Reverse = foldReversedIntrinsicOperands(II))
     return replaceInstUsesWith(*II, Reverse);
 
+  // Attempt to simplify value-accumulating recurrences of kind:
+  //   %umax.acc = phi i8 [ %umax, %backedge ], [ %a, %entry ]
+  //   %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b)
+  // And let the binary intrinsic be hoisted, when the operands are known to be
+  // loop-invariant.
+  if (foldBinaryIntrinsicRecurrence(*this, II))
+    return nullptr;
+
   // Some intrinsics (like experimental_gc_statepoint) can be used in invoke
   // context, so it is handled in visitCallBase and we should trigger it.
   return visitCallBase(*II);
diff --git a/llvm/test/Transforms/InstCombine/recurrence-binary-intrinsic.ll b/llvm/test/Transforms/InstCombine/recurrence-binary-intrinsic.ll
new file mode 100644
index 0000000000000..4fa54ba6f023c
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/recurrence-binary-intrinsic.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+define i8 @simple_recurrence_intrinsic(i8 %n, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @simple_recurrence_intrinsic(
+; CHECK-SAME: i8 [[N:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw i8 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[UMAX:%.*]] = call i8 @llvm.umax.i8(i8 [[A]], i8 [[B]])
+; CHECK-NEXT:    ret i8 [[UMAX]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i8 [ %iv.next, %loop ], [ 0, %entry ]
+  %umax.acc = phi i8 [ %umax, %loop ], [ %a, %entry ]
+  %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b)
+  %iv.next = add nuw i8 %iv, 1
+  %cmp = icmp ult i8 %iv.next, %n
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i8 %umax
+}
+
+define float @simple_recurrence_intrinsic_2(i32 %n, float %a, float %b) {
+; CHECK-LABEL: define float @simple_recurrence_intrinsic_2(
+; CHECK-SAME: i32 [[N:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FMAX:%.*]] = call nnan float @llvm.maximum.f32(float [[A]], float [[B]])
+; CHECK-NEXT:    ret float [[FMAX]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32  [ %iv.next, %loop ], [ 0, %entry ]
+  %fmax.acc = phi float [ %fmax, %loop ], [ %a, %entry ]
+  %fmax = call nnan float @llvm.maximum.f32(float %fmax.acc, float %b)
+  %iv.next = add nuw i32 %iv, 1
+  %cmp = icmp ult i32 %iv.next, %n
+  br i1 %cmp, label %loop, label %exit
+exit:
+  ret float %fmax
+}
+
+define i8 @simple_recurrence_intrinsic_multiuse_umax(i8 %n, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @simple_recurrence_intrinsic_multiuse_umax(
+; CHECK-SAME: i8 [[N:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[UMAX_ACC:%.*]] = phi i8 [ [[UMAX:%.*]], %[[LOOP]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT:    call void @use(i8 [[UMAX_ACC]])
+; CHECK-NEXT:    [[UMAX]] = call i8 @llvm.umax.i8(i8 [[UMAX_ACC]], i8 [[B]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw i8 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret i8 [[UMAX]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i8 [ %iv.next, %loop ], [ 0, %entry ]
+  %umax.acc = phi i8 [ %umax, %loop ], [ %a, %entry ]
+  call void @use(i8 %umax.acc)
+  %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b)
+  %iv.next = add nuw i8 %iv, 1
+  %cmp = icmp ult i8 %iv.next, %n
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i8 %umax
+}
+
+define i8 @simple_recurrence_intrinsic_arg_loop_variant(i8 %n, i8 %a) {
+; CHECK-LABEL: define i8 @simple_recurrence_intrinsic_arg_loop_variant(
+; CHECK-SAME: i8 [[N:%.*]], i8 [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[UMAX_ACC:%.*]] = phi i8 [ [[UMAX:%.*]], %[[LOOP]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[B:%.*]] = xor i8 [[IV]], 42
+; CHECK-NEXT:    [[UMAX]] = call i8 @llvm.umax.i8(i8 [[UMAX_ACC]], i8 [[B]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw i8 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret i8 [[UMAX]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i8 [ %iv.next, %loop ], [ 0, %entry ]
+  %umax.acc = phi i8 [ %umax, %loop ], [ %a, %entry ]
+  %b = xor i8 %iv, 42
+  %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b)
+  %iv.next = add nuw i8 %iv, 1
+  %cmp = icmp ult i8 %iv.next, %n
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i8 %umax
+}
+
+declare void @use(i8)
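
(To reproduce the checks locally from a configured build tree, the new test can be run with lit, e.g. build/bin/llvm-lit llvm/test/Transforms/InstCombine/recurrence-binary-intrinsic.ll, or by replaying the RUN line above through opt and FileCheck directly; the build/bin path is illustrative.)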

Contributor

Negative test with an invalid intrinsic?

antoniofrighetto (Contributor, Author)

Added a test with copysign, thanks (I think we could handle that too, but I wouldn't expect to see a copysign in a recurrence?).

Contributor

I'd suggest using something like uadd.sat or usub.sat, where the transform would be invalid because the intrinsic is not idempotent (in the sense that f(f(x, y), y) != f(x, y)).
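
To make that point concrete, here is a minimal C++ sketch (helper and values are illustrative, not taken from the PR): umax stabilizes after one application of the loop-invariant operand, while a saturating add keeps absorbing it.

```
#include <algorithm>
#include <cassert>
#include <cstdint>

// 8-bit saturating add, mirroring @llvm.uadd.sat.i8.
static uint8_t uadd_sat(uint8_t x, uint8_t y) {
  unsigned Sum = unsigned(x) + unsigned(y);
  return Sum > 0xFF ? 0xFF : uint8_t(Sum);
}

int main() {
  uint8_t X = 7, Y = 9;
  // umax: f(f(x, y), y) == f(x, y), so the recurrence collapses to one call.
  assert(std::max(std::max(X, Y), Y) == std::max(X, Y));
  // uadd.sat: f(f(x, y), y) == 25 while f(x, y) == 16, so folding is invalid.
  assert(uadd_sat(uadd_sat(X, Y), Y) != uadd_sat(X, Y));
  return 0;
}
```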

antoniofrighetto (Contributor, Author)

That was indeed a poor negative test, updated, thanks.

nikic (Contributor) left a comment

LGTM

dtcxzyw (Member) left a comment

LG

Member commented on lines +1563 to +1576

Suggested change:

-  auto *InvariantBinaryInst =
-      IC.Builder.CreateBinaryIntrinsic(IID, Init, OtherOp);
-  if (isa<FPMathOperator>(InvariantBinaryInst))
-    cast<Instruction>(InvariantBinaryInst)->copyFastMathFlags(II);
+  auto *InvariantBinaryInst =
+      IC.Builder.CreateBinaryIntrinsic(IID, Init, OtherOp, /*FMFSource=*/II);

antoniofrighetto (Contributor, Author)

That's unfortunately not possible: the assertion `isa<FPMathOperator>(this) && "getting fast-math flag on invalid op"` fires on non-fast-math binary intrinsics. Perhaps we may want to change this in IRBuilder intrinsic creation in the future.

Member

Use dyn_cast<FPMathOperator>(II)?

antoniofrighetto (Contributor, Author)

In this case, I think you'd need another dyn_cast<Instruction>(InvariantBinaryInst) to check that CreateBinaryIntrinsic didn't constant-fold, as copyFastMathFlags is private to FPMathOperator. I think the current one may be cleaner.
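
For reference, a rough fragment of the dyn_cast variant being discussed (illustrative only, in the context of the patch; the committed code keeps the cast<IntrinsicInst> form):

```
// Check the source intrinsic via FPMathOperator, then confirm the builder
// actually produced an instruction (rather than a folded constant) before
// copying the fast-math flags across.
Value *Folded = IC.Builder.CreateBinaryIntrinsic(IID, Init, OtherOp);
if (auto *FMFSrc = dyn_cast<FPMathOperator>(II))
  if (auto *FoldedInst = dyn_cast<Instruction>(Folded))
    FoldedInst->copyFastMathFlags(FMFSrc->getFastMathFlags());
```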

nikic (Contributor) commented Jul 28, 2025

Hm, I'm curious whether we would get this optimization "for free" if we called foldOpIntoPhi for intrinsics.

antoniofrighetto (Contributor, Author)

> Hm, I'm curious whether we would get this optimization "for free" if we called foldOpIntoPhi for intrinsics.

I tried pivoting the fold in the intrinsics visitor as follows:

PHINode *PN;
Value *Init, *OtherOp;
if (matchSimpleBinaryIntrinsicRecurrence(II, PN, Init, OtherOp) &&
    DT.dominates(OtherOp, PN))
  if (auto *Res = foldOpIntoPhi(*II, PN, true))
    return Res;

We correctly fold the one-use phi case; however, for the multi-use case we bail out in:

  if (!OneUse && !IdenticalUsers)
    return nullptr;

I may as well take a deeper look in a follow-up patch, if you'd like.

nikic (Contributor) commented Jul 29, 2025

@antoniofrighetto What I had in mind is something more general along these lines: #151115 (I took the liberty of pushing your tests to main so that shows the diff.) This doesn't do any recurrence check upfront.

You're right that this can't handle the multi-use case. The question then would be whether handling multi-use is worth a dedicated fold. (Though possibly we could extend foldOpIntoPhi to handle this specific multi-use case, not sure...)

antoniofrighetto (Contributor, Author)

@nikic Oh, I see that now, thanks; that indeed looks like a nice extension. Both the recurrence helper and foldOpIntoPhi seem like cheap checks, so I assume the choice here mostly boils down to having less code around. If we go with your approach, I may take a look into supporting the specific multi-use case (the second run on the benchmark suite shows some nice little improvements with the multi-use phi, suggesting it could indeed be worthwhile to have). My only thought is that foldBinaryIntrinsicRecurrence doesn't look particularly invasive and, considering we already have some matchSimpleRecurrence folds around, it might make sense to have a dedicated fold for intrinsic recurrences as well.

antoniofrighetto (Contributor, Author)

> The question then would be whether handling multi-use is worth a dedicated fold. (Though possibly we could extend foldOpIntoPhi to handle this specific multi-use case, not sure...)

Comparing the results of the first run dtcxzyw/llvm-opt-benchmark#2596 (without multi-use phi) with those in #151115, it seems there are four test cases that were not folded via foldOpIntoPhi. Wouldn't this imply that #151115 does not entirely subsume this fold?

nikic (Contributor) commented Jul 29, 2025

@antoniofrighetto The diff only shows a subset of changed files due to GitHub limitations. It's hard to say whether the transform did not happen or whether it's just missing from the diff. We'd need to run one on top of the other to tell reliably.

antoniofrighetto (Contributor, Author)

> @antoniofrighetto The diff only shows a subset of changed files due to GitHub limitations. It's hard to say whether the transform did not happen or whether it's just missing from the diff. We'd need to run one on top of the other to tell reliably.

The diff should be a subset, though; I thought the list here dtcxzyw/llvm-opt-benchmark#2616 (comment) would encompass all the changes.

nikic added a commit that referenced this pull request Jul 31, 2025
Call foldOpIntoPhi() for speculatable intrinsics. We already do this for
FoldOpIntoSelect().

Among other things, this partially subsumes
#149858.
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Jul 31, 2025
Call foldOpIntoPhi() for speculatable intrinsics. We already do this for
FoldOpIntoSelect().

Among other things, this partially subsumes
llvm/llvm-project#149858.
nikic (Contributor) commented Jul 31, 2025

Can you please rebase over #151115? Then we can get a clean diff from llvm-opt-benchmark.

antoniofrighetto force-pushed the feature/ic-fold-intrinsic-recurrence branch 2 times, most recently from a627732 to 76b5d50 on July 31, 2025 13:00
antoniofrighetto (Contributor, Author)

From the diff, it looks like the fold got triggered only with the multi-use phi (though this fold handles fmaximumnum/fminimumnum too). Could it make sense to root this after foldOpIntoPhi, guarded by !PN->hasOneUse()?

I took a look at extending foldOpIntoPhi; though, if I didn't miss anything, it didn't look that promising unless we pass an extra, say, IdempotentBinaryIntrinsic boolean, or include the fold inside if (!OneUse && !IdenticalUsers) (neither of which looks particularly elegant to me).

antoniofrighetto (Contributor, Author)

I'm assuming this patch is still fine to merge as it is.

antoniofrighetto force-pushed the feature/ic-fold-intrinsic-recurrence branch from dfd2ab9 to e977b28 on August 8, 2025 07:32
antoniofrighetto merged commit e977b28 into llvm:main Aug 8, 2025
5 of 8 checks passed


Development

Successfully merging this pull request may close these issues.

[SCEV] Repeated max not hoisted out of loop
