
[InstCombine] Fold X udiv Y to X lshr cttz(Y) if Y is a power of 2 #121386


Merged
merged 2 commits into llvm:main on Jan 11, 2025

Conversation

veera-sivarajan (Contributor)

Fixes #115767

This PR folds X udiv Y to X lshr cttz(Y) if Y is a power of two since
bitwise operations are faster than division.

Proof: https://alive2.llvm.org/ce/z/qHmLta
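
As a minimal illustration of the fold (a sketch closely mirroring the assume-based tests added below, not part of the patch itself), with the power-of-two fact established by an assume on ctpop:

  declare i8 @llvm.ctpop.i8(i8)
  declare i8 @llvm.cttz.i8(i8, i1)
  declare void @llvm.assume(i1)

  ; Before: %y is known (via the assume) to have exactly one bit set.
  define i8 @src(i8 %x, i8 %y) {
    %pop = call i8 @llvm.ctpop.i8(i8 %y)
    %cond = icmp eq i8 %pop, 1
    call void @llvm.assume(i1 %cond)
    %d = udiv i8 %x, %y
    ret i8 %d
  }

  ; After: the division becomes a shift by the trailing-zero count.
  ; cttz is emitted with is_zero_poison=true, which is sound here:
  ; if %y were 0, the original udiv would be immediate UB anyway.
  define i8 @tgt(i8 %x, i8 %y) {
    %pop = call i8 @llvm.ctpop.i8(i8 %y)
    %cond = icmp eq i8 %pop, 1
    call void @llvm.assume(i1 %cond)
    %tz = call i8 @llvm.cttz.i8(i8 %y, i1 true)
    %d = lshr i8 %x, %tz
    ret i8 %d
  }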

@llvmbot (Member) commented Dec 31, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Veera (veera-sivarajan)

Changes

Fixes #115767

This PR folds X udiv Y to X lshr cttz(Y) if Y is a power of two since
bitwise operations are faster than division.

Proof: https://alive2.llvm.org/ce/z/qHmLta


Full diff: https://github.com/llvm/llvm-project/pull/121386.diff

3 Files Affected:

  • (modified) llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp (+10)
  • (modified) llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll (+4-2)
  • (modified) llvm/test/Transforms/InstCombine/div-shift.ll (+105-4)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index f85a3c93651353..00779fe5fa2ee1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -1632,6 +1632,16 @@ Instruction *InstCombinerImpl::visitUDiv(BinaryOperator &I) {
         I, Builder.CreateLShr(Op0, Res, I.getName(), I.isExact()));
   }
 
+  // Op0 udiv Op1 -> Op0 lshr cttz(Op1), if Op1 is a power of 2.
+  if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, /*Depth*/ 0, &I)) {
+    // This will increase instruction count but it's okay
+    // since bitwise operations are substantially faster than
+    // division.
+    auto *Cttz =
+        Builder.CreateBinaryIntrinsic(Intrinsic::cttz, Op1, Builder.getTrue());
+    return BinaryOperator::CreateLShr(Op0, Cttz);
+  }
+
   return nullptr;
 }
 
diff --git a/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll b/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll
index 1956f454a52bbf..fa47d06d859e97 100644
--- a/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll
+++ b/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll
@@ -218,7 +218,8 @@ define i32 @vscale_slt_with_vp_umin(ptr nocapture %A, i32 %n) mustprogress vscal
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i32 [[N]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = udiv i32 [[TMP0]], [[VF]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call range(i32 2, 33) i32 @llvm.cttz.i32(i32 [[VF]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[TMP0]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP1]], [[VSCALE]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[N]], [[TMP3]]
@@ -270,7 +271,8 @@ define i32 @vscale_slt_with_vp_umin2(ptr nocapture %A, i32 %n) mustprogress vsca
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = udiv i32 [[TMP0]], [[VF]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call range(i32 2, 33) i32 @llvm.cttz.i32(i32 [[VF]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[TMP0]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP1]], [[VSCALE]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[N]], [[TMP3]]
diff --git a/llvm/test/Transforms/InstCombine/div-shift.ll b/llvm/test/Transforms/InstCombine/div-shift.ll
index 8dd6d4a2e83712..005daed087c169 100644
--- a/llvm/test/Transforms/InstCombine/div-shift.ll
+++ b/llvm/test/Transforms/InstCombine/div-shift.ll
@@ -148,7 +148,8 @@ define i8 @udiv_umin_extra_use(i8 %x, i8 %y, i8 %z) {
 ; CHECK-NEXT:    [[Z2:%.*]] = shl nuw i8 1, [[Z:%.*]]
 ; CHECK-NEXT:    [[M:%.*]] = call i8 @llvm.umin.i8(i8 [[Y2]], i8 [[Z2]])
 ; CHECK-NEXT:    call void @use(i8 [[M]])
-; CHECK-NEXT:    [[D:%.*]] = udiv i8 [[X:%.*]], [[M]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[M]], i1 true)
+; CHECK-NEXT:    [[D:%.*]] = lshr i8 [[X:%.*]], [[TMP1]]
 ; CHECK-NEXT:    ret i8 [[D]]
 ;
   %y2 = shl i8 1, %y
@@ -165,7 +166,8 @@ define i8 @udiv_smin(i8 %x, i8 %y, i8 %z) {
 ; CHECK-NEXT:    [[Y2:%.*]] = shl nuw i8 1, [[Y:%.*]]
 ; CHECK-NEXT:    [[Z2:%.*]] = shl nuw i8 1, [[Z:%.*]]
 ; CHECK-NEXT:    [[M:%.*]] = call i8 @llvm.smin.i8(i8 [[Y2]], i8 [[Z2]])
-; CHECK-NEXT:    [[D:%.*]] = udiv i8 [[X:%.*]], [[M]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[M]], i1 true)
+; CHECK-NEXT:    [[D:%.*]] = lshr i8 [[X:%.*]], [[TMP1]]
 ; CHECK-NEXT:    ret i8 [[D]]
 ;
   %y2 = shl i8 1, %y
@@ -181,7 +183,8 @@ define i8 @udiv_smax(i8 %x, i8 %y, i8 %z) {
 ; CHECK-NEXT:    [[Y2:%.*]] = shl nuw i8 1, [[Y:%.*]]
 ; CHECK-NEXT:    [[Z2:%.*]] = shl nuw i8 1, [[Z:%.*]]
 ; CHECK-NEXT:    [[M:%.*]] = call i8 @llvm.smax.i8(i8 [[Y2]], i8 [[Z2]])
-; CHECK-NEXT:    [[D:%.*]] = udiv i8 [[X:%.*]], [[M]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[M]], i1 true)
+; CHECK-NEXT:    [[D:%.*]] = lshr i8 [[X:%.*]], [[TMP1]]
 ; CHECK-NEXT:    ret i8 [[D]]
 ;
   %y2 = shl i8 1, %y
@@ -1006,7 +1009,8 @@ define i8 @udiv_fail_shl_overflow(i8 %x, i8 %y) {
 ; CHECK-LABEL: @udiv_fail_shl_overflow(
 ; CHECK-NEXT:    [[SHL:%.*]] = shl i8 2, [[Y:%.*]]
 ; CHECK-NEXT:    [[MIN:%.*]] = call i8 @llvm.umax.i8(i8 [[SHL]], i8 1)
-; CHECK-NEXT:    [[MUL:%.*]] = udiv i8 [[X:%.*]], [[MIN]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[MIN]], i1 true)
+; CHECK-NEXT:    [[MUL:%.*]] = lshr i8 [[X:%.*]], [[TMP1]]
 ; CHECK-NEXT:    ret i8 [[MUL]]
 ;
   %shl = shl i8 2, %y
@@ -1294,3 +1298,100 @@ entry:
   %div = sdiv i32 %add, %add2
   ret i32 %div
 }
+
+define i8 @udiv_if_power_of_two(i8 %x, i8 %y) {
+; CHECK-LABEL: @udiv_if_power_of_two(
+; CHECK-NEXT:  start:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[Y:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label [[BB1:%.*]], label [[BB3:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP2:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[Y]], i1 true)
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i8 [[X:%.*]], [[TMP2]]
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[_0_SROA_0_0:%.*]] = phi i8 [ [[TMP3]], [[BB1]] ], [ 0, [[START:%.*]] ]
+; CHECK-NEXT:    ret i8 [[_0_SROA_0_0]]
+;
+start:
+  %0 = tail call i8 @llvm.ctpop.i8(i8 %y)
+  %1 = icmp eq i8 %0, 1
+  br i1 %1, label %bb1, label %bb3
+
+bb1:
+  %2 = udiv i8 %x, %y
+  br label %bb3
+
+bb3:
+  %_0.sroa.0.0 = phi i8 [ %2, %bb1 ], [ 0, %start ]
+  ret i8 %_0.sroa.0.0
+}
+
+define i8 @udiv_exact_assume_power_of_two(i8 %x, i8 %y) {
+; CHECK-LABEL: @udiv_exact_assume_power_of_two(
+; CHECK-NEXT:  start:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call range(i8 1, 9) i8 @llvm.ctpop.i8(i8 [[Y:%.*]])
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[TMP0]], 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[Y]], i1 true)
+; CHECK-NEXT:    [[_0:%.*]] = lshr i8 [[X:%.*]], [[TMP1]]
+; CHECK-NEXT:    ret i8 [[_0]]
+;
+start:
+  %0 = tail call i8 @llvm.ctpop.i8(i8 %y)
+  %cond = icmp eq i8 %0, 1
+  tail call void @llvm.assume(i1 %cond)
+  %_0 = udiv exact i8 %x, %y
+  ret i8 %_0
+}
+
+define i7 @udiv_assume_power_of_two_illegal_type(i7 %x, i7 %y) {
+; CHECK-LABEL: @udiv_assume_power_of_two_illegal_type(
+; CHECK-NEXT:  start:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call range(i7 1, 8) i7 @llvm.ctpop.i7(i7 [[Y:%.*]])
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i7 [[TMP0]], 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call range(i7 0, 8) i7 @llvm.cttz.i7(i7 [[Y]], i1 true)
+; CHECK-NEXT:    [[_0:%.*]] = lshr i7 [[X:%.*]], [[TMP1]]
+; CHECK-NEXT:    ret i7 [[_0]]
+;
+start:
+  %0 = tail call i7 @llvm.ctpop.i7(i7 %y)
+  %cond = icmp eq i7 %0, 1
+  tail call void @llvm.assume(i1 %cond)
+  %_0 = udiv i7 %x, %y
+  ret i7 %_0
+}
+
+define i8 @udiv_assume_power_of_two_multiuse(i8 %x, i8 %y) {
+; CHECK-LABEL: @udiv_assume_power_of_two_multiuse(
+; CHECK-NEXT:  start:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call range(i8 1, 9) i8 @llvm.ctpop.i8(i8 [[Y:%.*]])
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[TMP0]], 1
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[Y]], i1 true)
+; CHECK-NEXT:    [[_0:%.*]] = lshr i8 [[X:%.*]], [[TMP1]]
+; CHECK-NEXT:    call void @use(i8 [[_0]])
+; CHECK-NEXT:    ret i8 [[_0]]
+;
+start:
+  %0 = tail call i8 @llvm.ctpop.i8(i8 %y)
+  %cond = icmp eq i8 %0, 1
+  tail call void @llvm.assume(i1 %cond)
+  %_0 = udiv i8 %x, %y
+  call void @use(i8 %_0)
+  ret i8 %_0
+}
+
+define i8 @udiv_power_of_two_negative(i8 %x, i8 %y) {
+; CHECK-LABEL: @udiv_power_of_two_negative(
+; CHECK-NEXT:  start:
+; CHECK-NEXT:    [[_0:%.*]] = udiv i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i8 [[_0]]
+;
+start:
+  %0 = tail call i8 @llvm.ctpop.i8(i8 %y)
+  %cond = icmp eq i8 %0, 1
+  %_0 = udiv i8 %x, %y
+  ret i8 %_0
+}

veera-sivarajan changed the title from "[InstCombine] Fold X udiv Y to 'X lshr cttz(Y)` if Y is a power of 2" to "[InstCombine] Fold X udiv Y to X lshr cttz(Y) if Y is a power of 2" on Dec 31, 2024
@dtcxzyw (Member) left a comment

The IR diff looks good. It also eliminates some redundant llvm.umul.with.overflow calls :)

@dtcxzyw (Member) left a comment

LGTM. Thank you!

@veera-sivarajan (Contributor, Author)

Please merge it for me. I don't have commit access yet.

@nikic (Contributor) left a comment

It would probably make sense to add a takeLog2 based fold for cttz as a follow up. I think that would recover the regression from dtcxzyw/llvm-opt-benchmark#1908 (comment).

@goldsteinn (Contributor)

> It would probably make sense to add a takeLog2 based fold for cttz as a follow up. I think that would recover the regression from dtcxzyw/llvm-opt-benchmark#1908 (comment).

I think we would probably need a flag for that, or maybe a wrapping function. It makes sense to create cttz to avoid a div, but probably not a mul.

@nikic (Contributor) commented Jan 10, 2025

> I think we would probably need a flag for that, or maybe a wrapping function. It makes sense to create cttz to avoid a div, but probably not a mul.

What I meant is to optimize cttz using takeLog2, not make takeLog2 emit cttz.
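
A minimal sketch of what such a fold would do, assuming takeLog2 recognizes the shl pattern (illustrative only, not the eventual patch):

  declare i8 @llvm.cttz.i8(i8, i1)

  define i8 @cttz_of_power_of_two(i8 %n) {
    %p  = shl nuw i8 1, %n
    %tz = call i8 @llvm.cttz.i8(i8 %p, i1 true)
    ; log2(1 << %n) == %n, so takeLog2 could fold %tz to %n, turning the
    ; lshr/cttz pair introduced by this patch back into a plain lshr by %n.
    ret i8 %tz
  }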

@goldsteinn (Contributor)

> What I meant is to optimize cttz using takeLog2, not make takeLog2 emit cttz.

Ahh, yeah that makes sense.

@goldsteinn (Contributor)

I'll post a patch for that shortly.

@goldsteinn (Contributor)

LGTM

@dtcxzyw (Member) commented Jan 11, 2025

Conflicting files
llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp

Please rebase. Then I will merge this patch :)

@veera-sivarajan (Contributor, Author)

done, thanks!

dtcxzyw merged commit 2d5f07c into llvm:main on Jan 11, 2025
6 of 8 checks passed
BaiXilin pushed a commit to BaiXilin/llvm-fix-vnni-instr-types that referenced this pull request Jan 12, 2025
[InstCombine] Fold X udiv Y to X lshr cttz(Y) if Y is a power of 2 (llvm#121386)

Fixes llvm#115767

This PR folds `X udiv Y` to `X lshr cttz(Y)` if Y is a power of two
since bitwise operations are faster than division.

Proof: https://alive2.llvm.org/ce/z/qHmLta
davemgreen added a commit to davemgreen/llvm-project that referenced this pull request Feb 9, 2025
This attempts to fix a regression in code that performs `svcntb() / svcntw()`
(which is just a constant). https://godbolt.org/z/4o3a67s6n. We would previously
expand the svcnt into two different vscale intrinsics, CSE them in a later pass,
and then fold the udiv of shifts into a constant in a second instcombine run.

After llvm#121386 we now introduce a cttz. This patch just adds an additional check
for vscale to the div of shift fold, allowing it to happen earlier and avoiding
the need to look through the awkward (but probably not impossible) cttz that
was introduced.
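
A sketch of the regressing pattern, assuming the shifts are known not to overflow (e.g. marked nuw, as the vscale_range attribute on such code would let the compiler prove):

  declare i64 @llvm.vscale.i64()

  define i64 @cntb_div_cntw() {
    %vs   = call i64 @llvm.vscale.i64()
    %cntb = shl nuw i64 %vs, 4      ; svcntb() is vscale * 16
    %cntw = shl nuw i64 %vs, 2      ; svcntw() is vscale * 4
    ; previously constant-folded to 4; after llvm#121386 the udiv became
    ; lshr i64 %cntb, cttz(%cntw), and the non-constant cttz blocked the fold
    %r    = udiv i64 %cntb, %cntw
    ret i64 %r
  }
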
MDevereau added a commit to MDevereau/llvm-project that referenced this pull request Feb 13, 2025
llvm#121386 introduced cttz intrinsics, which caused a regression where
vscale/vscale divisions could no longer be constant folded.

This fold was suggested as a fix in llvm#126411.
MDevereau added a commit that referenced this pull request Feb 18, 2025
#121386 introduced cttz intrinsics, which caused a regression where
vscale/vscale divisions could no longer be constant folded.

This fold was suggested as a fix in #126411.
https://alive2.llvm.org/ce/z/gWbtPw
wldfngrs pushed a commit to wldfngrs/llvm-project that referenced this pull request Feb 19, 2025
llvm#121386 introduced cttz intrinsics, which caused a regression where
vscale/vscale divisions could no longer be constant folded.

This fold was suggested as a fix in llvm#126411.
https://alive2.llvm.org/ce/z/gWbtPw

Successfully merging this pull request may close these issues: Failure to optimize udiv/urem when non-constant divisor is known power of two (#115767).