Skip to content

Commit 2d5f07c

Browse files
[InstCombine] Fold X udiv Y to X lshr cttz(Y) if Y is a power of 2 (#121386)
Fixes #115767. This PR folds `X udiv Y` to `X lshr cttz(Y)` if Y is a power of two, since bitwise operations are faster than division. Proof: https://alive2.llvm.org/ce/z/qHmLta
1 parent 24bd9bc commit 2d5f07c

File tree

3 files changed

+130
-8
lines changed

3 files changed

+130
-8
lines changed

llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1598,8 +1598,23 @@ Instruction *InstCombinerImpl::visitUDiv(BinaryOperator &I) {
15981598
return Lshr;
15991599
}
16001600

1601-
// Op1 udiv Op2 -> Op1 lshr log2(Op2), if log2() folds away.
1602-
if (Value *Res = tryGetLog2(Op1, /*AssumeNonZero=*/true))
1601+
// Returns a shift amount S such that `Op0 udiv Denom` can be emitted as
// `Op0 lshr S`, or nullptr when neither of the folds below applies.
auto GetShiftableDenom = [&](Value *Denom) -> Value * {
  // Op0 udiv Denom -> Op0 lshr log2(Denom), if log2() folds away.
  // Query the Denom parameter (not the captured Op1) so that both folds in
  // this helper inspect the same value.
  if (Value *Log2 = tryGetLog2(Denom, /*AssumeNonZero=*/true))
    return Log2;

  // Op0 udiv Denom -> Op0 lshr cttz(Denom), if Denom is a power of 2.
  if (isKnownToBeAPowerOfTwo(Denom, /*OrZero=*/true, /*Depth=*/0, &I))
    // This will increase instruction count but it's okay
    // since bitwise operations are substantially faster than
    // division.
    return Builder.CreateBinaryIntrinsic(Intrinsic::cttz, Denom,
                                         Builder.getTrue());

  // No shift-based replacement is available for this denominator.
  return nullptr;
};
1616+
1617+
if (auto *Res = GetShiftableDenom(Op1))
16031618
return replaceInstUsesWith(
16041619
I, Builder.CreateLShr(Op0, Res, I.getName(), I.isExact()));
16051620

llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-value.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,8 @@ define i32 @vscale_slt_with_vp_umin(ptr nocapture %A, i32 %n) mustprogress vscal
218218
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
219219
; CHECK: for.end:
220220
; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[N]], -1
221-
; CHECK-NEXT: [[TMP1:%.*]] = udiv i32 [[TMP0]], [[VF]]
221+
; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 2, 33) i32 @llvm.cttz.i32(i32 [[VF]], i1 true)
222+
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], [[TMP5]]
222223
; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], [[VSCALE]]
223224
; CHECK-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 2
224225
; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[N]], [[TMP3]]
@@ -270,7 +271,8 @@ define i32 @vscale_slt_with_vp_umin2(ptr nocapture %A, i32 %n) mustprogress vsca
270271
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
271272
; CHECK: for.end:
272273
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
273-
; CHECK-NEXT: [[TMP1:%.*]] = udiv i32 [[TMP0]], [[VF]]
274+
; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 2, 33) i32 @llvm.cttz.i32(i32 [[VF]], i1 true)
275+
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], [[TMP5]]
274276
; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], [[VSCALE]]
275277
; CHECK-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 2
276278
; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[N]], [[TMP3]]

llvm/test/Transforms/InstCombine/div-shift.ll

Lines changed: 109 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,8 @@ define i8 @udiv_umin_extra_use(i8 %x, i8 %y, i8 %z) {
148148
; CHECK-NEXT: [[Z2:%.*]] = shl nuw i8 1, [[Z:%.*]]
149149
; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.umin.i8(i8 [[Y2]], i8 [[Z2]])
150150
; CHECK-NEXT: call void @use(i8 [[M]])
151-
; CHECK-NEXT: [[D:%.*]] = udiv i8 [[X:%.*]], [[M]]
151+
; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[M]], i1 true)
152+
; CHECK-NEXT: [[D:%.*]] = lshr i8 [[X:%.*]], [[TMP1]]
152153
; CHECK-NEXT: ret i8 [[D]]
153154
;
154155
%y2 = shl i8 1, %y
@@ -165,7 +166,8 @@ define i8 @udiv_smin(i8 %x, i8 %y, i8 %z) {
165166
; CHECK-NEXT: [[Y2:%.*]] = shl nuw i8 1, [[Y:%.*]]
166167
; CHECK-NEXT: [[Z2:%.*]] = shl nuw i8 1, [[Z:%.*]]
167168
; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.smin.i8(i8 [[Y2]], i8 [[Z2]])
168-
; CHECK-NEXT: [[D:%.*]] = udiv i8 [[X:%.*]], [[M]]
169+
; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[M]], i1 true)
170+
; CHECK-NEXT: [[D:%.*]] = lshr i8 [[X:%.*]], [[TMP1]]
169171
; CHECK-NEXT: ret i8 [[D]]
170172
;
171173
%y2 = shl i8 1, %y
@@ -181,7 +183,8 @@ define i8 @udiv_smax(i8 %x, i8 %y, i8 %z) {
181183
; CHECK-NEXT: [[Y2:%.*]] = shl nuw i8 1, [[Y:%.*]]
182184
; CHECK-NEXT: [[Z2:%.*]] = shl nuw i8 1, [[Z:%.*]]
183185
; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.smax.i8(i8 [[Y2]], i8 [[Z2]])
184-
; CHECK-NEXT: [[D:%.*]] = udiv i8 [[X:%.*]], [[M]]
186+
; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[M]], i1 true)
187+
; CHECK-NEXT: [[D:%.*]] = lshr i8 [[X:%.*]], [[TMP1]]
185188
; CHECK-NEXT: ret i8 [[D]]
186189
;
187190
%y2 = shl i8 1, %y
@@ -1006,7 +1009,8 @@ define i8 @udiv_fail_shl_overflow(i8 %x, i8 %y) {
10061009
; CHECK-LABEL: @udiv_fail_shl_overflow(
10071010
; CHECK-NEXT: [[SHL:%.*]] = shl i8 2, [[Y:%.*]]
10081011
; CHECK-NEXT: [[MIN:%.*]] = call i8 @llvm.umax.i8(i8 [[SHL]], i8 1)
1009-
; CHECK-NEXT: [[MUL:%.*]] = udiv i8 [[X:%.*]], [[MIN]]
1012+
; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[MIN]], i1 true)
1013+
; CHECK-NEXT: [[MUL:%.*]] = lshr i8 [[X:%.*]], [[TMP1]]
10101014
; CHECK-NEXT: ret i8 [[MUL]]
10111015
;
10121016
%shl = shl i8 2, %y
@@ -1294,3 +1298,104 @@ entry:
12941298
%div = sdiv i32 %add, %add2
12951299
ret i32 %div
12961300
}
1301+
1302+
; The udiv in %bb1 is only reached when ctpop(%y) == 1 (the branch condition in
; %start), so the expected output replaces it with an lshr by cttz(%y).
define i8 @udiv_if_power_of_two(i8 %x, i8 %y) {
1303+
; CHECK-LABEL: @udiv_if_power_of_two(
1304+
; CHECK-NEXT: start:
1305+
; CHECK-NEXT: [[TMP0:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[Y:%.*]])
1306+
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 1
1307+
; CHECK-NEXT: br i1 [[TMP1]], label [[BB1:%.*]], label [[BB3:%.*]]
1308+
; CHECK: bb1:
1309+
; CHECK-NEXT: [[TMP2:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[Y]], i1 true)
1310+
; CHECK-NEXT: [[TMP3:%.*]] = lshr i8 [[X:%.*]], [[TMP2]]
1311+
; CHECK-NEXT: br label [[BB3]]
1312+
; CHECK: bb3:
1313+
; CHECK-NEXT: [[_0_SROA_0_0:%.*]] = phi i8 [ [[TMP3]], [[BB1]] ], [ 0, [[START:%.*]] ]
1314+
; CHECK-NEXT: ret i8 [[_0_SROA_0_0]]
1315+
;
1316+
start:
1317+
%ctpop = tail call i8 @llvm.ctpop.i8(i8 %y)
1318+
%cmp = icmp eq i8 %ctpop, 1
1319+
br i1 %cmp, label %bb1, label %bb3
1320+
1321+
bb1:
1322+
%div = udiv i8 %x, %y
1323+
br label %bb3
1324+
1325+
bb3:
1326+
%result = phi i8 [ %div, %bb1 ], [ 0, %start ]
1327+
ret i8 %result
1328+
}
1329+
1330+
; An llvm.assume of ctpop(%y) == 1 precedes the udiv. The expected output is an
; lshr by cttz(%y), and the `exact` flag of the udiv appears on the lshr.
define i8 @udiv_exact_assume_power_of_two(i8 %x, i8 %y) {
1331+
; CHECK-LABEL: @udiv_exact_assume_power_of_two(
1332+
; CHECK-NEXT: start:
1333+
; CHECK-NEXT: [[TMP0:%.*]] = tail call range(i8 1, 9) i8 @llvm.ctpop.i8(i8 [[Y:%.*]])
1334+
; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[TMP0]], 1
1335+
; CHECK-NEXT: tail call void @llvm.assume(i1 [[COND]])
1336+
; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[Y]], i1 true)
1337+
; CHECK-NEXT: [[_0:%.*]] = lshr exact i8 [[X:%.*]], [[TMP1]]
1338+
; CHECK-NEXT: ret i8 [[_0]]
1339+
;
1340+
start:
1341+
%ctpop = tail call i8 @llvm.ctpop.i8(i8 %y)
1342+
%cond = icmp eq i8 %ctpop, 1
1343+
tail call void @llvm.assume(i1 %cond)
1344+
%div = udiv exact i8 %x, %y
1345+
ret i8 %div
1346+
}
1347+
1348+
; Same assume-based pattern (ctpop(%y) == 1) on the i7 integer type; the
; expected output still rewrites the udiv to an lshr by cttz(%y).
define i7 @udiv_assume_power_of_two_illegal_type(i7 %x, i7 %y) {
1349+
; CHECK-LABEL: @udiv_assume_power_of_two_illegal_type(
1350+
; CHECK-NEXT: start:
1351+
; CHECK-NEXT: [[TMP0:%.*]] = tail call range(i7 1, 8) i7 @llvm.ctpop.i7(i7 [[Y:%.*]])
1352+
; CHECK-NEXT: [[COND:%.*]] = icmp eq i7 [[TMP0]], 1
1353+
; CHECK-NEXT: tail call void @llvm.assume(i1 [[COND]])
1354+
; CHECK-NEXT: [[TMP1:%.*]] = call range(i7 0, 8) i7 @llvm.cttz.i7(i7 [[Y]], i1 true)
1355+
; CHECK-NEXT: [[_0:%.*]] = lshr i7 [[X:%.*]], [[TMP1]]
1356+
; CHECK-NEXT: ret i7 [[_0]]
1357+
;
1358+
start:
1359+
%ctpop = tail call i7 @llvm.ctpop.i7(i7 %y)
1360+
%cond = icmp eq i7 %ctpop, 1
1361+
tail call void @llvm.assume(i1 %cond)
1362+
%div = udiv i7 %x, %y
1363+
ret i7 %div
1364+
}
1365+
1366+
; Same assume-based pattern where the udiv result has two uses (the call to
; @use and the ret); the expected output still rewrites it to cttz + lshr.
define i8 @udiv_assume_power_of_two_multiuse(i8 %x, i8 %y) {
1367+
; CHECK-LABEL: @udiv_assume_power_of_two_multiuse(
1368+
; CHECK-NEXT: start:
1369+
; CHECK-NEXT: [[TMP0:%.*]] = tail call range(i8 1, 9) i8 @llvm.ctpop.i8(i8 [[Y:%.*]])
1370+
; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[TMP0]], 1
1371+
; CHECK-NEXT: tail call void @llvm.assume(i1 [[COND]])
1372+
; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.cttz.i8(i8 [[Y]], i1 true)
1373+
; CHECK-NEXT: [[_0:%.*]] = lshr i8 [[X:%.*]], [[TMP1]]
1374+
; CHECK-NEXT: call void @use(i8 [[_0]])
1375+
; CHECK-NEXT: ret i8 [[_0]]
1376+
;
1377+
start:
1378+
%ctpop = tail call i8 @llvm.ctpop.i8(i8 %y)
1379+
%cond = icmp eq i8 %ctpop, 1
1380+
tail call void @llvm.assume(i1 %cond)
1381+
%div = udiv i8 %x, %y
1382+
call void @use(i8 %div)
1383+
ret i8 %div
1384+
}
1385+
1386+
; Negative test: the assume constrains ctpop(%z), while the divisor is %y, so
; the expected output keeps the udiv unchanged.
define i8 @udiv_power_of_two_negative(i8 %x, i8 %y, i8 %z) {
1387+
; CHECK-LABEL: @udiv_power_of_two_negative(
1388+
; CHECK-NEXT: start:
1389+
; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[Z:%.*]])
1390+
; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[CTPOP]], 1
1391+
; CHECK-NEXT: tail call void @llvm.assume(i1 [[COND]])
1392+
; CHECK-NEXT: [[_0:%.*]] = udiv i8 [[X:%.*]], [[Y:%.*]]
1393+
; CHECK-NEXT: ret i8 [[_0]]
1394+
;
1395+
start:
1396+
%ctpop = tail call i8 @llvm.ctpop.i8(i8 %z)
1397+
%cond = icmp eq i8 %ctpop, 1
1398+
tail call void @llvm.assume(i1 %cond)
1399+
%div = udiv i8 %x, %y
1400+
ret i8 %div
1401+
}

0 commit comments

Comments
 (0)