Skip to content

Conversation

mtrofin
Copy link
Member

@mtrofin mtrofin commented Oct 1, 2025

We cannot calculate the weights of the switch precisely, but we do know the probability of the default branch. We then split equally the remaining probability over the rest of the cases. If we did nothing, the static estimation could be considerably poorer.

Issue #147390

Base automatically changed from users/mtrofin/09-25-_simplifycfg_profcheck_fix_artificially-failing_preserve-branchweights.ll_ to main October 1, 2025 16:54
@mtrofin mtrofin force-pushed the users/mtrofin/09-30-_simplifycfg_profcheck_synthesize_profile_for_br_x_0_x_1_t_f1_-_switch branch 2 times, most recently from 2d42032 to 0029ab3 Compare October 1, 2025 17:41
@mtrofin mtrofin marked this pull request as ready for review October 1, 2025 17:47
@llvmbot
Copy link
Member

llvmbot commented Oct 1, 2025

@llvm/pr-subscribers-llvm-transforms

Author: Mircea Trofin (mtrofin)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/161549.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Utils/SimplifyCFG.cpp (+23-7)
  • (modified) llvm/test/Transforms/SimplifyCFG/switch_create.ll (+20-7)
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 4d1f768e2177a..df436d0f68028 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5148,14 +5148,18 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
   if (ExtraCase && Values.size() < 2)
     return false;
 
-  // TODO: Preserve branch weight metadata, similarly to how
-  // foldValueComparisonIntoPredecessors preserves it.
+  SmallVector<uint32_t> BranchWeights;
+  const bool HasProfile = !ProfcheckDisableMetadataFixes &&
+                          extractBranchWeights(*BI, BranchWeights);
 
   // Figure out which block is which destination.
   BasicBlock *DefaultBB = BI->getSuccessor(1);
   BasicBlock *EdgeBB = BI->getSuccessor(0);
-  if (!TrueWhenEqual)
+  if (!TrueWhenEqual) {
     std::swap(DefaultBB, EdgeBB);
+    if (HasProfile)
+      std::swap(BranchWeights[0], BranchWeights[1]);
+  }
 
   BasicBlock *BB = BI->getParent();
 
@@ -5186,10 +5190,11 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
     if (!isGuaranteedNotToBeUndefOrPoison(ExtraCase, AC, BI, nullptr))
       ExtraCase = Builder.CreateFreeze(ExtraCase);
 
-    if (TrueWhenEqual)
-      Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB);
-    else
-      Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
+    // We don't have any info about this condition.
+    auto *Br = TrueWhenEqual ? Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB)
+                             : Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
+    setExplicitlyUnknownBranchWeightsIfProfiled(*Br, *NewBB->getParent(),
+                                                DEBUG_TYPE);
 
     OldTI->eraseFromParent();
 
@@ -5216,6 +5221,17 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
 
   // Create the new switch instruction now.
   SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
+  if (HasProfile) {
+    // We know the weight of the default case. We don't know the weight of the
+    // other cases, but rather than completely loose profiling info, we split
+    // the remaining probability equally over them.
+    SmallVector<uint32_t> NewWeights(Values.size() + 1);
+    NewWeights[0] = BranchWeights[1]; // this is the default, and we swapped if
+                                      // TrueWhenEqual.
+    for (auto &V : drop_begin(NewWeights))
+      V = BranchWeights[0] / Values.size();
+    setBranchWeights(*New, NewWeights, /*IsExpected=*/false);
+  }
 
   // Add all of the 'cases' to the switch instruction.
   for (ConstantInt *Val : Values)
diff --git a/llvm/test/Transforms/SimplifyCFG/switch_create.ll b/llvm/test/Transforms/SimplifyCFG/switch_create.ll
index 18c4ade46162c..ef5aee68e268e 100644
--- a/llvm/test/Transforms/SimplifyCFG/switch_create.ll
+++ b/llvm/test/Transforms/SimplifyCFG/switch_create.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; RUN: opt -S -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -switch-range-to-icmp < %s | FileCheck %s
 ; RUN: opt -S -data-layout="p:32:32-p1:16:16" -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -switch-range-to-icmp < %s | FileCheck -check-prefix=CHECK -check-prefix=DL %s
 
@@ -6,12 +6,12 @@ declare void @foo1()
 
 declare void @foo2()
 
-define void @test1(i32 %V) {
+define void @test1(i32 %V) !prof !0 {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:    switch i32 [[V:%.*]], label [[F:%.*]] [
 ; CHECK-NEXT:      i32 17, label [[T:%.*]]
 ; CHECK-NEXT:      i32 4, label [[T]]
-; CHECK-NEXT:    ]
+; CHECK-NEXT:    ], !prof [[PROF1:![0-9]+]]
 ; CHECK:       common.ret:
 ; CHECK-NEXT:    ret void
 ; CHECK:       T:
@@ -24,7 +24,7 @@ define void @test1(i32 %V) {
   %C1 = icmp eq i32 %V, 4         ; <i1> [#uses=1]
   %C2 = icmp eq i32 %V, 17                ; <i1> [#uses=1]
   %CN = or i1 %C1, %C2            ; <i1> [#uses=1]
-  br i1 %CN, label %T, label %F
+  br i1 %CN, label %T, label %F, !prof !1
 T:              ; preds = %0
   call void @foo1( )
   ret void
@@ -116,12 +116,12 @@ F:              ; preds = %0
   ret void
 }
 
-define void @test2(i32 %V) {
+define void @test2(i32 %V) !prof !0 {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:    switch i32 [[V:%.*]], label [[T:%.*]] [
 ; CHECK-NEXT:      i32 17, label [[F:%.*]]
 ; CHECK-NEXT:      i32 4, label [[F]]
-; CHECK-NEXT:    ]
+; CHECK-NEXT:    ], !prof [[PROF2:![0-9]+]]
 ; CHECK:       common.ret:
 ; CHECK-NEXT:    ret void
 ; CHECK:       T:
@@ -134,7 +134,7 @@ define void @test2(i32 %V) {
   %C1 = icmp ne i32 %V, 4         ; <i1> [#uses=1]
   %C2 = icmp ne i32 %V, 17                ; <i1> [#uses=1]
   %CN = and i1 %C1, %C2           ; <i1> [#uses=1]
-  br i1 %CN, label %T, label %F
+  br i1 %CN, label %T, label %F, !prof !1
 T:              ; preds = %0
   call void @foo1( )
   ret void
@@ -1313,3 +1313,16 @@ if.then:
 if.end:
   ret void
 }
+
+!0 = !{!"function_entry_count", i32 100}
+!1 = !{!"branch_weights", i32 6, i32 10}
+;.
+; DL: attributes #[[ATTR0:[0-9]+]] = { noredzone nounwind ssp }
+; DL: attributes #[[ATTR1:[0-9]+]] = { nounwind }
+; DL: attributes #[[ATTR2]] = { noredzone nounwind }
+; DL: attributes #[[ATTR3]] = { noredzone }
+;.
+; DL: [[META0:![0-9]+]] = !{!"function_entry_count", i32 100}
+; DL: [[PROF1]] = !{!"branch_weights", i32 10, i32 3, i32 3}
+; DL: [[PROF2]] = !{!"branch_weights", i32 6, i32 5, i32 5}
+;.

@mtrofin mtrofin force-pushed the users/mtrofin/09-30-_simplifycfg_profcheck_synthesize_profile_for_br_x_0_x_1_t_f1_-_switch branch from 0029ab3 to e2a3be6 Compare October 2, 2025 21:42
@mtrofin mtrofin requested a review from snehasish October 3, 2025 14:24
@mtrofin mtrofin changed the title [SimplifyCFG][profcheck] Synthesize profile for `br (X == 0 | X == 1), T, F1 -> switch [SimplifyCFG][profcheck] Synthesize profile for br (X == 0 | X == 1), T, F1 -> switch Oct 3, 2025
@mtrofin mtrofin force-pushed the users/mtrofin/09-30-_simplifycfg_profcheck_synthesize_profile_for_br_x_0_x_1_t_f1_-_switch branch from e2a3be6 to aaa7f1e Compare October 3, 2025 20:12
@mtrofin mtrofin force-pushed the users/mtrofin/09-30-_simplifycfg_profcheck_synthesize_profile_for_br_x_0_x_1_t_f1_-_switch branch from aaa7f1e to e49a6f0 Compare October 4, 2025 02:14
Copy link
Member Author

mtrofin commented Oct 4, 2025

Merge activity

  • Oct 4, 2:59 AM UTC: A user started a stack merge that includes this pull request via Graphite.
  • Oct 4, 3:01 AM UTC: @mtrofin merged this pull request with Graphite.

@mtrofin mtrofin merged commit 910e536 into main Oct 4, 2025
9 checks passed
@mtrofin mtrofin deleted the users/mtrofin/09-30-_simplifycfg_profcheck_synthesize_profile_for_br_x_0_x_1_t_f1_-_switch branch October 4, 2025 03:01
@llvm-ci
Copy link
Collaborator

llvm-ci commented Oct 4, 2025

LLVM Buildbot has detected a new failure on builder mlir-nvidia-gcc7 running on mlir-nvidia while building llvm at step 7 "test-build-check-mlir-build-only-check-mlir".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/116/builds/19252

Here is the relevant piece of the build log for the reference
Step 7 (test-build-check-mlir-build-only-check-mlir) failure: test (failure)
******************** TEST 'MLIR :: Integration/GPU/CUDA/async.mlir' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 1
/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-kernel-outlining  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)'  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-async-region -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary="format=fatbin"  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -async-to-async-runtime -async-runtime-ref-counting  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -convert-async-to-llvm -convert-func-to-llvm -convert-arith-to-llvm -convert-cf-to-llvm -reconcile-unrealized-casts  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-runner    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_cuda_runtime.so    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_async_runtime.so    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_runner_utils.so    --entry-point-result=void -O0  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/FileCheck /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-kernel-outlining
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt '-pass-pipeline=builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)'
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-async-region -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary=format=fatbin
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -async-to-async-runtime -async-runtime-ref-counting
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -convert-async-to-llvm -convert-func-to-llvm -convert-arith-to-llvm -convert-cf-to-llvm -reconcile-unrealized-casts
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-runner --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_cuda_runtime.so --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_async_runtime.so --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_runner_utils.so --entry-point-result=void -O0
# .---command stderr------------
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventSynchronize(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# `-----------------------------
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/FileCheck /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# .---command stderr------------
# | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir:68:12: error: CHECK: expected string not found in input
# |  // CHECK: [84, 84]
# |            ^
# | <stdin>:1:1: note: scanning from here
# | Unranked Memref base@ = 0x5cbe554aab30 rank = 1 offset = 0 sizes = [2] strides = [1] data = 
# | ^
# | <stdin>:2:1: note: possible intended match here
# | [42, 42]
# | ^
# | 
# | Input file: <stdin>
# | Check file: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |             1: Unranked Memref base@ = 0x5cbe554aab30 rank = 1 offset = 0 sizes = [2] strides = [1] data =  
# | check:68'0     X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
# |             2: [42, 42] 
# | check:68'0     ~~~~~~~~~
# | check:68'1     ?         possible intended match
...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants