Skip to content

AMDGPU: fix isSafeToSink expecting exactly one predecessor #89224

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions llvm/include/llvm/ADT/GenericCycleImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,21 @@ void GenericCycle<ContextT>::getExitBlocks(
}
}

template <typename ContextT>
void GenericCycle<ContextT>::getExitingBlocks(
SmallVectorImpl<BlockT *> &TmpStorage) const {
TmpStorage.clear();

for (BlockT *Block : blocks()) {
for (BlockT *Succ : successors(Block)) {
if (!contains(Succ)) {
TmpStorage.push_back(Block);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

break from the inner loop here.

break;
}
}
}
}

template <typename ContextT>
auto GenericCycle<ContextT>::getCyclePreheader() const -> BlockT * {
BlockT *Predecessor = getCyclePredecessor();
Expand Down
4 changes: 4 additions & 0 deletions llvm/include/llvm/ADT/GenericCycleInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,10 @@ template <typename ContextT> class GenericCycle {
/// branched to.
void getExitBlocks(SmallVectorImpl<BlockT *> &TmpStorage) const;

/// Return all blocks of this cycle that have successor outside of this cycle.
/// These blocks have cycle exit branch.
void getExitingBlocks(SmallVectorImpl<BlockT *> &TmpStorage) const;

/// Return the preheader block for this cycle. Pre-header is well-defined for
/// reducible cycle in docs/LoopTerminology.rst as: the only one entering
/// block and its only edge is to the entry block. Return null for irreducible
Expand Down
12 changes: 5 additions & 7 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,15 +213,13 @@ bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
// Check if there is a FromCycle that contains SgprDef's basic block but
// does not contain SuccToSinkTo and also has divergent exit condition.
while (FromCycle && !FromCycle->contains(ToCycle)) {
// After structurize-cfg, there should be exactly one cycle exit.
SmallVector<MachineBasicBlock *, 1> ExitBlocks;
FromCycle->getExitBlocks(ExitBlocks);
assert(ExitBlocks.size() == 1);
assert(ExitBlocks[0]->getSinglePredecessor());
SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
FromCycle->getExitingBlocks(ExitingBlocks);

// FromCycle has divergent exit condition.
if (hasDivergentBranch(ExitBlocks[0]->getSinglePredecessor())) {
return false;
for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
if (hasDivergentBranch(ExitingBlock))
return false;
}

FromCycle = FromCycle->getParentCycle();
Expand Down
93 changes: 93 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel -verify-machineinstrs < %s | FileCheck %s

; early-tailduplication deletes cycle exit block created by structurize-cfg
; that had exactly one predecessor. Now, new cycle exit block has two
; predecessors, we need to find predecessor that belongs to the cycle.

define amdgpu_ps void @_amdgpu_ps_main(i1 %arg) {
; CHECK-LABEL: _amdgpu_ps_main:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: s_mov_b32 s5, s4
; CHECK-NEXT: s_mov_b32 s6, s4
; CHECK-NEXT: s_mov_b32 s7, s4
; CHECK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; CHECK-NEXT: s_buffer_load_dword s1, s[4:7], 0x0
; CHECK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; CHECK-NEXT: s_mov_b32 s10, -1
; CHECK-NEXT: s_mov_b32 s11, 0x31c16000
; CHECK-NEXT: s_add_u32 s8, s8, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_cmp_ge_i32 s1, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB0_2
; CHECK-NEXT: .LBB0_1: ; %bb12
; CHECK-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9]
; CHECK-NEXT: s_mov_b64 s[2:3], s[10:11]
; CHECK-NEXT: s_swappc_b64 s[30:31], 0
; CHECK-NEXT: .LBB0_2: ; %bb2.preheader
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: v_mov_b32_e32 v0, s1
; CHECK-NEXT: s_branch .LBB0_4
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB0_3: ; %bb6
; CHECK-NEXT: ; in Loop: Header=BB0_4 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s3
; CHECK-NEXT: s_and_b32 s2, 1, s2
; CHECK-NEXT: v_or_b32_e32 v1, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e64 s2, 0, s2
; CHECK-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v0
; CHECK-NEXT: v_mov_b32_e32 v0, v1
; CHECK-NEXT: s_and_b32 s4, s2, s1
; CHECK-NEXT: s_andn2_b32 s1, s1, exec_lo
; CHECK-NEXT: s_and_b32 s2, exec_lo, s4
; CHECK-NEXT: s_or_b32 s1, s1, s2
; CHECK-NEXT: s_cbranch_vccz .LBB0_1
; CHECK-NEXT: .LBB0_4: ; %bb2
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_mov_b32 s2, 0
; CHECK-NEXT: s_and_saveexec_b32 s3, s0
; CHECK-NEXT: s_cbranch_execz .LBB0_3
; CHECK-NEXT: ; %bb.5: ; %bb5
; CHECK-NEXT: ; in Loop: Header=BB0_4 Depth=1
; CHECK-NEXT: s_mov_b32 s2, 1
; CHECK-NEXT: s_branch .LBB0_3
bb:
%i = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> zeroinitializer, i32 0, i32 0)
%i1 = icmp slt i32 %i, 0
br i1 %i1, label %bb2, label %bb12

bb2:
%i3 = phi i1 [ %i9, %bb6 ], [ false, %bb ]
%i4 = phi i32 [ %i10, %bb6 ], [ 0, %bb ]
br i1 %arg, label %bb5, label %bb6

bb5:
br label %bb6

bb6:
%i7 = phi i32 [ 0, %bb2 ], [ 1, %bb5 ]
%i8 = icmp ne i32 %i7, 0
%i9 = select i1 %i8, i1 %i3, i1 false
%i10 = or i32 %i4, 1
%i11 = icmp slt i32 %i4, 0
br i1 %i11, label %bb2, label %bb12

bb12:
%i13 = phi i1 [ false, %bb ], [ %i9, %bb6 ]
%i14 = select i1 %i13, float 0.000000e+00, float 1.000000e+00
%i15 = insertelement <4 x float> zeroinitializer, float %i14, i64 0
call amdgpu_gfx addrspace(4) void null(<4 x float> %i15, i32 0)
unreachable
}

declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg)
Loading