-
Notifications
You must be signed in to change notification settings - Fork 13.7k
AMDGPU: Handle remote/fine-grained memory in atomicrmw fmin/fmax lowering #96759
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Handle remote/fine-grained memory in atomicrmw fmin/fmax lowering #96759
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm). Changes: Consider the new atomic metadata when choosing to expand as cmpxchg instead. Patch is 1.01 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/96759.diff 13 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index fc34277c580a8..11ebfe7511f7b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16093,6 +16093,34 @@ static bool isBFloat2(Type *Ty) {
return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
}
+/// \returns true if it's valid to emit a native instruction for \p RMW, based
+/// on the properties of the target memory.
+static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
+ const AtomicRMWInst *RMW,
+ bool HasSystemScope) {
+ // The remote/fine-grained access logic is different from the integer
+ // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
+ // fine-grained access does not work, even for a device local allocation.
+ //
+ // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
+ // allocations work.
+ if (HasSystemScope) {
+ if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
+ RMW->hasMetadata("amdgpu.no.remote.memory"))
+ return true;
+ } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
+ return true;
+
+ if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
+ return true;
+
+ // TODO: Auto-upgrade this attribute to the metadata in function body and stop
+ // checking it.
+ return RMW->getFunction()
+ ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+ .getValueAsBool();
+}
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
@@ -16236,37 +16264,32 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
Type *Ty = RMW->getType();
// LDS float and double fmin/fmax were always supported.
- if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
- return AtomicExpansionKind::None;
-
- if (unsafeFPAtomicsDisabled(RMW->getFunction()))
- return AtomicExpansionKind::CmpXChg;
-
- // Always expand system scope fp atomics.
- if (HasSystemScope)
- return AtomicExpansionKind::CmpXChg;
+ if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+ return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
+ : AtomicExpansionKind::CmpXChg;
+ }
- // For flat and global cases:
- // float, double in gfx7. Manual claims denormal support.
- // Removed in gfx8.
- // float, double restored in gfx10.
- // double removed again in gfx11, so only f32 for gfx11/gfx12.
- //
- // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
- // f32.
- //
- // FIXME: Check scope and fine grained memory
- if (AS == AMDGPUAS::FLAT_ADDRESS) {
- if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
- if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
- } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
- AS == AMDGPUAS::BUFFER_FAT_POINTER) {
- if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
- if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
+ // For flat and global cases:
+ // float, double in gfx7. Manual claims denormal support.
+ // Removed in gfx8.
+ // float, double restored in gfx10.
+ // double removed again in gfx11, so only f32 for gfx11/gfx12.
+ //
+ // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
+ // no f32.
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
+ if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
+ AS == AMDGPUAS::BUFFER_FAT_POINTER) {
+ if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ }
}
return AtomicExpansionKind::CmpXChg;
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index 06dee9c279f2c..2a15cdaede44e 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -796,23 +796,64 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
+; GFX11-NEXT: s_addk_i32 s4, 0x400
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
+; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX11-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB3_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s8
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s8
+; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc
+; GFX10-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_cbranch_execnz .LBB3_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
@@ -904,19 +945,60 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v1, s8
-; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc
+; GFX7-NEXT: v_mov_b32_e32 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s10, s8, 0x400
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX7-NEXT: v_mov_b32_e32 v3, s10
+; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX7-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB3_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v1, s8
-; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc
+; GFX6-NEXT: v_mov_b32_e32 v1, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s10, s8, 0x400
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX6-NEXT: v_mov_b32_e32 v3, s10
+; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX6-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v1, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB3_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
@@ -1992,21 +2074,66 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, s8
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: s_addk_i32 s8, 0x800
+; GFX10-NEXT: v_mov_b32_e32 v6, s8
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v10, v1
+; GFX10-NEXT: v_mov_b32_e32 v9, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc
+; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX10-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v0, v7
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
+; GFX10-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-NEXT: v_mov_b32_e32 v3, v10
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_cbranch_execnz .LBB8_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX90A-NEXT: s_add_i32 s10, s8, 0x800
+; GFX90A-NEXT: s_mov_b64 s[8:9], 0
+; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX90A-NEXT: v_mov_b32_e32 v6, s10
+; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX90A-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
@@ -2078,19 +2205,68 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s8
-; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX7-NEXT: s_add_i32 s10, s8, 0x800
+; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s10
+; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v10, v1
+; GFX7-NEXT: v_mov_b32_e32 v9, v0
+; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX7-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v7
+; GFX7-NEXT: v_mov_b32_e32 v1, v8
+; GFX7-NEXT: v_mov_b32_e32 v2, v9
+; GFX7-NEXT: v_mov_b32_e32 v3, v10
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NEXT: v_mov_b32_e32 v3, v1
+; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX6-NEXT: s_add_i32 s10, s8, 0x800
+; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_mov_b32_e32 v6, s10
+; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v10, v1
+; GFX6-NEXT: v_mov_b32_e32 v9, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX6-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v0, v7
+; GFX6-NEXT: v_mov_b32_e32 v1, v8
+; GFX6-NEXT: v_mov_b32_e32 v2, v9
+; GFX6-NEXT: v_mov_b32_e32 v3, v10
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB8_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
@@ -7943,32 +8119,11 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
-; GFX12-NEXT: s_addk_co_i32 s4, 0x400
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
-; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v5, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5
-; GFX12-NEXT: v_max_num_f32_e32 v4, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: s_cbranch_execnz .LBB22_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
@@ -8003,64 +8158,23 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NE...
[truncated]
|
db51986
to
36cbbdf
Compare
50d27a4
to
bfa6075
Compare
36cbbdf
to
0381e27
Compare
bfa6075
to
9df089b
Compare
0381e27
to
234b772
Compare
9df089b
to
581f9cb
Compare
234b772
to
20d2b3f
Compare
ping |
4590c05
to
788b25a
Compare
bb497b5
to
3e0884a
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ping
f5b8ff2
to
d4d5a69
Compare
d4d5a69
to
f00129c
Compare
ping in |
a2231fa
to
2c9a7e3
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ping
AMDGPU: Handle remote/fine-grained memory in atomicrmw fmin/fmax lowering. Consider the new atomic metadata when choosing to expand as cmpxchg instead.
ping |
2c9a7e3
to
478d3cb
Compare
Consider the new atomic metadata when choosing to expand as cmpxchg
instead.