AMDGPU: Handle remote/fine-grained memory in atomicrmw fmin/fmax lowering #96759

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
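
This patch changes the AMDGPU atomicrmw fmin/fmax lowering to consider the properties of the target memory instead of unconditionally expanding system-scope FP atomics. A new helper, globalMemoryFPAtomicIsLegal, decides whether a native instruction may be emitted based on the subtarget's support for agent-scope fine-grained remote memory atomics, the amdgpu.no.remote.memory and amdgpu.no.fine.grained.memory metadata on the instruction, and (as a legacy fallback) the amdgpu-unsafe-fp-atomics function attribute.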
clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu (4 additions, 4 deletions)
```diff
@@ -87,8 +87,8 @@ __global__ void ffp2(double *p) {
   // UNSAFE-LABEL: @_Z4ffp2Pd
   // UNSAFE: global_atomic_add_f64
   // UNSAFE: global_atomic_cmpswap_x2
-  // UNSAFE: global_atomic_cmpswap_x2
-  // UNSAFE: global_atomic_cmpswap_x2
+  // UNSAFE: global_atomic_max_f64
+  // UNSAFE: global_atomic_min_f64
   // UNSAFE: global_atomic_max_f64
   // UNSAFE: global_atomic_min_f64
   __atomic_fetch_add(p, 1.0, memory_order_relaxed);
@@ -124,8 +124,8 @@ __global__ void ffp3(long double *p) {
   // SAFE: global_atomic_cmpswap_b64
   // UNSAFE-LABEL: @_Z4ffp3Pe
   // UNSAFE: global_atomic_cmpswap_x2
-  // UNSAFE: global_atomic_cmpswap_x2
-  // UNSAFE: global_atomic_cmpswap_x2
+  // UNSAFE: global_atomic_max_f64
+  // UNSAFE: global_atomic_min_f64
   // UNSAFE: global_atomic_max_f64
   // UNSAFE: global_atomic_min_f64
   __atomic_fetch_add(p, 1.0L, memory_order_relaxed);
```
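With the new lowering, the UNSAFE run lines for the double (ffp2) and long double (ffp3) kernels expect native global_atomic_max_f64 and global_atomic_min_f64 instructions where the fmin/fmax atomics previously expanded to global_atomic_cmpswap_x2 loops.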
llvm/lib/Target/AMDGPU/SIISelLowering.cpp (53 additions, 30 deletions)
```diff
@@ -16118,6 +16118,34 @@ static bool isBFloat2(Type *Ty) {
   return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
 }
 
+/// \returns true if it's valid to emit a native instruction for \p RMW, based
+/// on the properties of the target memory.
+static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
+                                        const AtomicRMWInst *RMW,
+                                        bool HasSystemScope) {
+  // The remote/fine-grained access logic is different from the integer
+  // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
+  // fine-grained access does not work, even for a device local allocation.
+  //
+  // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
+  // allocations work.
+  if (HasSystemScope) {
+    if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
+        RMW->hasMetadata("amdgpu.no.remote.memory"))
+      return true;
+  } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
+    return true;
+
+  if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
+    return true;
+
+  // TODO: Auto-upgrade this attribute to the metadata in function body and
+  // stop checking it.
+  return RMW->getFunction()
+      ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+      .getValueAsBool();
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
```
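For illustration (this snippet is hypothetical, not taken from the patch's tests), the metadata the helper queries is attached directly to the atomicrmw instruction. Here, !amdgpu.no.fine.grained.memory asserts the address never refers to fine-grained memory, which by itself is enough for globalMemoryFPAtomicIsLegal to allow a native instruction even at system scope:

```llvm
; Hypothetical example. With the !amdgpu.no.fine.grained.memory assertion,
; globalMemoryFPAtomicIsLegal returns true and the native global atomic may be
; selected; without any metadata, legality falls back to the legacy
; amdgpu-unsafe-fp-atomics function attribute (see the TODO in the helper).
define double @system_scope_fmax(ptr addrspace(1) %p, double %v) {
  %old = atomicrmw fmax ptr addrspace(1) %p, double %v seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
  ret double %old
}

!0 = !{}
```

The second hunk of SIISelLowering.cpp then gates the native-instruction paths on this helper: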
```diff
@@ -16268,37 +16296,32 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
     Type *Ty = RMW->getType();
 
     // LDS float and double fmin/fmax were always supported.
-    if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
-      return AtomicExpansionKind::None;
-
-    if (unsafeFPAtomicsDisabled(RMW->getFunction()))
-      return AtomicExpansionKind::CmpXChg;
-
-    // Always expand system scope fp atomics.
-    if (HasSystemScope)
-      return AtomicExpansionKind::CmpXChg;
+    if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+      return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
+                                                 : AtomicExpansionKind::CmpXChg;
+    }
 
-    // For flat and global cases:
-    // float, double in gfx7. Manual claims denormal support.
-    // Removed in gfx8.
-    // float, double restored in gfx10.
-    // double removed again in gfx11, so only f32 for gfx11/gfx12.
-    //
-    // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
-    // f32.
-    //
-    // FIXME: Check scope and fine grained memory
-    if (AS == AMDGPUAS::FLAT_ADDRESS) {
-      if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
-        return ReportUnsafeHWInst(AtomicExpansionKind::None);
-      if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
-        return ReportUnsafeHWInst(AtomicExpansionKind::None);
-    } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
-               AS == AMDGPUAS::BUFFER_FAT_POINTER) {
-      if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
-        return ReportUnsafeHWInst(AtomicExpansionKind::None);
-      if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
-        return ReportUnsafeHWInst(AtomicExpansionKind::None);
-    }
+    if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
+      // For flat and global cases:
+      // float, double in gfx7. Manual claims denormal support.
+      // Removed in gfx8.
+      // float, double restored in gfx10.
+      // double removed again in gfx11, so only f32 for gfx11/gfx12.
+      //
+      // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
+      // no f32.
+      if (AS == AMDGPUAS::FLAT_ADDRESS) {
+        if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+        if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+      } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
+                 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
+        if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+        if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+      }
+    }
 
     return AtomicExpansionKind::CmpXChg;
```
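When globalMemoryFPAtomicIsLegal returns false, or the subtarget lacks a suitable instruction for the type and address space, the returned AtomicExpansionKind::CmpXChg directs AtomicExpandPass to rewrite the operation as a compare-exchange loop. A simplified sketch of that expansion, assuming f64 in the global address space (the pass's actual output differs in details such as metadata propagation):

```llvm
; Simplified sketch of the CmpXChg expansion of `atomicrmw fmax` on f64;
; not the literal AtomicExpandPass output.
declare double @llvm.maxnum.f64(double, double)

define double @fmax_expanded(ptr addrspace(1) %p, double %v) {
entry:
  %init = load double, ptr addrspace(1) %p, align 8
  br label %loop

loop:
  %loaded = phi double [ %init, %entry ], [ %new, %loop ]
  ; atomicrmw fmax follows llvm.maxnum semantics.
  %max = call double @llvm.maxnum.f64(double %loaded, double %v)
  ; cmpxchg only takes integer or pointer operands, so bitcast to i64.
  %cmp.i = bitcast double %loaded to i64
  %val.i = bitcast double %max to i64
  %pair = cmpxchg ptr addrspace(1) %p, i64 %cmp.i, i64 %val.i seq_cst seq_cst, align 8
  %new.i = extractvalue { i64, i1 } %pair, 0
  %success = extractvalue { i64, i1 } %pair, 1
  %new = bitcast i64 %new.i to double
  br i1 %success, label %done, label %loop

done:
  ; atomicrmw returns the value observed before the update.
  ret double %loaded
}
```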