
clang/AMDGPU: Emit atomicrmw for __builtin_amdgcn_global_atomic_fadd_{f32|f64} #96872


Merged
41 changes: 28 additions & 13 deletions clang/lib/CodeGen/CGBuiltin.cpp
@@ -58,6 +58,7 @@
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/MatrixBuilder.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ScopedPrinter.h"
@@ -18919,8 +18920,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
return Builder.CreateCall(F, { Src0, Builder.getFalse() });
}
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
@@ -18932,18 +18931,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Intrinsic::ID IID;
llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
ArgTy = llvm::Type::getFloatTy(getLLVMContext());
IID = Intrinsic::amdgcn_global_atomic_fadd;
break;
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
ArgTy = llvm::FixedVectorType::get(
llvm::Type::getHalfTy(getLLVMContext()), 2);
IID = Intrinsic::amdgcn_global_atomic_fadd;
break;
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
IID = Intrinsic::amdgcn_global_atomic_fadd;
break;
case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
IID = Intrinsic::amdgcn_global_atomic_fmin;
break;
@@ -19366,7 +19358,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64: {
llvm::AtomicRMWInst::BinOp BinOp;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19382,6 +19376,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
BinOp = llvm::AtomicRMWInst::FAdd;
break;
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
@@ -19416,9 +19412,14 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
EmitScalarExpr(E->getArg(3)), AO, SSID);
} else {
// The ds_atomic_fadd_* builtins do not have syncscope/order arguments.
SSID = llvm::SyncScope::System;
AO = AtomicOrdering::SequentiallyConsistent;
// Most of the builtins do not have syncscope/order arguments. For DS
// atomics the scope doesn't really matter, as they implicitly operate at
// workgroup scope.
//
// The global/flat cases need to use agent scope to consistently produce
// the native instruction instead of a cmpxchg expansion.
SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
Contributor:

What happens with system (the default)? I'm not sure I like using agent just to force the right expansion when there is no memory-model motivation behind it. Do we have a precedent for this kind of thing?

Could codegen be fixed so you can just use system?

Contributor Author:

It's complicated. The meaning of the builtin/intrinsic is to give you the instruction. The instruction does not work at system scope on most targets, so you would get a cmpxchg loop expansion; the de facto behavior was to get agent scope, and broken behavior depending on whether the allocation is fine-grained or remote memory.

The whole point of doing this is that we can now interpret system appropriately conservatively, and have a path to atomics that always work.
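
For illustration only (not part of the patch), here is a rough LLVM IR sketch of the two outcomes being discussed; %p, %x, the block names, and !0 are placeholders, and the exact expansion varies by target:

; With agent scope plus the new metadata, the backend can select the
; native global atomic add instruction:
%old = atomicrmw fadd ptr addrspace(1) %p, float %x syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0

; At system scope without the metadata, AtomicExpand instead lowers the
; same operation to a compare-exchange loop, roughly:
entry:
  %start = load float, ptr addrspace(1) %p, align 4
  br label %atomicrmw.start
atomicrmw.start:
  %loaded = phi float [ %start, %entry ], [ %newloaded, %atomicrmw.start ]
  %new = fadd float %loaded, %x
  %new.int = bitcast float %new to i32
  %loaded.int = bitcast float %loaded to i32
  %pair = cmpxchg ptr addrspace(1) %p, i32 %loaded.int, i32 %new.int monotonic monotonic, align 4
  %newloaded.int = extractvalue { i32, i1 } %pair, 0
  %success = extractvalue { i32, i1 } %pair, 1
  %newloaded = bitcast i32 %newloaded.int to float
  br i1 %success, label %atomicrmw.end, label %atomicrmw.start
atomicrmw.end: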

AO = AtomicOrdering::Monotonic;

// The v2bf16 builtin uses i16 instead of a natural bfloat type.
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
@@ -19432,6 +19433,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
if (Volatile)
RMW->setVolatile(true);

unsigned AddrSpace = Ptr.getType()->getAddressSpace();
if (AddrSpace != llvm::AMDGPUAS::LOCAL_ADDRESS) {
// Most targets require "amdgpu.no.fine.grained.memory" to emit the native
// instruction for flat and global operations.
llvm::MDTuple *EmptyMD = MDNode::get(getLLVMContext(), {});
RMW->setMetadata("amdgpu.no.fine.grained.memory", EmptyMD);

// Most targets require "amdgpu.ignore.denormal.mode" to emit the native
// instruction, but this only matters for float fadd.
if (BinOp == llvm::AtomicRMWInst::FAdd && Val->getType()->isFloatTy())
RMW->setMetadata("amdgpu.ignore.denormal.mode", EmptyMD);
}

return Builder.CreateBitCast(RMW, OrigTy);
}
case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
2 changes: 1 addition & 1 deletion clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@@ -49,7 +49,7 @@ void test_s_wait_event_export_ready() {
}

// CHECK-LABEL: @test_global_add_f32
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
void test_global_add_f32(float *rtn, global float *addr, float x) {
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
}
8 changes: 4 additions & 4 deletions clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;

// CHECK-LABEL: test_local_add_2bf16
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>

// GFX12-LABEL: test_local_add_2bf16
@@ -22,7 +22,7 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {

// CHECK-LABEL: test_local_add_2bf16_noret
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>

// GFX12-LABEL: test_local_add_2bf16_noret
@@ -32,15 +32,15 @@ void test_local_add_2bf16_noret(__local short2 *addr, short2 x) {
}

// CHECK-LABEL: test_local_add_2f16
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4
// GFX12-LABEL: test_local_add_2f16
// GFX12: ds_pk_add_rtn_f16
half2 test_local_add_2f16(__local half2 *addr, half2 x) {
return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
}

// CHECK-LABEL: test_local_add_2f16_noret
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4
// GFX12-LABEL: test_local_add_2f16_noret
// GFX12: ds_pk_add_f16
void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
4 changes: 2 additions & 2 deletions clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
@@ -6,7 +6,7 @@
// REQUIRES: amdgpu-registered-target

// CHECK-LABEL: test_fadd_local
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} monotonic, align 4
// GFX8-LABEL: test_fadd_local$local:
// GFX8: ds_add_rtn_f32 v2, v0, v1
// GFX8: s_endpgm
@@ -16,7 +16,7 @@ kernel void test_fadd_local(__local float *ptr, float val){
}

// CHECK-LABEL: test_fadd_local_volatile
// CHECK: = atomicrmw volatile fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
// CHECK: = atomicrmw volatile fadd ptr addrspace(3) %{{.+}}, float %{{.+}} monotonic, align 4
kernel void test_fadd_local_volatile(volatile __local float *ptr, float val){
volatile float *res;
*res = __builtin_amdgcn_ds_atomic_fadd_f32(ptr, val);
8 changes: 4 additions & 4 deletions clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -9,7 +9,7 @@
typedef half __attribute__((ext_vector_type(2))) half2;

// CHECK-LABEL: test_global_add_f64
// CHECK: call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %{{.*}}, double %{{.*}})
// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, double %{{.+}} syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
// GFX90A-LABEL: test_global_add_f64$local:
// GFX90A: global_atomic_add_f64
void test_global_add_f64(__global double *addr, double x) {
@@ -99,7 +99,7 @@ void test_flat_global_max_f64(__global double *addr, double x){
}

// CHECK-LABEL: test_ds_add_local_f64
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} seq_cst, align 8
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} monotonic, align 8
// GFX90A: test_ds_add_local_f64$local
// GFX90A: ds_add_rtn_f64
void test_ds_add_local_f64(__local double *addr, double x){
@@ -108,7 +108,7 @@ void test_ds_add_local_f64(__local double *addr, double x){
}

// CHECK-LABEL: test_ds_addf_local_f32
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} monotonic, align 4
// GFX90A-LABEL: test_ds_addf_local_f32$local
// GFX90A: ds_add_rtn_f32
void test_ds_addf_local_f32(__local float *addr, float x){
@@ -117,7 +117,7 @@ void test_ds_addf_local_f32(__local float *addr, float x){
}

// CHECK-LABEL: @test_global_add_f32
// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
void test_global_add_f32(float *rtn, global float *addr, float x) {
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
}
8 changes: 4 additions & 4 deletions clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
@@ -44,7 +44,7 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) {
// CHECK-LABEL: test_local_add_2bf16

// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4{{$}}
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>

// GFX940-LABEL: test_local_add_2bf16
@@ -54,23 +54,23 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
}

// CHECK-LABEL: test_local_add_2f16
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4
// GFX940-LABEL: test_local_add_2f16
// GFX940: ds_pk_add_rtn_f16
half2 test_local_add_2f16(__local half2 *addr, half2 x) {
return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
}

// CHECK-LABEL: test_local_add_2f16_noret
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4
// GFX940-LABEL: test_local_add_2f16_noret
// GFX940: ds_pk_add_f16
void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
__builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
}

// CHECK-LABEL: @test_global_add_f32
// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
void test_global_add_f32(float *rtn, global float *addr, float x) {
*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
}