AMDGPU: Remove flat/global fmin/fmax intrinsics #105642

arsenm · 2024-08-22T11:06:30Z

These have been replaced with atomicrmw

arsenm · 2024-08-22T11:06:55Z

This stack of pull requests is managed by Graphite. Learn more about stacking.

Join @arsenm and the rest of your teammates on Graphite

llvmbot · 2024-08-22T11:06:58Z

@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-llvm-globalisel
@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-clang-codegen

@llvm/pr-subscribers-clang

Author: Matt Arsenault (arsenm)

Changes

These have been replaced with atomicrmw

Patch is 89.93 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/105642.diff

18 Files Affected:

(modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (-5)
(modified) llvm/lib/IR/AutoUpgrade.cpp (+16-9)
(modified) llvm/lib/Target/AMDGPU/AMDGPUInstructions.td (-4)
(modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (-4)
(modified) llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td (-4)
(modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (-4)
(modified) llvm/lib/Target/AMDGPU/FLATInstructions.td (-11)
(modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (-16)
(modified) llvm/test/Bitcode/amdgcn-atomic.ll (+64)
(modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll (+32-204)
(modified) llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll (+5-4)
(removed) llvm/test/CodeGen/AMDGPU/fp-min-max-flat-atomics-f64.ll (-51)
(removed) llvm/test/CodeGen/AMDGPU/fp-min-max-flat-atomics.ll (-83)
(removed) llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-f64.ll (-51)
(removed) llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics.ll (-87)
(modified) llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll (+4-360)
(removed) llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll (-224)
(modified) llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll (+6-57)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index dc13a35c66f9ab..9f2a3a985a56b5 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2945,11 +2945,6 @@ def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v
 // gfx90a intrinsics
 // ===----------------------------------------------------------------------===//
 
-def int_amdgcn_global_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
-def int_amdgcn_global_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
-def int_amdgcn_flat_atomic_fmin   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
-def int_amdgcn_flat_atomic_fmax   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
-
 defset list<Intrinsic> AMDGPUMFMAIntrinsics90A = {
 def int_amdgcn_mfma_f32_32x32x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
 def int_amdgcn_mfma_f32_16x16x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 8dd5b9b3ec3d1f..d2e00c928b1028 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1033,14 +1033,17 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
         break; // No other 'amdgcn.atomic.*'
       }
 
-      if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
-          Name.starts_with("ds.fmax") ||
-          Name.starts_with("global.atomic.fadd") ||
-          Name.starts_with("flat.atomic.fadd")) {
-        // Replaced with atomicrmw fadd/fmin/fmax, so there's no new
-        // declaration.
-        NewFn = nullptr;
-        return true;
+      if (Name.consume_front("ds.") || Name.consume_front("global.atomic.") ||
+          Name.consume_front("flat.atomic.")) {
+        if (Name.starts_with("fadd") ||
+            // FIXME: We should also remove fmin.num and fmax.num intrinsics.
+            (Name.starts_with("fmin") && !Name.starts_with("fmin.num")) ||
+            (Name.starts_with("fmax") && !Name.starts_with("fmax.num"))) {
+          // Replaced with atomicrmw fadd/fmin/fmax, so there's no new
+          // declaration.
+          NewFn = nullptr;
+          return true;
+        }
       }
 
       if (Name.starts_with("ldexp.")) {
@@ -4046,7 +4049,11 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
           .StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap)
           .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap)
           .StartsWith("global.atomic.fadd", AtomicRMWInst::FAdd)
-          .StartsWith("flat.atomic.fadd", AtomicRMWInst::FAdd);
+          .StartsWith("flat.atomic.fadd", AtomicRMWInst::FAdd)
+          .StartsWith("global.atomic.fmin", AtomicRMWInst::FMin)
+          .StartsWith("flat.atomic.fmin", AtomicRMWInst::FMin)
+          .StartsWith("global.atomic.fmax", AtomicRMWInst::FMax)
+          .StartsWith("flat.atomic.fmax", AtomicRMWInst::FMax);
 
   unsigned NumOperands = CI->getNumOperands();
   if (NumOperands < 3) // Malformed bitcode.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index aa5b151adef3a4..09987a6504b9d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -618,10 +618,6 @@ multiclass local_addr_space_atomic_op {
     }
 }
 
-defm int_amdgcn_flat_atomic_fmin : noret_op;
-defm int_amdgcn_flat_atomic_fmax : noret_op;
-defm int_amdgcn_global_atomic_fmin : noret_op;
-defm int_amdgcn_global_atomic_fmax : noret_op;
 defm int_amdgcn_global_atomic_csub : noret_op;
 defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
 defm int_amdgcn_flat_atomic_fmin_num : noret_op;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 69a1936a11fe05..126fc4d9672d8d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4897,12 +4897,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       break;
     }
     case Intrinsic::amdgcn_global_atomic_csub:
-    case Intrinsic::amdgcn_global_atomic_fmin:
-    case Intrinsic::amdgcn_global_atomic_fmax:
     case Intrinsic::amdgcn_global_atomic_fmin_num:
     case Intrinsic::amdgcn_global_atomic_fmax_num:
-    case Intrinsic::amdgcn_flat_atomic_fmin:
-    case Intrinsic::amdgcn_flat_atomic_fmax:
     case Intrinsic::amdgcn_flat_atomic_fmin_num:
     case Intrinsic::amdgcn_flat_atomic_fmax_num:
     case Intrinsic::amdgcn_atomic_cond_sub_u32:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 95c4859674ecc4..40423ed3e25871 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -239,13 +239,9 @@ def : SourceOfDivergence<int_r600_read_tidig_y>;
 def : SourceOfDivergence<int_r600_read_tidig_z>;
 def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
-def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
-def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_ordered_add_b64>;
-def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
-def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index cb3fbdb850c1ac..b615639ac1b82e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1045,8 +1045,6 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
   switch (IID) {
   case Intrinsic::amdgcn_is_shared:
   case Intrinsic::amdgcn_is_private:
-  case Intrinsic::amdgcn_flat_atomic_fmax:
-  case Intrinsic::amdgcn_flat_atomic_fmin:
   case Intrinsic::amdgcn_flat_atomic_fmax_num:
   case Intrinsic::amdgcn_flat_atomic_fmin_num:
     OpIndexes.push_back(0);
@@ -1106,8 +1104,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
     return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
                              {NewV, MaskOp});
   }
-  case Intrinsic::amdgcn_flat_atomic_fmax:
-  case Intrinsic::amdgcn_flat_atomic_fmin:
   case Intrinsic::amdgcn_flat_atomic_fmax_num:
   case Intrinsic::amdgcn_flat_atomic_fmin_num: {
     Type *DestTy = II->getType();
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 7b3822067072e5..d5d1d27c3a850f 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1604,15 +1604,11 @@ let OtherPredicates = [isGFX12Plus] in {
 let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
 }
 
 let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in {
 defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
 defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
-defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>;
-defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>;
 }
 
 let OtherPredicates = [isGFX12Only] in {
@@ -1642,13 +1638,6 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_globa
 let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>;
-}
-
-let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in {
-defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>;
-defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>;
 }
 
 let OtherPredicates = [HasFlatBufferGlobalAtomicFaddF64Inst] in {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c954c0aa71f734..464e77f66acfab 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1351,13 +1351,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                   MachineMemOperand::MODereferenceable;
     return true;
   }
-  case Intrinsic::amdgcn_global_atomic_fmin:
-  case Intrinsic::amdgcn_global_atomic_fmax:
   case Intrinsic::amdgcn_global_atomic_fmin_num:
   case Intrinsic::amdgcn_global_atomic_fmax_num:
   case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
-  case Intrinsic::amdgcn_flat_atomic_fmin:
-  case Intrinsic::amdgcn_flat_atomic_fmax:
   case Intrinsic::amdgcn_flat_atomic_fmin_num:
   case Intrinsic::amdgcn_flat_atomic_fmax_num:
   case Intrinsic::amdgcn_atomic_cond_sub_u32: {
@@ -1462,14 +1458,10 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
   case Intrinsic::amdgcn_ds_consume:
   case Intrinsic::amdgcn_ds_ordered_add:
   case Intrinsic::amdgcn_ds_ordered_swap:
-  case Intrinsic::amdgcn_flat_atomic_fmax:
   case Intrinsic::amdgcn_flat_atomic_fmax_num:
-  case Intrinsic::amdgcn_flat_atomic_fmin:
   case Intrinsic::amdgcn_flat_atomic_fmin_num:
   case Intrinsic::amdgcn_global_atomic_csub:
-  case Intrinsic::amdgcn_global_atomic_fmax:
   case Intrinsic::amdgcn_global_atomic_fmax_num:
-  case Intrinsic::amdgcn_global_atomic_fmin:
   case Intrinsic::amdgcn_global_atomic_fmin_num:
   case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
   case Intrinsic::amdgcn_global_load_tr_b64:
@@ -9285,12 +9277,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     DAG.setNodeMemRefs(NewNode, {MemRef});
     return SDValue(NewNode, 0);
   }
-  case Intrinsic::amdgcn_global_atomic_fmin:
-  case Intrinsic::amdgcn_global_atomic_fmax:
   case Intrinsic::amdgcn_global_atomic_fmin_num:
   case Intrinsic::amdgcn_global_atomic_fmax_num:
-  case Intrinsic::amdgcn_flat_atomic_fmin:
-  case Intrinsic::amdgcn_flat_atomic_fmax:
   case Intrinsic::amdgcn_flat_atomic_fmin_num:
   case Intrinsic::amdgcn_flat_atomic_fmax_num: {
     MemSDNode *M = cast<MemSDNode>(Op);
@@ -9301,16 +9289,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     };
     unsigned Opcode = 0;
     switch (IntrID) {
-    case Intrinsic::amdgcn_global_atomic_fmin:
     case Intrinsic::amdgcn_global_atomic_fmin_num:
-    case Intrinsic::amdgcn_flat_atomic_fmin:
     case Intrinsic::amdgcn_flat_atomic_fmin_num: {
       Opcode = ISD::ATOMIC_LOAD_FMIN;
       break;
     }
-    case Intrinsic::amdgcn_global_atomic_fmax:
     case Intrinsic::amdgcn_global_atomic_fmax_num:
-    case Intrinsic::amdgcn_flat_atomic_fmax:
     case Intrinsic::amdgcn_flat_atomic_fmax_num: {
       Opcode = ISD::ATOMIC_LOAD_FMAX;
       break;
diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll
index d642372799f56b..af3338577f7163 100644
--- a/llvm/test/Bitcode/amdgcn-atomic.ll
+++ b/llvm/test/Bitcode/amdgcn-atomic.ll
@@ -354,4 +354,68 @@ define float @upgrade_amdgcn_global_atomic_fadd_f32_p1_f32(ptr addrspace(1) %ptr
   ret float %result
 }
 
+declare float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr nocapture, float) #0
+
+define float @upgrade_amdgcn_flat_atomic_fmin_f32_p0_f32(ptr %ptr, float %data) {
+  ; CHECK: %{{.+}} = atomicrmw fmin ptr %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  %result = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %ptr, float %data)
+  ret float %result
+}
+
+declare float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) nocapture, float) #0
+
+define float @upgrade_amdgcn_global_atomic_fmin_f32_p1_f32(ptr addrspace(1) %ptr, float %data) {
+  ; CHECK: %{{.+}} = atomicrmw fmin ptr addrspace(1) %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  %result = call float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
+  ret float %result
+}
+
+declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr nocapture, double) #0
+
+define double @upgrade_amdgcn_flat_atomic_fmin_f64_p0_f64(ptr %ptr, double %data) {
+  ; CHECK: %{{.+}} = atomicrmw fmin ptr %ptr, double %data syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  %result = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
+  ret double %result
+}
+
+declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) nocapture, double) #0
+
+define double @upgrade_amdgcn_global_atomic_fmin_f64_p1_f64(ptr addrspace(1) %ptr, double %data) {
+  ; CHECK: %{{.+}} = atomicrmw fmin ptr addrspace(1) %ptr, double %data syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  %result = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
+  ret double %result
+}
+
+declare float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr nocapture, float) #0
+
+define float @upgrade_amdgcn_flat_atomic_fmax_f32_p0_f32(ptr %ptr, float %data) {
+  ; CHECK: %{{.+}} = atomicrmw fmax ptr %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  %result = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %ptr, float %data)
+  ret float %result
+}
+
+declare float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) nocapture, float) #0
+
+define float @upgrade_amdgcn_global_atomic_fmax_f32_p1_f32(ptr addrspace(1) %ptr, float %data) {
+  ; CHECK: %{{.+}} = atomicrmw fmax ptr addrspace(1) %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  %result = call float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
+  ret float %result
+}
+
+declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr nocapture, double) #0
+
+define double @upgrade_amdgcn_flat_atomic_fmax_f64_p0_f64(ptr %ptr, double %data) {
+  ; CHECK: %{{.+}} = atomicrmw fmax ptr %ptr, double %data syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  %result = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
+  ret double %result
+}
+
+declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) nocapture, double) #0
+
+define double @upgrade_amdgcn_global_atomic_fmax_f64_p1_f64(ptr addrspace(1) %ptr, double %data) {
+  ; CHECK: %{{.+}} = atomicrmw fmax ptr addrspace(1) %ptr, double %data syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+  %result = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
+  ret double %result
+}
+
 attributes #0 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index eb39ca2d7daa7f..92ce2af47e22ad 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -14,10 +14,6 @@ declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32
 declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
 declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg)
 declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32 immarg)
-declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
-declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
-declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
-declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
 
 define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64:
@@ -1015,52 +1011,6 @@ main_body:
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
-; GFX90A-LABEL: global_atomic_fmin_f64_noret:
-; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    global_atomic_min_f64 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
-;
-; GFX940-LABEL: global_atomic_fmin_f64_noret:
-; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX940-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX940-NEXT:    global_atomic_min_f64 v2, v[0:1], s[4:5]
-; GFX940-NEXT:    s_endpgm
-main_body:
-  %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
-  ret void
-}
-
-define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
-; GFX90A-LABEL: global_atomic_fmax_f64_noret:
-; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    global_atomic_max_f64 v2, v[0:1], s[4:5]
-; GFX90A-NEXT:    s_endpgm
-;
-; GFX940-LABEL: global_atomic_fmax_f64_noret:
-; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX940-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
-; GFX940-NEXT:    global_atomic_max_f64 v2, v[0:1], s[4:5]
-; GFX940-NEXT:    s_endpgm
-main_body:
-  %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
-  ret void
-}
-
 define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 {
 ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
 ; GFX90A:       ; %bb.0: ; %main_body
@@ -1070,7 +1020,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
 ; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v0, s4, v0
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT:    s_cbranch_execz .LBB38_2
+; GFX90A-NEXT:    s_cbranch_execz .LBB36_2
 ; GFX90A-NEXT:  ; %bb.1:
 ; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX90A-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
@@ -1083,7 +1033,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_invl2
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NEXT:  .LBB38_2:
+; GFX90A-NEXT:  .LBB36_2:
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat:
@@ -1094,7 +1044,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
 ; GFX940-NEXT:    v_mbcnt_hi_u32_b32 v0, s4, v0
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT:    s_cbranch_execz .LBB38_2
+; GFX940-NEXT:    s_cbranch_execz .LBB36_2
 ; GFX940-NEXT:  ; %bb.1:
 ; GFX940-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
@@ -...
[truncated]

doru1004

Can you provide a similar intrinsic which can be called as Intrinsic::int_amdgcn_global_atomic_fmax which calls your desired intrinsic instead?

arsenm · 2024-09-10T14:16:50Z

Can you provide a similar intrinsic which can be called as Intrinsic::int_amdgcn_global_atomic_fmax which calls your desired intrinsic instead?

No. The point is no intrinsics. Adding an identical intrinsic helps nothing. Only atomicrmw is an acceptable atomic representation

doru1004 · 2024-09-10T14:16:51Z

@arsenm can you provide an alternative to the current functionality which you are removing?

Use atomicrmw with appropriate metadata. See the autoupgrade code

Are you exposing something like Intrinsic::atomicrmw which I can call from Clang code gen?

arsenm · 2024-09-10T14:19:01Z

Are you exposing something like Intrinsic::atomicrmw which I can call from Clang code gen?

atomicrmw is already a first class IR instruction. The intrinsics were always a hack for missing operations and annotations on atomicrmw

doru1004 · 2024-09-10T14:19:01Z

Can you provide a similar intrinsic which can be called as Intrinsic::int_amdgcn_global_atomic_fmax which calls your desired intrinsic instead?

No. The point is no intrinsics. Adding an identical intrinsic helps nothing. Only atomicrmw is an acceptable atomic representation

I understand and you are free to only use atomicrmw under the hood. I just need to have a way to do it from Clang code gen.

doru1004 · 2024-09-10T14:22:05Z

Are you exposing something like Intrinsic::atomicrmw which I can call from Clang code gen?

atomicrmw is already a first class IR instruction. The intrinsics were always a hack for missing operations and annotations on atomicrmw

Are there examples of how to emit such an instruction in clang code gen?

arsenm · 2024-09-10T14:30:22Z

Are you exposing something like Intrinsic::atomicrmw which I can call from Clang code gen?

atomicrmw is already a first class IR instruction. The intrinsics were always a hack for missing operations and annotations on atomicrmw

Are there examples of how to emit such an instruction in clang code gen?

Yes. The corresponding builtins are already emitting atomicrmw with annotations.

doru1004 · 2024-09-10T14:31:27Z

Are you exposing something like Intrinsic::atomicrmw which I can call from Clang code gen?

atomicrmw is already a first class IR instruction. The intrinsics were always a hack for missing operations and annotations on atomicrmw

Are there examples of how to emit such an instruction in clang code gen?

Yes. The corresponding builtins are already emitting atomicrmw with annotations.

Can you point me to where this is happening please?

arsenm · 2024-09-11T12:21:33Z

Can you point me to where this is happening please?

llvm-project/clang/lib/CodeGen/CGBuiltin.cpp

Line 19552 in b9c2e2e

case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:

doru1004 · 2024-09-11T13:47:54Z

@arsenm I am not ok with the removal of these intrinsics that allow direct call to some of these atomics. I think you should keep them while you still do the changes that you want with the other atomicrmw. Your change is not backwards compatible.
It is not yet clear to me if calling create atomicrmw is equivalent to calling the old intrinsics.

arsenm · 2024-09-11T13:55:29Z

@arsenm I am not ok with the removal of these intrinsics that allow direct call to some of these atomics. I think you should keep them while you still do the changes that you want with the other atomicrmw.

The changes are already completed and we are at the final point of removal.

Your change is not backwards compatible. It is not yet clear to me if calling create atomicrmw is equivalent to calling the old intrinsics.

The intrinsics are strictly less expressive than atomicrmw. There is no plus to using the intrinsics, and it is trivial to migrate to an atomicrmw that will produce the ISA instruction. The old intrinsics had broken scope and ordering arguments which did not work. You only need to decide whether the use case is OK with breaking remote memory, fine grained memory, or both.

rovka · 2024-09-12T07:39:59Z

@arsenm I am not ok with the removal of these intrinsics that allow direct call to some of these atomics. I think you should keep them while you still do the changes that you want with the other atomicrmw. Your change is not backwards compatible. It is not yet clear to me if calling create atomicrmw is equivalent to calling the old intrinsics.

@doru1004 Can you please provide an example use case where it's not clear how to use atomicrmw to replace the old intrinsics? Then maybe we can add something to the docs or at least commit message to clarify.

doru1004 · 2024-09-16T15:28:29Z

@arsenm I am not ok with the removal of these intrinsics that allow direct call to some of these atomics. I think you should keep them while you still do the changes that you want with the other atomicrmw. Your change is not backwards compatible. It is not yet clear to me if calling create atomicrmw is equivalent to calling the old intrinsics.

@doru1004 Can you please provide an example use case where it's not clear how to use atomicrmw to replace the old intrinsics? Then maybe we can add something to the docs or at least commit message to clarify.

For all the cases where the intrinsic was used (i.e. fadd/fmin/fmax), emitting an atomic RMW without any metadata is conservative and safe, but not equivalent to the behavior of the intrinsic. So one thing that has not been clear is which combination of metadata attributes makes the newly emitted atomic RMW instruction equivalent (or very, very close) to the behavior of the intrinsic that was removed. I would argue that some higher-level functions like emitAtomicRMW{FAdd/FMin/FMax} are needed to cover those cases and emit the atomicRMW with the correct combination of metadata attributes.

These have been replaced with atomicrmw

shiltian

LGTM, as long as @doru1004 's concern has been resolved.

doru1004 · 2024-10-08T19:51:54Z

I would still like to see something like the functions I suggested in my previous comment. Something that will be equivalent with the intrinsic that is being eliminated. These functions would use the new atomicRMW method AND also set the attributes accordingly such that it matches the behavior of the intrinsic.

arsenm · 2024-10-08T19:55:02Z

I would still like to see something like the functions I suggested in my previous comment. Something that will be equivalent with the intrinsic that is being eliminated. These functions would use the new atomicRMW method AND also set the attributes accordingly such that it matches the behavior of the intrinsic.

There won't be a new function, but @yxsamliu is working on source level controls for these fields for the atomic builtins

doru1004 · 2024-10-08T20:02:24Z

There won't be a new function, but @yxsamliu is working on source level controls for these fields for the atomic builtins

I need to be able to call this from Clang code gen. The reason I insist on this is because I don't think the selection of attributes to pass to the current atomicRMW function is clear enough i.e. if I was relying on the intrinsic up until now, it is not clear what set of additional options to pass to atomicRMW() to make it behave like the intrinsic. So I would like that knowledge to live somewhere in the compiler i.e. in the shape of a function atomicRMWFastFAdd or something like that which calls atomicRMW under the hood BUT ALSO provides the set of additional options that make it behave like the intrinsic.

arsenm · 2024-10-08T20:08:33Z

I need to be able to call this from Clang code gen. The reason I insist on this is because I don't think the selection of attributes to pass to the current atomicRMW function is clear enough i.e. if I was relying on the intrinsic up until now, it is not clear what set of additional options to pass to atomicRMW() to make it behave like the intrinsic.

This is exactly why we must remove this intrinsic or anything that acts like it. You, the frontend writer, must semantically know what you can do. The semantics cannot be "do whatever the intrinsic happens to do on this target"

doru1004 · 2024-10-08T20:20:09Z

This is exactly why we must remove this intrinsic or anything that acts like it. You, the frontend writer, must semantically know what you can do. The semantics cannot be "do whatever the intrinsic happens to do on this target"

But you're not removing everything that acts like it except for the intrinsic itself. There's nothing illegal about asking for a new style of atomic RMW with a particular set of additional options that happens to match the previous intrinsic.

arsenm · 2024-10-08T20:41:53Z

But you're not removing everything that acts like it except for the intrinsic itself. There's nothing illegal about asking for a new style of atomic RMW with a particular set of additional options that happens to match the previous intrinsic.

Part of the point of this is to make you do this work. Adding a convenience to "match the intrinsic" is counterproductive. The intrinsic was not semantically defined

doru1004 · 2024-10-08T20:59:23Z

Part of the point of this is to make you do this work. Adding a convenience to "match the intrinsic" is counterproductive. The intrinsic was not semantically defined

The problem is that no one knows what options to use to make the atomicRMW closer to the old intrinsic. You do not document that anywhere.

arsenm · 2024-10-08T21:19:12Z

The problem is that no one knows what options to use to make the atomicRMW closer to the old intrinsic. You do not document that anywhere.

It's in AMDGPUUsage

llvm-ci · 2024-10-09T05:49:16Z

LLVM Buildbot has detected a new failure on builder sanitizer-aarch64-linux-fuzzer running on sanitizer-buildbot11 while building llvm at step 2 "annotate".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/159/builds/7787

Here is the relevant piece of the build log for the reference

Step 2 (annotate) failure: 'python ../sanitizer_buildbot/sanitizers/zorg/buildbot/builders/sanitizers/buildbot_selector.py' (failure) (timed out)
...
[5/7] Linking CXX shared library /home/b/sanitizer-aarch64-linux-fuzzer/build/llvm_build0/lib/clang/20/lib/aarch64-unknown-linux-gnu/libclang_rt.ubsan_standalone.so
[6/7] Linking CXX shared library /home/b/sanitizer-aarch64-linux-fuzzer/build/llvm_build0/lib/clang/20/lib/aarch64-unknown-linux-gnu/libclang_rt.hwasan.so
[7/7] Linking CXX shared library /home/b/sanitizer-aarch64-linux-fuzzer/build/llvm_build0/lib/clang/20/lib/aarch64-unknown-linux-gnu/libclang_rt.asan.so
[20/22] No install step for 'runtimes'
[22/22] Completed 'runtimes'
02a9cac8b07f7ebb5a12cb2ad6427507  llvm_build0/bin/clang
@@@BUILD_STEP get fuzzer-test-suite @@@
Already up to date.
@@@BUILD_STEP test libxml2-v2.9.2 fuzzer@@@
Cloning into 'SRC'...
command timed out: 1200 seconds without output running [b'python', b'../sanitizer_buildbot/sanitizers/zorg/buildbot/builders/sanitizers/buildbot_selector.py'], attempting to kill
process killed by signal 9
program finished with exit code -1
elapsedTime=1242.223521
Step 9 (test libxml2-v2.9.2 fuzzer) failure: test libxml2-v2.9.2 fuzzer (failure)
@@@BUILD_STEP test libxml2-v2.9.2 fuzzer@@@
Cloning into 'SRC'...

command timed out: 1200 seconds without output running [b'python', b'../sanitizer_buildbot/sanitizers/zorg/buildbot/builders/sanitizers/buildbot_selector.py'], attempting to kill
process killed by signal 9
program finished with exit code -1
elapsedTime=1242.223521

doru1004 · 2024-10-11T12:41:12Z

> > The problem is that no one knows what options to use to make the atomicRMW closer to the old intrinsic. You do not document that anywhere.
>
> It's in AMDGPUUsage

I agree, but that doesn't say what combination of those options to use to be as close as possible to the previous intrinsic. So if we can document that, then we will be all good!

arsenm mentioned this pull request Aug 22, 2024

AMDGPU: Remove global/flat atomic fadd intrinsics #97051

Merged

arsenm added the backend:AMDGPU, clang (Clang issues not falling into any other category), clang:codegen (IR generation bugs: mangling, exceptions, etc.), llvm:globalisel, llvm:ir, and llvm:transforms labels Aug 22, 2024 — with Graphite App

arsenm mentioned this pull request Aug 22, 2024

AMDGPU: Legalize v2f16 atomicrmw fadd for buffer fat pointers #95929

Merged

arsenm requested review from AlexVlx, jayfoad, Sisyph, yashssh and yxsamliu August 22, 2024 11:06

arsenm marked this pull request as ready for review August 22, 2024 11:06

This was referenced Aug 22, 2024

clang/AMDGPU: Emit atomicrmw for flat/global atomic min/max f64 builtins #96876

Merged

AMDGPU: Remove flat/global atomic fadd v2bf16 intrinsics #97050

Merged

Base automatically changed from users/arsenm/amdgpu-remove-global-flat-atomic-fadd-intrinsics to main August 22, 2024 19:27

arsenm force-pushed the users/arsenm/amdgpu-remove-flat-global-fmin-fmax-intrinsics branch from be7d2ae to 98ba592 Compare August 22, 2024 19:38

doru1004 requested changes Sep 10, 2024

View reviewed changes

arsenm force-pushed the users/arsenm/amdgpu-remove-flat-global-fmin-fmax-intrinsics branch from 6e4312b to c2eebd0 Compare October 4, 2024 20:25

arsenm requested a review from doru1004 October 4, 2024 20:28

AMDGPU: Remove flat/global fmin/fmax intrinsics

d4bb0b2

These have been replaced with atomicrmw

arsenm force-pushed the users/arsenm/amdgpu-remove-flat-global-fmin-fmax-intrinsics branch from c2eebd0 to d4bb0b2 Compare October 8, 2024 19:13

shiltian approved these changes Oct 8, 2024

View reviewed changes

arsenm merged commit c198f77 into main Oct 9, 2024
8 checks passed

arsenm deleted the users/arsenm/amdgpu-remove-flat-global-fmin-fmax-intrinsics branch October 9, 2024 05:27

AMDGPU: Remove flat/global fmin/fmax intrinsics #105642

AMDGPU: Remove flat/global fmin/fmax intrinsics #105642

Uh oh!

Conversation

arsenm commented Aug 22, 2024

Uh oh!

arsenm commented Aug 22, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Aug 22, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

doru1004 left a comment

Choose a reason for hiding this comment

Uh oh!

arsenm commented Sep 10, 2024

Uh oh!

doru1004 commented Sep 10, 2024

Uh oh!

arsenm commented Sep 10, 2024

Uh oh!

doru1004 commented Sep 10, 2024

Uh oh!

doru1004 commented Sep 10, 2024

Uh oh!

arsenm commented Sep 10, 2024

Uh oh!

doru1004 commented Sep 10, 2024

Uh oh!

arsenm commented Sep 11, 2024

Uh oh!

doru1004 commented Sep 11, 2024

Uh oh!

arsenm commented Sep 11, 2024

Uh oh!

rovka commented Sep 12, 2024

Uh oh!

doru1004 commented Sep 16, 2024

Uh oh!

shiltian left a comment

Choose a reason for hiding this comment

Uh oh!

doru1004 commented Oct 8, 2024

Uh oh!

arsenm commented Oct 8, 2024

Uh oh!

doru1004 commented Oct 8, 2024

Uh oh!

arsenm commented Oct 8, 2024

Uh oh!

doru1004 commented Oct 8, 2024

Uh oh!

arsenm commented Oct 8, 2024

Uh oh!

doru1004 commented Oct 8, 2024

Uh oh!

arsenm commented Oct 8, 2024

Uh oh!

Uh oh!

llvm-ci commented Oct 9, 2024

Uh oh!

doru1004 commented Oct 11, 2024

Uh oh!

Uh oh!

arsenm commented Aug 22, 2024 •

edited

Loading

llvmbot commented Aug 22, 2024 •

edited

Loading