diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index c9eb5eea89690..2b68c54de8beb 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -75,6 +75,11 @@ Changes to the AArch64 Backend
 Changes to the AMDGPU Backend
 -----------------------------

+* Removed ``llvm.amdgcn.flat.atomic.fadd`` and
+  ``llvm.amdgcn.global.atomic.fadd`` intrinsics. Users should use the
+  :ref:`atomicrmw <i_atomicrmw>` instruction with ``fadd`` and
+  addrspace(0) or addrspace(1) instead.
+
 Changes to the ARM Backend
 --------------------------

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 539410f1ed05e..dc13a35c66f9a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2909,8 +2909,6 @@ def int_amdgcn_dot4_f32_bf8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
 // gfx908 intrinsics
 // ===----------------------------------------------------------------------===//

-def int_amdgcn_global_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
-
 // llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
 class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> : ClangBuiltin<!subst("int", "__builtin", NAME)>,
@@ -2949,7 +2947,6 @@ def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic;
 def int_amdgcn_global_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
-def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
 def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
 def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index c6963edf5288a..8dd5b9b3ec3d1 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1035,8 +1035,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
     if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
         Name.starts_with("ds.fmax") ||
-        Name.starts_with("global.atomic.fadd.v2bf16") ||
-        Name.starts_with("flat.atomic.fadd.v2bf16")) {
+        Name.starts_with("global.atomic.fadd") ||
+        Name.starts_with("flat.atomic.fadd")) {
       // Replaced with atomicrmw fadd/fmin/fmax, so there's no new
       // declaration.
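
For illustration of that replacement (an editorial sketch, not part of the patch): with the prefix match broadened from the "*.atomic.fadd.v2bf16" names to all "global.atomic.fadd" / "flat.atomic.fadd" variants, the upgrader rewrites calls like @old below into the atomicrmw form shown in @new. The names @old, @new, %p, %v, and !0 are hypothetical; the scope, ordering, and metadata follow the Bitcode/amdgcn-atomic.ll tests added later in this patch.

    ; before auto-upgrade (bitcode produced by an older LLVM)
    define float @old(ptr addrspace(1) %p, float %v) {
      %r = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %v)
      ret float %r
    }
    declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)

    ; after auto-upgrade: the call becomes a native atomicrmw instruction
    define float @new(ptr addrspace(1) %p, float %v) {
      %r = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
      ret float %r
    }
    !0 = !{}

The addrspace(0) flat form is upgraded the same way; per the tests, !amdgpu.ignore.denormal.mode is attached only for the f32 variants, while the v2f16 variants carry just !amdgpu.no.fine.grained.memory.
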
      NewFn = nullptr;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index db8b44149cf47..aa5b151adef3a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -618,16 +618,11 @@ multiclass local_addr_space_atomic_op {
   }
 }
-defm int_amdgcn_flat_atomic_fadd : noret_op;
-defm int_amdgcn_flat_atomic_fadd : flat_addr_space_atomic_op;
 defm int_amdgcn_flat_atomic_fmin : noret_op;
 defm int_amdgcn_flat_atomic_fmax : noret_op;
-defm int_amdgcn_global_atomic_fadd : global_addr_space_atomic_op;
-defm int_amdgcn_flat_atomic_fadd : global_addr_space_atomic_op;
 defm int_amdgcn_global_atomic_fmin : noret_op;
 defm int_amdgcn_global_atomic_fmax : noret_op;
 defm int_amdgcn_global_atomic_csub : noret_op;
-defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
 defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
 defm int_amdgcn_flat_atomic_fmin_num : noret_op;
 defm int_amdgcn_flat_atomic_fmax_num : noret_op;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 12aa6ee2a2536..69a1936a11fe0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4896,13 +4896,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
     break;
   }
-  case Intrinsic::amdgcn_global_atomic_fadd:
   case Intrinsic::amdgcn_global_atomic_csub:
   case Intrinsic::amdgcn_global_atomic_fmin:
   case Intrinsic::amdgcn_global_atomic_fmax:
   case Intrinsic::amdgcn_global_atomic_fmin_num:
   case Intrinsic::amdgcn_global_atomic_fmax_num:
-  case Intrinsic::amdgcn_flat_atomic_fadd:
   case Intrinsic::amdgcn_flat_atomic_fmin:
   case Intrinsic::amdgcn_flat_atomic_fmax:
   case Intrinsic::amdgcn_flat_atomic_fmin_num:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 48fb786ed9720..95c4859674ecc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -239,13 +239,11 @@ def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
-def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
-def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index c38c2dc0f5f61..cb3fbdb850c1a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1045,7 +1045,6 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<unsigned> &OpIndexes,
   switch (IID) {
   case Intrinsic::amdgcn_is_shared:
   case Intrinsic::amdgcn_is_private:
-  case Intrinsic::amdgcn_flat_atomic_fadd:
   case Intrinsic::amdgcn_flat_atomic_fmax:
   case Intrinsic::amdgcn_flat_atomic_fmin:
   case Intrinsic::amdgcn_flat_atomic_fmax_num:
@@ -1107,7 +1106,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
     return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
                              {NewV, MaskOp});
   }
-  case Intrinsic::amdgcn_flat_atomic_fadd:
   case Intrinsic::amdgcn_flat_atomic_fmax:
   case Intrinsic::amdgcn_flat_atomic_fmin:
   case Intrinsic::amdgcn_flat_atomic_fmax_num:
diff --git
a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 3c76fb2f7961f..e9283fde85a48 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1135,11 +1135,7 @@ class DSAtomicRetPatIntrinsic.ret:$value, Offset:$offset, (i1 gds))> { } - -def : DSAtomicRetPatIntrinsic; -let AddedComplexity = 1 in -def : DSAtomicRetPatIntrinsic; -} +} // End SubtargetPredicate = HasLdsAtomicAddF64 let SubtargetPredicate = HasAtomicDsPkAdd16Insts in { defm : DSAtomicRetNoRetPat_mc; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 8067090636a9a..7b3822067072e 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1625,25 +1625,17 @@ let OtherPredicates = [isGFX12Only] in { let OtherPredicates = [HasAtomicFaddNoRtnInsts] in { defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; -defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>; -defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>; } let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in { defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>; -defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>; -defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>; } let OtherPredicates = [HasAtomicFaddRtnInsts] in { defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; -defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>; -defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>; } let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in { -defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>; -defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>; } @@ -1661,19 +1653,14 @@ defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", let OtherPredicates = [HasFlatBufferGlobalAtomicFaddF64Inst] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; -defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>; -defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>; defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>; -defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>; } let OtherPredicates = [HasFlatAtomicFaddF32Inst] in { defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; -defm : FlatAtomicIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>; } let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in { -defm : FlatAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_F16", 
"int_amdgcn_flat_atomic_fadd", v2f16>; defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>; defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d02d0bbb52e56..c954c0aa71f73 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1351,13 +1351,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MODereferenceable; return true; } - case Intrinsic::amdgcn_global_atomic_fadd: case Intrinsic::amdgcn_global_atomic_fmin: case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: - case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmin_num: @@ -1464,13 +1462,11 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_ds_consume: case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: - case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_csub: - case Intrinsic::amdgcn_global_atomic_fadd: case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fmin: diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll index 9563d178e6433..d642372799f56 100644 --- a/llvm/test/Bitcode/amdgcn-atomic.ll +++ b/llvm/test/Bitcode/amdgcn-atomic.ll @@ -322,4 +322,36 @@ define <2 x i16> @upgrade_amdgcn_global_atomic_fadd_v2bf16_p1(ptr addrspace(1) % ret <2 x i16> %result } +declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr nocapture, <2 x half>) #0 + +define <2 x half> @upgrade_amdgcn_flat_atomic_fadd_v2f16_p0_v2f16(ptr %ptr, <2 x half> %data) { + ; CHECK: %{{.+}} = atomicrmw fadd ptr %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + %result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) + ret <2 x half> %result +} + +declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) nocapture, <2 x half>) #0 + +define <2 x half> @upgrade_amdgcn_global_atomic_fadd_v2f16_p1_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) { + ; CHECK: %{{.+}} = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + %result = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + ret <2 x half> %result +} + +declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr nocapture, float) #0 + +define float @upgrade_amdgcn_flat_atomic_fadd_f32_p0_f32(ptr %ptr, float %data) { + ; CHECK: %{{.+}} = atomicrmw fadd ptr %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}} + %result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) + ret float %result +} + +declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) #0 
+ +define float @upgrade_amdgcn_global_atomic_fadd_f32_p1_f32(ptr addrspace(1) %ptr, float %data) { + ; CHECK: %{{.+}} = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}} + %result = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) + ret float %result +} + attributes #0 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll index 820f9ee1ce7f4..7a97ac8211f67 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll @@ -12,7 +12,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) ; GFX940-NEXT: S_ENDPGM 0 ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic @@ -23,7 +23,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) ; GFX11-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data) ret void @@ -38,7 +38,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; @@ -50,7 +50,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = 
FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll index b2a96fb948797..c1cb74cb0e25a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll @@ -1,46 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s - -define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_intrinsic(ptr %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data) - ret void -} - -define amdgpu_ps double @flat_atomic_fadd_f64_rtn_intrinsic(ptr %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: 
[[COPY5:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data) - ret double %ret -} +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %data) { ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw @@ -82,6 +42,4 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da ret double %ret } -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr, double) - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll index f2f00823e0278..0896e4dc7af14 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.v2f16.ll @@ -1,38 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX940 %s -define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn_intrinsic(ptr %ptr, <2 x half> %data) { - ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_no_rtn_intrinsic - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: FLAT_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr) - ; GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data) - ret void -} - -define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn_intrinsic(ptr %ptr, <2 x half> %data) { - ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_rtn_intrinsic - ; GFX940: bb.1 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr) - ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]] - ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = 
call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data) - ret <2 x half> %ret -} - -define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) #0 { +define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_rtn ; GFX940: bb.1 (%ir-block.0): ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -48,7 +17,7 @@ define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %da ret <2 x half> %ret } -define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) #0 { +define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) { ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_saddr_rtn ; GFX940: bb.1 (%ir-block.0): ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -65,7 +34,7 @@ define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2 ret <2 x half> %ret } -define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn(ptr %ptr, <2 x half> %data) #0 { +define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn(ptr %ptr, <2 x half> %data) { ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_no_rtn ; GFX940: bb.1 (%ir-block.0): ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -80,7 +49,7 @@ define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn(ptr %ptr, <2 x half> %data) ret void } -define amdgpu_ps void @flat_atomic_fadd_v2f16_saddr_no_rtn(ptr inreg %ptr, <2 x half> %data) #0 { +define amdgpu_ps void @flat_atomic_fadd_v2f16_saddr_no_rtn(ptr inreg %ptr, <2 x half> %data) { ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_saddr_no_rtn ; GFX940: bb.1 (%ir-block.0): ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -96,6 +65,4 @@ define amdgpu_ps void @flat_atomic_fadd_v2f16_saddr_no_rtn(ptr inreg %ptr, <2 x ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" } - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index 031a3633bd375..25c7fc6463c33 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -1,22 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 - -declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) -declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - -define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX940-NEXT: s_endpgm - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) - ret void -} +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck %s -check-prefix=GFX940 define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: @@ -50,17 +33,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ret void } -define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_rtn: 
-; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) - ret float %ret -} - define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat: ; GFX940: ; %bb.0: @@ -75,31 +47,6 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ret float %ret } -define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 -; GFX940-NEXT: s_endpgm - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - ret void -} - -define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - ret <2 x half> %ret -} - define <2 x half> @local_atomic_fadd_ret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) { ; GFX940-LABEL: local_atomic_fadd_ret_v2f16_offset: ; GFX940: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index cfd4e1211bb01..eb39ca2d7daa7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -14,10 +14,8 @@ declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32 declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg) declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32 immarg) -declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) @@ -1017,29 +1015,6 @@ main_body: ret void } -define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) { -; GFX90A-LABEL: global_atomic_fadd_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: global_atomic_fadd_f64_noret: -; GFX940: ; 
%bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body @@ -1095,7 +1070,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB39_2 +; GFX90A-NEXT: s_cbranch_execz .LBB38_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1108,7 +1083,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB39_2: +; GFX90A-NEXT: .LBB38_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: @@ -1119,7 +1094,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB39_2 +; GFX940-NEXT: s_cbranch_execz .LBB38_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1131,7 +1106,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: .LBB39_2: +; GFX940-NEXT: .LBB38_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1147,7 +1122,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB40_2 +; GFX90A-NEXT: s_cbranch_execz .LBB39_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1158,7 +1133,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB40_2: +; GFX90A-NEXT: .LBB39_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: @@ -1169,7 +1144,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB40_2 +; GFX940-NEXT: s_cbranch_execz .LBB39_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1181,7 +1156,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX940-NEXT: 
global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB40_2: +; GFX940-NEXT: .LBB39_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1197,7 +1172,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB41_2 +; GFX90A-NEXT: s_cbranch_execz .LBB40_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1210,7 +1185,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB41_2: +; GFX90A-NEXT: .LBB40_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: @@ -1221,7 +1196,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB41_2 +; GFX940-NEXT: s_cbranch_execz .LBB40_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1233,7 +1208,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: .LBB41_2: +; GFX940-NEXT: .LBB40_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1249,7 +1224,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB42_2 +; GFX90A-NEXT: s_cbranch_execz .LBB41_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1260,7 +1235,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB42_2: +; GFX90A-NEXT: .LBB41_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: @@ -1271,7 +1246,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB42_2 +; GFX940-NEXT: s_cbranch_execz .LBB41_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1283,32 +1258,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB42_2: +; GFX940-NEXT: .LBB41_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 
syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define double @global_atomic_fadd_f64_rtn(ptr addrspace(1) %ptr, double %data) { -; GFX90A-LABEL: global_atomic_fadd_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_atomic_fadd_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %data) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat: ; GFX90A: ; %bb.0: ; %main_body @@ -1435,7 +1391,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB49_2 +; GFX90A-NEXT: s_cbranch_execz .LBB47_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1446,7 +1402,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB49_2: +; GFX90A-NEXT: .LBB47_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: @@ -1457,7 +1413,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB49_2 +; GFX940-NEXT: s_cbranch_execz .LBB47_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1469,7 +1425,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB49_2: +; GFX940-NEXT: .LBB47_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1648,48 +1604,6 @@ main_body: ret double %ret } -define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { -; GFX90A-LABEL: flat_atomic_fadd_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: flat_atomic_fadd_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double 
@llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) - ret void -} - -define double @flat_atomic_fadd_f64_rtn(ptr %ptr, double %data) { -; GFX90A-LABEL: flat_atomic_fadd_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fadd_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) - ret double %ret -} - define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body @@ -1812,7 +1726,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB63_2 +; GFX90A-NEXT: s_cbranch_execz .LBB59_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1822,7 +1736,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB63_2: +; GFX90A-NEXT: .LBB59_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: @@ -1833,7 +1747,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB63_2 +; GFX940-NEXT: s_cbranch_execz .LBB59_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1843,7 +1757,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB63_2: +; GFX940-NEXT: .LBB59_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1859,7 +1773,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB64_2 +; GFX90A-NEXT: s_cbranch_execz .LBB60_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1869,7 +1783,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB64_2: +; GFX90A-NEXT: .LBB60_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: @@ -1880,7 +1794,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB64_2 +; GFX940-NEXT: s_cbranch_execz .LBB60_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1890,7 +1804,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB64_2: +; GFX940-NEXT: .LBB60_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1906,7 +1820,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB65_2 +; GFX90A-NEXT: s_cbranch_execz .LBB61_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1916,7 +1830,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB65_2: +; GFX90A-NEXT: .LBB61_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: @@ -1927,7 +1841,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB65_2 +; GFX940-NEXT: s_cbranch_execz .LBB61_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1937,7 +1851,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB65_2: +; GFX940-NEXT: .LBB61_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll index 2bbb8e86f936a..fcd8c6fb0fe7c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll @@ -1,223 +1,342 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs 
-amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx908 -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s -define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic - ; GFX908_GFX11: bb.1 (%ir-block.0): - ; GFX908_GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX908_GFX11-NEXT: {{ $}} - ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic - ; GFX908_GFX11: bb.1 (%ir-block.0): - ; GFX908_GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX908_GFX11-NEXT: {{ $}} - ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: 
global_atomic_fadd_f32_saddr_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic - ; GFX908_GFX11: bb.1 (%ir-block.0): - ; GFX908_GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX908_GFX11-NEXT: {{ $}} - ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11-NEXT: S_ENDPGM 0 +define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) { + ; GFX908-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw + ; GFX908: bb.1 (%ir-block.0): + ; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX908-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic - ; GFX908_GFX11: bb.1 
(%ir-block.0): - ; GFX908_GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX908_GFX11-NEXT: {{ $}} - ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) { - ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw - ; GFX908_GFX11: bb.1 (%ir-block.0): - ; GFX908_GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX908_GFX11-NEXT: {{ $}} - ; GFX908_GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX908_GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + 
; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE - ; GFX90A_GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A_GFX940-NEXT: S_BRANCH %bb.2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.5): - ; GFX90A_GFX940-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1 - ; GFX90A_GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 - ; GFX90A_GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; 
GFX90A_GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; GFX90A_GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec - ; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 - ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec - ; GFX90A_GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63 - ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]] - ; GFX90A_GFX940-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX90A_GFX940-NEXT: 
[[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec - ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A_GFX940-NEXT: S_BRANCH %bb.3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.31): - ; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000) - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.4.Flow: - ; GFX90A_GFX940-NEXT: successors: %bb.5(0x80000000) - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.33): - ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; GFX908-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw + ; GFX908: bb.1 (%ir-block.0): + ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX908-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX908-NEXT: S_BRANCH %bb.2 + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: bb.2 (%ir-block.5): + ; GFX908-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX908-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1 + ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX908-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX908-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec + ; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX908-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec + ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX908-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 
= V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX908-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX908-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX908-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX908-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX908-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX908-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX908-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]] + ; GFX908-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] + ; GFX908-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec + ; GFX908-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec + ; GFX908-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX908-NEXT: S_BRANCH %bb.3 + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: bb.3 (%ir-block.31): + ; GFX908-NEXT: successors: %bb.4(0x80000000) + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX908-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: bb.4.Flow: + ; GFX908-NEXT: successors: %bb.5(0x80000000) + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: bb.5 (%ir-block.33): + ; GFX908-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit 
$exec + ; GFX908-NEXT: S_ENDPGM 0 + ; + ; GFX90A-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GFX90A-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX90A-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: S_BRANCH %bb.2 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.2 (%ir-block.5): + ; GFX90A-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1 + ; GFX90A-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX90A-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec + ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec + ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX90A-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX90A-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX90A-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX90A-NEXT: 
[[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX90A-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX90A-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX90A-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]] + ; GFX90A-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] + ; GFX90A-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec + ; GFX90A-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec + ; GFX90A-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: S_BRANCH %bb.3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.3 (%ir-block.31): + ; GFX90A-NEXT: successors: %bb.4(0x80000000) + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.4.Flow: + ; GFX90A-NEXT: successors: %bb.5(0x80000000) + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: bb.5 (%ir-block.33): + ; GFX90A-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE + ; GFX940-NEXT: [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940-NEXT: S_BRANCH %bb.2 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: bb.2 (%ir-block.5): + ; GFX940-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec + ; GFX940-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0 + ; GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY 
[[COPY3]].sub1 + ; GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; GFX940-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX940-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; GFX940-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec + ; GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec + ; GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec + ; GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec + ; GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec + ; GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec + ; GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec + ; GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]] + ; GFX940-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]] + ; GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM 
[[COPY18]], implicit $exec + ; GFX940-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec + ; GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940-NEXT: S_BRANCH %bb.3 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: bb.3 (%ir-block.31): + ; GFX940-NEXT: successors: %bb.4(0x80000000) + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: bb.4.Flow: + ; GFX940-NEXT: successors: %bb.5(0x80000000) + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: bb.5 (%ir-block.33): + ; GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX940-NEXT: S_ENDPGM 0 + ; + ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_atomicrmw + ; GFX11: bb.1 (%ir-block.0): + ; GFX11-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_32_xm0_xexec = SI_PS_LIVE + ; GFX11-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX11-NEXT: S_BRANCH %bb.2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.2 (%ir-block.5): + ; GFX11-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $exec_lo + ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec + ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec + ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 353, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY8]], [[V_ADD_F32_e64_]], 354, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; 
GFX11-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY9]], [[V_ADD_F32_e64_1]], 356, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY10]], [[V_ADD_F32_e64_2]], 360, 15, 15, 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_2]], 0, [[S_MOV_B32_2]], [[V_ADD_F32_e64_3]], 0, implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_ADD_F32_e64_4]], implicit $exec + ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY11]], implicit $exec + ; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX11-NEXT: S_BRANCH %bb.3 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.3 (%ir-block.24): + ; GFX11-NEXT: successors: %bb.4(0x80000000) + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR [[V_MOV_B32_e32_]], [[STRICT_WWM]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.4.Flow: + ; GFX11-NEXT: successors: %bb.5(0x80000000) + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: bb.5 (%ir-block.26): + ; GFX11-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) -declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll index 212bc84d1349d..a8f9ed2e6fba9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll @@ -1,137 +1,33 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX90A %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX940 %s -; RUN: llc -global-isel 
-mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s - -define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; - ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic - ; GFX11: bb.1 (%ir-block.0): - ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret float %ret -} - -define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; - ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic - ; GFX11: bb.1 (%ir-block.0): - ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: 
[[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data) - ret float %ret -} - -define amdgpu_ps float @global_atomic_fadd_f32_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; - ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic - ; GFX11: bb.1 (%ir-block.0): - ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret float %ret -} - -define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: 
(volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; - ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic - ; GFX11: bb.1 (%ir-block.0): - ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data) - ret float %ret -} +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps float @global_atomic_fadd_f32_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; GFX90A-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: $vgpr0 = 
COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GFX90A-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX940-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw ; GFX11: bb.1 (%ir-block.0): @@ -426,7 +322,4 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ret float %ret } -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) -declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll index 8007cc1f54a79..80fa24471a459 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll @@ -2,170 +2,6 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX90A %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX940 %s -define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; 
GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - -define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; 
GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - -define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 
= COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - -define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %ret = call double 
@llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw ; GFX90A: bb.1 (%ir-block.0): @@ -319,7 +155,4 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac ret double %ret } -declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1), double) -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1), double) - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll index ace66e6a234aa..db508b5aea8c5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll @@ -3,115 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic - ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX908-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic - ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908-NEXT: 
GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX908-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic - ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX908-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic - ; GFX908: bb.1 (%ir-block.0): - ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX908-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR 
[[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX908-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn(ptr addrspace(1) %ptr, <2 x half> %data) #0 { +define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -137,7 +29,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn(ptr addrspace(1) %ptr, <2 ret void } -define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) #0 { +define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) { ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn ; GFX908: bb.1 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -165,6 +57,4 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn(ptr addrspace(1) in ret void } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" } - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll index e895d6f18c979..f11196be89bb1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-rtn.ll @@ -2,73 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile 
dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret <2 x half> %ret -} - -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret <2 x half> %ret -} - -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret <2 x half> %ret -} - -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.1 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: 
[[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret <2 x half> %ret -} - -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) #0 { +define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -84,7 +18,7 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr, ret <2 x half> %ret } -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) #0 { +define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) { ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -101,6 +35,4 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn(ptr addrspace(1) ret <2 x half> %ret } -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" } - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll deleted file mode 100644 index 9a66fe10ccdf3..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s -; RUN: not --crash llc -global-isel < %s -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 - -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.f32.p1.v2f16(ptr addrspace(1) nocapture, <2 x half>) - -; GFX908: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.global.atomic.fadd) - -; GFX90A-LABEL: {{^}}global_atomic_fadd_f32_rtn: -; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc -define float @global_atomic_fadd_f32_rtn(ptr addrspace(1) %ptr, float %data) { - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret float %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_fadd_v2f16_rtn: -; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc -define <2 x half> @global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.f32.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret <2 x half> %ret -} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll deleted file mode 100644 index de91c45000f13..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll +++ /dev/null @@ -1,126 +0,0 @@ -; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s - -define void @global_atomic_fadd_f32(ptr addrspace(1) %ptr, float %data) { -; GFX908-LABEL: global_atomic_fadd_f32: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_atomic_fadd_f32: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -define void @global_atomic_fadd_f32_off_2048(ptr addrspace(1) %ptr, float %data) { -; GFX908-LABEL: global_atomic_fadd_f32_off_2048: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2048 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_atomic_fadd_f32_off_2048: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2048 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 512 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %gep, float %data) - ret void -} - -define void @global_atomic_fadd_f32_off_neg2047(ptr addrspace(1) %ptr, float %data) { -; GFX908-LABEL: global_atomic_fadd_f32_off_neg2047: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2044 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_atomic_fadd_f32_off_neg2047: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2044 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -511 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %gep, float %data) - ret void -} - -define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(ptr addrspace(1) %ptr, float %data) { -; GFX908-LABEL: global_atomic_fadd_f32_off_ss: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX908-NEXT: v_mov_b32_e32 v1, 0 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v0, s2 -; GFX908-NEXT: global_atomic_add_f32 v1, v0, s[0:1] offset:2048 -; GFX908-NEXT: s_endpgm -; -; GFX90A-LABEL: global_atomic_fadd_f32_off_ss: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dword s2, s[6:7], 0x8 -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s2 -; GFX90A-NEXT: global_atomic_add_f32 v1, v0, s[0:1] offset:2048 -; GFX90A-NEXT: s_endpgm - %gep = getelementptr float, ptr addrspace(1) %ptr, i64 512 - %ret = call float 
@llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %gep, float %data) - ret void -} - -define void @global_atomic_fadd_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) { -; GFX908-LABEL: global_atomic_fadd_v2f16: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_atomic_fadd_v2f16: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret void -} - -define void @global_atomic_fadd_v2f16_off_neg2047(ptr addrspace(1) %ptr, <2 x half> %data) { -; GFX908-LABEL: global_atomic_fadd_v2f16_off_neg2047: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2044 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-LABEL: global_atomic_fadd_v2f16_off_neg2047: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2044 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -511 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %gep, <2 x half> %data) - ret void -} - -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) #0 -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) nocapture, <2 x half>) #0 - -attributes #0 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll index 21e2a85ab18d9..7587b81e9936d 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -1,26 +1,27 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=OPT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefix=GCN %s ; Make sure we match the addressing mode offset of global.atomic.fadd intrinsics across blocks.
define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) { -; OPT-LABEL: @test_sink_small_offset_global_atomic_fadd_f32( -; OPT-NEXT: entry: -; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]] +; OPT-LABEL: define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32( +; OPT-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[ENTRY:.*]]: +; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR2:[0-9]+]] ; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0 -; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]] -; OPT: if: -; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[IN:%.*]], i32 7 -; OPT-NEXT: [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) [[IN_GEP]], float 2.000000e+00) +; OPT-NEXT: br i1 [[CMP]], label %[[ENDIF:.*]], label %[[IF:.*]] +; OPT: [[IF]]: +; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[IN]], i32 7 +; OPT-NEXT: [[FADD2:%.*]] = atomicrmw fadd ptr addrspace(1) [[IN_GEP]], float 2.000000e+00 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]], !amdgpu.ignore.denormal.mode [[META0]] ; OPT-NEXT: [[VAL:%.*]] = load volatile float, ptr addrspace(1) undef, align 4 -; OPT-NEXT: br label [[ENDIF]] -; OPT: endif: -; OPT-NEXT: [[X:%.*]] = phi float [ [[VAL]], [[IF]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[OUT:%.*]], i32 999999 +; OPT-NEXT: br label %[[ENDIF]] +; OPT: [[ENDIF]]: +; OPT-NEXT: [[X:%.*]] = phi float [ [[VAL]], %[[IF]] ], [ 0.000000e+00, %[[ENTRY]] ] +; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[OUT]], i32 999999 ; OPT-NEXT: store float [[X]], ptr addrspace(1) [[OUT_GEP]], align 4 -; OPT-NEXT: br label [[DONE:%.*]] -; OPT: done: +; OPT-NEXT: br label %[[DONE:.*]] +; OPT: [[DONE]]: ; OPT-NEXT: ret void ; ; GCN-LABEL: test_sink_small_offset_global_atomic_fadd_f32: @@ -36,6 +37,8 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add ; GCN-NEXT: v_mov_b32_e32 v1, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] offset:28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: global_load_dword v0, v[0:1], off glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: .LBB0_2: ; %endif @@ -51,7 +54,7 @@ entry: if: %in.gep = getelementptr float, ptr addrspace(1) %in, i32 7 - %fadd2 = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %in.gep, float 2.0) + %fadd2 = atomicrmw fadd ptr addrspace(1) %in.gep, float 2.000000e+00 syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 %val = load volatile float, ptr addrspace(1) undef br label %endif @@ -71,3 +74,8 @@ declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapt attributes #0 = { argmemonly nounwind } attributes #1 = { nounwind readnone willreturn } attributes #2 = { argmemonly nounwind willreturn } + +!0 = !{} +;. +; OPT: [[META0]] = !{} +;. 
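; A minimal sketch of the rewrite encoded by the updated
; cgp-addressing-modes-gfx908.ll test above: a call to the removed
; llvm.amdgcn.global.atomic.fadd intrinsic is expressed as a plain
; `atomicrmw fadd` carrying the AMDGPU memory metadata. This is illustrative
; only; the function and value names below are hypothetical, while the
; operand, syncscope, ordering, and metadata forms are copied from the test.
;
; Before (intrinsic removed; AutoUpgrade now rewrites such calls):
;   %old = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %v)
;
; After:
define float @fadd_migration_sketch(ptr addrspace(1) %p, float %v) {
  %res = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
  ret float %res
}
!0 = !{}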
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll index 3e9e5056f87d6..ef180cef7ed2a 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll @@ -14,7 +14,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) ; GFX940-NEXT: S_ENDPGM 0 ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic @@ -26,7 +26,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) ; GFX11-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data) ret void @@ -42,7 +42,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -55,7 +55,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call float 
@llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data) diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll index 07293db896f44..e5dcf9ce309cd 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll @@ -21,7 +21,7 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_intrinsic(ptr %ptr, double %d ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr) + ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s64) on %ir.ptr) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data) ret void @@ -44,7 +44,7 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_intrinsic(ptr %ptr, double %da ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s64) on %ir.ptr) + ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s64) on %ir.ptr) ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll deleted file mode 100644 index 647c5b568b7ad..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll +++ /dev/null @@ -1,63 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX12 %s - -define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn_intrinsic(ptr %ptr, <2 x half> %data) { - ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_no_rtn_intrinsic - ; GFX940: bb.0 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: FLAT_ATOMIC_PK_ADD_F16 killed 
[[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) - ; GFX940-NEXT: S_ENDPGM 0 - ; - ; GFX12-LABEL: name: flat_atomic_fadd_v2f16_no_rtn_intrinsic - ; GFX12: bb.0 (%ir-block.0): - ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX12-NEXT: FLAT_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) - ; GFX12-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data) - ret void -} - -define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn_intrinsic(ptr %ptr, <2 x half> %data) { - ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_rtn_intrinsic - ; GFX940: bb.0 (%ir-block.0): - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) - ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]] - ; GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - ; - ; GFX12-LABEL: name: flat_atomic_fadd_v2f16_rtn_intrinsic - ; GFX12: bb.0 (%ir-block.0): - ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX12-NEXT: [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) - ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]] - ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data) - ret <2 x half> %ret -} - -declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr, <2 x half>) diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll index 05259b4f51310..e94e2ee9f37d8 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll @@ -1,113 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12-SDAG -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs | FileCheck %s 
-check-prefix=GFX12-GISEL +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 | FileCheck %s -check-prefix=GFX12-SDAG +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 | FileCheck %s -check-prefix=GFX12-GISEL declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>, <4 x i32>, i32, i32, i32, i32 immarg) declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32) declare <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32, i32, i32) -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) -declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - -define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { -; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 -; GFX12-GISEL-NEXT: s_endpgm - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - ret void -} - -define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { -; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_rtn: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_rtn: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - ret <2 x half> %ret -} - -define void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) { -; GFX12-SDAG-LABEL: global_atomic_pk_add_v2f16: -; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16: -; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; 
GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret void -} - -define <2 x half> @global_atomic_pk_add_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { -; GFX12-SDAG-LABEL: global_atomic_pk_add_v2f16_rtn: -; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16_rtn: -; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret <2 x half> %ret -} define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret_offset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2f16_noret_offset: diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index 5322a283d3de4..88a95937b9c90 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -1,35 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 | FileCheck %s -check-prefix=GFX940 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefix=GFX12 -declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) -declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1) declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) -define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX940-NEXT: s_endpgm -; -; GFX12-LABEL: flat_atomic_fadd_f32_noret: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX12-NEXT: s_endpgm 
- %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) - ret void -} - define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: @@ -85,29 +60,6 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0 ret void } - -define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_rtn: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) - ret float %ret -} - define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat: ; GFX940: ; %bb.0: @@ -137,51 +89,6 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ret float %ret } -define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 -; GFX940-NEXT: s_endpgm -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_noret: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 -; GFX12-NEXT: s_endpgm - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - ret void -} - -define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_rtn: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_rtn: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) - ret <2 x half> %ret -} - define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: @@ -282,202 +189,6 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ret <2 x i16> %ret } -define float 
@flat_atomic_fadd_f32_intrinsic_ret__posoffset(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 1023 - %result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) - ret float %result -} - -define float @flat_atomic_fadd_f32_intrinsic_ret__negoffset(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_ret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_ret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -256 - %result = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) - ret float %result -} - -define void @flat_atomic_fadd_f32_intrinsic_noret__posoffset(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:4092 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 1023 - %unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) - ret void -} - -define void @flat_atomic_fadd_f32_intrinsic_noret__negoffset(ptr %ptr, float %data) { -; GFX940-LABEL: flat_atomic_fadd_f32_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_f32_intrinsic_noret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-1024 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -256 - %unused = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %gep, float %data) - ret void -} - -define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__posoffset(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 1023 - %result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) - ret <2 x half> %result -} - -define <2 x half> @flat_atomic_fadd_v2f16_intrinsic_ret__negoffset(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_ret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -256 - %result = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) - ret <2 x half> %result -} - -define void @flat_atomic_fadd_v2f16_intrinsic_noret__posoffset(ptr %ptr, <2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__posoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 1023 - %unused = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) - ret void -} - -define void @flat_atomic_fadd_v2f16_intrinsic_noret__negoffset(ptr %ptr, 
<2 x half> %data) { -; GFX940-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffc00, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: flat_atomic_fadd_v2f16_intrinsic_noret__negoffset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-1024 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -256 - %unused = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %gep, <2 x half> %data) - ret void -} - attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" } !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 2615147b488d4..957c10ddf85e5 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX90A +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -amdgpu-atomic-optimizer-strategy=None | FileCheck %s -check-prefix=GFX940 declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) @@ -14,11 +14,8 @@ declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32 declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg) declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32 immarg) -declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr addrspace(3) %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1) @@ -1019,31 +1016,6 @@ main_body: ret void } -define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) { -; GFX90A-LABEL: global_atomic_fadd_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s7 -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: global_atomic_fadd_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) { ; GFX90A-LABEL: global_atomic_fmin_f64_noret: ; GFX90A: ; %bb.0: ; %main_body @@ -1097,47 +1069,28 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB39_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB39_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB39_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: .LBB39_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1147,45 +1100,26 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; 
GFX90A-NEXT: s_cbranch_execz .LBB40_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB40_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB40_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB40_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1195,47 +1129,28 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB41_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB41_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB41_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 
; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: .LBB41_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1245,70 +1160,32 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB42_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB42_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB42_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB42_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } -define double @global_atomic_fadd_f64_rtn(ptr addrspace(1) %ptr, double %data) { -; GFX90A-LABEL: global_atomic_fadd_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_atomic_fadd_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, 
double %data) - ret double %ret -} - define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %data) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat: ; GFX90A: ; %bb.0: ; %main_body @@ -1429,57 +1306,37 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[4:5], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB49_3 -; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: .LBB49_2: ; %atomicrmw.start +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_2 -; GFX90A-NEXT: .LBB49_3: +; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB49_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB49_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -1658,52 +1515,6 @@ main_body: ret double %ret } -define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) { -; GFX90A-LABEL: flat_atomic_fadd_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: 
v_mov_b32_e32 v0, s4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s5 -; GFX90A-NEXT: v_mov_b32_e32 v2, s6 -; GFX90A-NEXT: v_mov_b32_e32 v3, s7 -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: flat_atomic_fadd_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) - ret void -} - -define double @flat_atomic_fadd_f64_rtn(ptr %ptr, double %data) { -; GFX90A-LABEL: flat_atomic_fadd_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fadd_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) - ret double %ret -} - define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body @@ -1713,7 +1524,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 @@ -1724,7 +1535,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; @@ -1839,85 +1650,16 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, double %data) { ; GFX90A-LABEL: local_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB63_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c -; GFX90A-NEXT: s_load_dword s6, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s6 -; GFX90A-NEXT: ds_add_f64 v2, v[0:1] -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB63_2: -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: local_atomic_fadd_f64_noret: -; GFX940: ; %bb.0: ; 
%main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB63_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x2c -; GFX940-NEXT: s_load_dword s6, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mul_f64 v[0:1], s[4:5], v[0:1] -; GFX940-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NEXT: ds_add_f64 v2, v[0:1] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB63_2: -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) - ret void -} - -define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) { -; GFX90A-LABEL: local_atomic_fadd_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: local_atomic_fadd_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) - ret double %ret -} - -define amdgpu_kernel void @local_atomic_fadd_f64_noret_from_flat_intrinsic(ptr addrspace(3) %ptr, double %data) { -; GFX90A-LABEL: local_atomic_fadd_f64_noret_from_flat_intrinsic: -; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dword s4, s[2:3], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_endpgm ; -; GFX940-LABEL: local_atomic_fadd_f64_noret_from_flat_intrinsic: +; GFX940-LABEL: local_atomic_fadd_f64_noret: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_load_dword s4, s[2:3], 0x24 ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c @@ -1925,14 +1667,15 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_from_flat_intrinsic(ptr a ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: ds_add_f64 v2, v[0:1] +; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_endpgm main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr addrspace(3) %ptr, double %data) + %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) ret void } -define double @local_atomic_fadd_f64_rtn_from_flat_intrinsic(ptr addrspace(3) %ptr, double %data) { -; GFX90A-LABEL: local_atomic_fadd_f64_rtn_from_flat_intrinsic: +define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) { +; GFX90A-LABEL: local_atomic_fadd_f64_rtn: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 @@ -1941,7 +1684,7 @@ define double @local_atomic_fadd_f64_rtn_from_flat_intrinsic(ptr addrspace(3) 
%p ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: local_atomic_fadd_f64_rtn_from_flat_intrinsic: +; GFX940-LABEL: local_atomic_fadd_f64_rtn: ; GFX940: ; %bb.0: ; %main_body ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v3, v2 @@ -1950,49 +1693,30 @@ define double @local_atomic_fadd_f64_rtn_from_flat_intrinsic(ptr addrspace(3) %p ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr addrspace(3) %ptr, double %data) + %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) ret double %ret } define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB67_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB67_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB67_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB67_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -2002,42 +1726,23 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB68_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: 
v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB68_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB68_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB68_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -2047,42 +1752,23 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_mov_b64 s[0:1], exec -; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB69_2 -; GFX90A-NEXT: ; %bb.1: -; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB69_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_mov_b64 s[0:1], exec -; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 -; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB69_2 -; GFX940-NEXT: ; %bb.1: -; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 -; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 -; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: s_load_dword s0, s[2:3], 0x24 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB69_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst @@ -2157,92 +1843,6 @@ main_body: ret double %ret } -define double @flat_atomic_fadd_f64_intrinsic_rtn__posoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__posoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 511 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) - ret double %ret -} - -define double @flat_atomic_fadd_f64_intrinsic_rtn__negoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__negoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_rtn__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -511 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %gep, double %data) - ret double %ret -} - -define void @flat_atomic_fadd_f64_intrinsic_noret__posoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_noret__posoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 511 - %unused = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) - ret void -} - -define void @flat_atomic_fadd_f64_intrinsic_noret__negoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fadd_f64_intrinsic_noret__negoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fadd_f64_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -511 - %unused = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %gep, double %data) - ret void -} - define double @flat_atomic_fmin_f64_intrinsic_rtn__posoffset(ptr %ptr, double %data) #1 { ; GFX90A-LABEL: 
flat_atomic_fmin_f64_intrinsic_rtn__posoffset: ; GFX90A: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll index d70d45d44af0f..c8bbafbfa44d8 100644 --- a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll @@ -1,7 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s - -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr nocapture, double) #8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck %s define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: IllegalGEPConst: @@ -17,14 +15,16 @@ define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b ; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: s_addc_u32 s1, s5, s1 ; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] offset:-8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: s_endpgm entry: %i = add nsw i32 %a, -1 %i.2 = sext i32 %i to i64 %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2 %i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr - %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.4, double %c) #8 + %i.5 = atomicrmw fadd ptr %i.4, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } -attributes #8 = { argmemonly mustprogress nounwind willreturn "target-cpu"="gfx90a" } +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll index c1b333c692749..ad3f920eadc91 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll @@ -5,118 +5,6 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefixes=GFX908_GFX11_GFX12,GFX11_GFX12 %s -define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic - ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0): - ; GFX908_GFX11_GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX908_GFX11_GFX12-NEXT: {{ $}} - ; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11_GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; 
GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic - ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0): - ; GFX908_GFX11_GFX12-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX908_GFX11_GFX12-NEXT: {{ $}} - ; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11_GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic - ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0): - ; GFX908_GFX11_GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX908_GFX11_GFX12-NEXT: {{ $}} - ; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11_GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; 
GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic - ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0): - ; GFX908_GFX11_GFX12-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX908_GFX11_GFX12-NEXT: {{ $}} - ; GFX908_GFX11_GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX908_GFX11_GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX908_GFX11_GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX908_GFX11_GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX908_GFX11_GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908_GFX11_GFX12-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX908_GFX11_GFX12-NEXT: S_ENDPGM 0 - ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data) - ret void -} - define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) { ; GFX908_GFX11_GFX12-LABEL: name: global_atomic_fadd_f32_no_rtn_atomicrmw ; GFX908_GFX11_GFX12: bb.0 (%ir-block.0): @@ -324,7 +212,4 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ret void } -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) -declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll 
b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll index 1b955eae89159..3951e02d46a8f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -4,126 +4,6 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s -define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - ; - ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic - ; GFX11: bb.0 (%ir-block.0): - ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret float %ret -} - -define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = 
COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - ; - ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_intrinsic - ; GFX11: bb.0 (%ir-block.0): - ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data) - ret float %ret -} - -define amdgpu_ps float @global_atomic_fadd_f32_rtn_flat_intrinsic(ptr addrspace(1) %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - ; - ; GFX11-LABEL: name: global_atomic_fadd_f32_rtn_flat_intrinsic - ; GFX11: bb.0 (%ir-block.0): - ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_RTN]] - ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret float %ret -} - -define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, float %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - 
; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - ; - ; GFX11-LABEL: name: global_atomic_fadd_f32_saddr_rtn_flat_intrinsic - ; GFX11: bb.0 (%ir-block.0): - ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX11-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] - ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) inreg %ptr, float %data) - ret float %ret -} - define amdgpu_ps float @global_atomic_fadd_f32_rtn_atomicrmw(ptr addrspace(1) %ptr, float %data) { ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_atomicrmw ; GFX90A_GFX940: bb.0 (%ir-block.0): diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll index 50bd5bf57dd9f..ba94a53dff03b 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll @@ -2,174 +2,6 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX90A %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=GFX90A_GFX940,GFX940 %s -define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed 
[[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_ps double @global_atomic_fadd_f64_rtn_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - -define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, 
$vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 - %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - -define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_ps double @global_atomic_fadd_f64_rtn_flat_intrinsic(ptr addrspace(1) %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], 
%subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - -define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_no_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, double %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_saddr_rtn_flat_intrinsic - ; GFX90A_GFX940: bb.0 (%ir-block.0): - ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 - ; GFX90A_GFX940-NEXT: {{ $}} - ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 - ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.ptr, addrspace 1) - ; 
GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0 - ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1 - ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY5]] - ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY6]] - ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 - %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1) %ptr, double %data) { ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f64_no_rtn_atomicrmw ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -299,7 +131,4 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac ret double %ret } -declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1), double) -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1), double) - !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll index 60345c0b44339..02e425e6d10a8 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll @@ -1,13 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic + +define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { + ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn ; GFX908: bb.0 (%ir-block.0): ; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX908-NEXT: {{ $}} @@ -16,10 +17,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1 ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on 
%ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic + ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX90A_GFX940-NEXT: {{ $}} @@ -28,14 +29,14 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1 ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic +define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) { + ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn ; GFX908: bb.0 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; GFX908-NEXT: {{ $}} @@ -44,10 +45,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrs ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic + ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; GFX90A_GFX940-NEXT: {{ $}} @@ -56,14 +57,14 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrs ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store 
syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic +define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat(ptr addrspace(1) %ptr, <2 x half> %data) { + ; GFX908-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat ; GFX908: bb.0 (%ir-block.0): ; GFX908-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX908-NEXT: {{ $}} @@ -72,10 +73,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrsp ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic + ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX90A_GFX940-NEXT: {{ $}} @@ -84,14 +85,14 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrsp ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic +define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat(ptr addrspace(1) inreg %ptr, <2 x half> %data) { + ; GFX908-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat ; GFX908: bb.0 (%ir-block.0): ; GFX908-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; GFX908-NEXT: {{ $}} @@ -100,10 +101,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_e32 0, implicit $exec - ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 ; - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic + ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; GFX90A_GFX940-NEXT: {{ $}} @@ -112,11 +113,10 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>) -declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>) +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll index c8caf1fe365b1..794a52b6900ea 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-rtn.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_intrinsic +define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { + ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): 
; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX90A_GFX940-NEXT: {{ $}} @@ -14,15 +14,15 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_intrinsic(ptr addrspac ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %ret } -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_intrinsic +define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn(ptr addrspace(1) inreg %ptr, <2 x half> %data) { + ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; GFX90A_GFX940-NEXT: {{ $}} @@ -31,15 +31,15 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_intrinsic(ptr ad ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %ret } -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat_intrinsic(ptr addrspace(1) %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_flat_intrinsic +define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat(ptr addrspace(1) %ptr, <2 x half> %data) { + ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_rtn_flat ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; 
GFX90A_GFX940-NEXT: {{ $}} @@ -48,15 +48,15 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_rtn_flat_intrinsic(ptr add ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %ret } -define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic(ptr addrspace(1) inreg %ptr, <2 x half> %data) { - ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic +define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat(ptr addrspace(1) inreg %ptr, <2 x half> %data) { + ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_rtn_flat ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 ; GFX90A_GFX940-NEXT: {{ $}} @@ -65,12 +65,11 @@ define amdgpu_ps <2 x half> @global_atomic_fadd_v2f16_saddr_rtn_flat_intrinsic(p ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) + ; GFX90A_GFX940-NEXT: [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, addrspace 1) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_PK_ADD_F16_SADDR_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %ret } -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>) -declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>) +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll index 0147084a6996f..84065c4675ab1 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll @@ -1,5 
+1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck --check-prefix=GCN %s ; Test using saddr addressing mode of global_* flat atomic instructions. @@ -11,10 +11,12 @@ define amdgpu_ps void @global_fadd_saddr_f32_nortn(ptr addrspace(1) inreg %sbase ; GCN-LABEL: global_fadd_saddr_f32_nortn: ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1(ptr addrspace(1) %gep0, float %data) + %ret = atomicrmw fadd ptr addrspace(1) %gep0, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } @@ -22,11 +24,13 @@ define amdgpu_ps void @global_fadd_saddr_f32_nortn_neg128(ptr addrspace(1) inreg ; GCN-LABEL: global_fadd_saddr_f32_nortn_neg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1(ptr addrspace(1) %gep1, float %data) + %ret = atomicrmw fadd ptr addrspace(1) %gep1, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } @@ -34,10 +38,12 @@ define amdgpu_ps void @global_fadd_saddr_v2f16_nortn(ptr addrspace(1) inreg %sba ; GCN-LABEL: global_fadd_saddr_v2f16_nortn: ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_pk_add_f16 v0, v1, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1(ptr addrspace(1) %gep0, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %gep0, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } @@ -45,15 +51,14 @@ define amdgpu_ps void @global_fadd_saddr_v2f16_nortn_neg128(ptr addrspace(1) inr ; GCN-LABEL: global_fadd_saddr_v2f16_nortn_neg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_pk_add_f16 v0, v1, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_wbinvl1 ; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1(ptr addrspace(1) %gep1, <2 x half> %data) + %ret = atomicrmw fadd ptr addrspace(1) %gep1, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1(ptr addrspace(1) nocapture, float) #0 -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1(ptr addrspace(1) nocapture, <2 x half>) #0 - -attributes #0 = { argmemonly nounwind willreturn } +!0 = !{} diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.gfx90a.ll deleted file mode 100644 index af84105747189..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.gfx90a.ll +++ /dev/null @@ -1,56 +0,0 @@ -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A - -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>) - -; GFX90A-LABEL: {{^}}global_atomic_add_f32: -; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc -define amdgpu_ps float @global_atomic_add_f32(ptr addrspace(1) %ptr, float %data) { -main_body: - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret float %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_add_f32_off4: -; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off offset:4 glc -define amdgpu_ps float @global_atomic_add_f32_off4(ptr addrspace(1) %ptr, float %data) { -main_body: - %p = getelementptr float, ptr addrspace(1) %ptr, i64 1 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data) - ret float %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_add_f32_offneg4: -; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off offset:-4 glc -define amdgpu_ps float @global_atomic_add_f32_offneg4(ptr addrspace(1) %ptr, float %data) { -main_body: - %p = getelementptr float, ptr addrspace(1) %ptr, i64 -1 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data) - ret float %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16: -; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc -define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret <2 x half> %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16_off4: -; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:4 glc -define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_off4(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 1 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data) - ret <2 x half> %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16_offneg4: -; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-4 glc -define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_offneg4(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -1 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data) - ret <2 x half> %ret -} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.ll deleted file mode 100644 index 0c3ce3308dd8f..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.fadd.ll +++ /dev/null @@ -1,77 +0,0 @@ -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP | FileCheck %s -check-prefix=GCN -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP | FileCheck %s -check-prefix=GCN - -declare float 
@llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>) -declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr, float) - -; GCN-LABEL: {{^}}global_atomic_add_f32: -; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} -define amdgpu_kernel void @global_atomic_add_f32(ptr addrspace(1) %ptr, float %data) { -main_body: - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_add_f32_off4: -; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 -define amdgpu_kernel void @global_atomic_add_f32_off4(ptr addrspace(1) %ptr, float %data) { -main_body: - %p = getelementptr float, ptr addrspace(1) %ptr, i64 1 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_add_f32_offneg4: -; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4 -define amdgpu_kernel void @global_atomic_add_f32_offneg4(ptr addrspace(1) %ptr, float %data) { -main_body: - %p = getelementptr float, ptr addrspace(1) %ptr, i64 -1 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16: -; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_off4: -; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 -define amdgpu_kernel void @global_atomic_pk_add_v2f16_off4(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 1 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_offneg4: -; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4{{$}} -define amdgpu_kernel void @global_atomic_pk_add_v2f16_offneg4(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -1 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data) - ret void -} - -; Make sure this artificially selects with an incorrect subtarget, but -; the feature set. 
-; GCN-LABEL: {{^}}global_atomic_fadd_f32_wrong_subtarget: -; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @global_atomic_fadd_f32_wrong_subtarget(ptr addrspace(1) %ptr, float %data) #0 { - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -; GCN-LABEL: {{^}}flat_atomic_fadd_f32_wrong_subtarget: -; GCN: flat_atomic_add_f32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} -define amdgpu_kernel void @flat_atomic_fadd_f32_wrong_subtarget(ptr %ptr, float %data) #1 { - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) - ret void -} - -attributes #0 = { "target-cpu"="gfx803" "target-features"="+atomic-fadd-no-rtn-insts"} -attributes #1 = { "target-cpu"="gfx803" "target-features"="+flat-atomic-fadd-f32-inst"} diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll index 39541537b3647..8ea83da78f889 100644 --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll @@ -40,12 +40,12 @@ define void @shl_base_global_ptr_global_atomic_fadd(ptr addrspace(1) %out, ptr a %cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64 %shl = shl i64 %cast, 2 %castback = inttoptr i64 %shl to ptr addrspace(1) - call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %castback, float 100.0) + %unused = atomicrmw fadd ptr addrspace(1) %castback, float 100.0 syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 store volatile i64 %cast, ptr addrspace(1) %extra.use, align 4 ret void } -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) #1 - attributes #0 = { nounwind } attributes #1 = { argmemonly nounwind willreturn } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-atomics.ll b/llvm/test/CodeGen/AMDGPU/unsupported-atomics.ll deleted file mode 100644 index 4b84f9175307f..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/unsupported-atomics.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GFX906 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s -; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefixes=GFX1030 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100 %s - -; GFX906: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.global.atomic.fadd - -; GFX908-LABEL: fadd_test: -; GFX908: global_atomic_add_f32 - -; GFX90A-LABEL: fadd_test: -; GFX90A: global_atomic_add_f32 - -; GFX940-LABEL: fadd_test: -; GFX940: global_atomic_add_f32 - -; GFX1030: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.global.atomic.fadd - -; GFX1100-LABEL: fadd_test: -; GFX1100: global_atomic_add_f32 - -define fastcc void @fadd_test(ptr addrspace(1) nocapture noundef %0, float noundef %1) unnamed_addr { - %3 = tail call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) noundef %0, float noundef %1) - ret void -} -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) diff 
--git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll index b05caa440ddcb..3d529a2c6ef69 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll @@ -1,20 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s | FileCheck %s -declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) declare float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %ptr, float %data) declare float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %ptr, float %data) define amdgpu_kernel void @flat_atomic_fadd_f32_p1(ptr addrspace(1) %ptr, float %data) { ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p1 ; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]], float [[DATA:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr addrspace(1) [[PTR]], float [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p1.f32(ptr addrspace(1) [[PTR]], float [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p1.f32(ptr addrspace(1) [[PTR]], float [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(1) %ptr to ptr - %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data) %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) ret void @@ -24,13 +21,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p2(ptr addrspace(2) %ptr, float ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p2 ; CHECK-SAME: (ptr addrspace(2) [[PTR:%.*]], float [[DATA:%.*]]) { ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(2) [[PTR]] to ptr -; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(2) %ptr to ptr - %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data) %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) ret void @@ -40,13 +35,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p3(ptr addrspace(3) %ptr, float ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p3 ; CHECK-SAME: (ptr addrspace(3) [[PTR:%.*]], float [[DATA:%.*]]) { ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr -; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(3) %ptr to ptr - %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data) %max = call float 
@llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) ret void @@ -55,13 +48,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p3(ptr addrspace(3) %ptr, float define amdgpu_kernel void @flat_atomic_fadd_f32_p4(ptr addrspace(4) %ptr, float %data) { ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p4 ; CHECK-SAME: (ptr addrspace(4) [[PTR:%.*]], float [[DATA:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p4.f32(ptr addrspace(4) [[PTR]], float [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p4.f32(ptr addrspace(4) [[PTR]], float [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p4.f32(ptr addrspace(4) [[PTR]], float [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(4) %ptr to ptr - %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data) %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) ret void @@ -71,13 +62,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p5(ptr addrspace(5) %ptr, float ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p5 ; CHECK-SAME: (ptr addrspace(5) [[PTR:%.*]], float [[DATA:%.*]]) { ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr -; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(5) %ptr to ptr - %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data) %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) ret void @@ -86,13 +75,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p5(ptr addrspace(5) %ptr, float define amdgpu_kernel void @flat_atomic_fadd_f32_p6(ptr addrspace(6) %ptr, float %data) { ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p6 ; CHECK-SAME: (ptr addrspace(6) [[PTR:%.*]], float [[DATA:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p6.f32(ptr addrspace(6) [[PTR]], float [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p6.f32(ptr addrspace(6) [[PTR]], float [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p6.f32(ptr addrspace(6) [[PTR]], float [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(6) %ptr to ptr - %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data) %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) ret void @@ -102,13 +89,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p7(ptr addrspace(7) %ptr, float ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p7 ; CHECK-SAME: (ptr addrspace(7) [[PTR:%.*]], float [[DATA:%.*]]) { ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(7) [[PTR]] to ptr -; CHECK-NEXT: [[ADD:%.*]] = call float 
@llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr [[CAST]], float [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(7) %ptr to ptr - %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data) %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) ret void @@ -117,32 +102,27 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_p7(ptr addrspace(7) %ptr, float define amdgpu_kernel void @flat_atomic_fadd_f32_p99(ptr addrspace(99) %ptr, float %data) { ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p99 ; CHECK-SAME: (ptr addrspace(99) [[PTR:%.*]], float [[DATA:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p99.f32(ptr addrspace(99) [[PTR]], float [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p99.f32(ptr addrspace(99) [[PTR]], float [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p99.f32(ptr addrspace(99) [[PTR]], float [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(99) %ptr to ptr - %add = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %cast, float %data) %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) ret void } -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) define amdgpu_kernel void @flat_atomic_fadd_f64_p1(ptr addrspace(1) %ptr, double %data) { ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p1 ; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]], double [[DATA:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr addrspace(1) [[PTR]], double [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p1.f64(ptr addrspace(1) [[PTR]], double [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p1.f64(ptr addrspace(1) [[PTR]], double [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(1) %ptr to ptr - %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data) %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) ret void @@ -152,13 +132,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p2(ptr addrspace(2) %ptr, double ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p2 ; CHECK-SAME: (ptr addrspace(2) [[PTR:%.*]], double [[DATA:%.*]]) { ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(2) [[PTR]] to ptr -; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr 
addrspace(2) %ptr to ptr - %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data) %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) ret void @@ -168,13 +146,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p3(ptr addrspace(3) %ptr, double ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p3 ; CHECK-SAME: (ptr addrspace(3) [[PTR:%.*]], double [[DATA:%.*]]) { ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr -; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(3) %ptr to ptr - %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data) %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) ret void @@ -183,13 +159,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p3(ptr addrspace(3) %ptr, double define amdgpu_kernel void @flat_atomic_fadd_f64_p4(ptr addrspace(4) %ptr, double %data) { ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p4 ; CHECK-SAME: (ptr addrspace(4) [[PTR:%.*]], double [[DATA:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p4.f64(ptr addrspace(4) [[PTR]], double [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p4.f64(ptr addrspace(4) [[PTR]], double [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p4.f64(ptr addrspace(4) [[PTR]], double [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(4) %ptr to ptr - %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data) %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) ret void @@ -199,13 +173,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p5(ptr addrspace(5) %ptr, double ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p5 ; CHECK-SAME: (ptr addrspace(5) [[PTR:%.*]], double [[DATA:%.*]]) { ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr -; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(5) %ptr to ptr - %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data) %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) ret void @@ -214,13 +186,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p5(ptr addrspace(5) %ptr, double define amdgpu_kernel void @flat_atomic_fadd_f64_p6(ptr addrspace(6) %ptr, double %data) { ; CHECK-LABEL: define amdgpu_kernel void 
@flat_atomic_fadd_f64_p6 ; CHECK-SAME: (ptr addrspace(6) [[PTR:%.*]], double [[DATA:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p6.f64(ptr addrspace(6) [[PTR]], double [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p6.f64(ptr addrspace(6) [[PTR]], double [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p6.f64(ptr addrspace(6) [[PTR]], double [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(6) %ptr to ptr - %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data) %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) ret void @@ -230,13 +200,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p7(ptr addrspace(7) %ptr, double ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p7 ; CHECK-SAME: (ptr addrspace(7) [[PTR:%.*]], double [[DATA:%.*]]) { ; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(7) [[PTR]] to ptr -; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr [[CAST]], double [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(7) %ptr to ptr - %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data) %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) ret void @@ -245,13 +213,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_p7(ptr addrspace(7) %ptr, double define amdgpu_kernel void @flat_atomic_fadd_f64_p99(ptr addrspace(99) %ptr, double %data) { ; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p99 ; CHECK-SAME: (ptr addrspace(99) [[PTR:%.*]], double [[DATA:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.amdgcn.flat.atomic.fadd.f64.p99.f64(ptr addrspace(99) [[PTR]], double [[DATA]]) ; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p99.f64(ptr addrspace(99) [[PTR]], double [[DATA]]) ; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p99.f64(ptr addrspace(99) [[PTR]], double [[DATA]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(99) %ptr to ptr - %add = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %cast, double %data) %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) ret void diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index d9c3c4b17090b..57e6fdb35113e 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -1,9 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s -declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr nocapture, double) #8 -declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr nocapture, double) #8 -declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr 
nocapture, double) #8 +declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr nocapture, double) #0 +declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr nocapture, double) #0 define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-LABEL: InferNothing: @@ -21,37 +20,49 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-NEXT: v_mov_b32_e32 v1, s7 ; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: s_endpgm entry: %i = add nsw i32 %a, -1 %i.2 = sext i32 %i to i64 %i.3 = getelementptr inbounds double, ptr %b, i64 %i.2 - %i.4 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.3, double %c) #8 + %i.4 = atomicrmw fadd ptr %i.3, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } - define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: InferFadd: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 +; CHECK-NEXT: s_mov_b64 s[0:1], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB1_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[2:3], 0x24 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CHECK-NEXT: s_add_u32 s0, s4, s0 -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 -; CHECK-NEXT: s_addc_u32 s1, s5, s1 -; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] offset:-8 +; CHECK-NEXT: s_ashr_i32 s9, s8, 31 +; CHECK-NEXT: s_lshl_b64 s[2:3], s[8:9], 3 +; CHECK-NEXT: s_add_u32 s2, s4, s2 +; CHECK-NEXT: s_addc_u32 s3, s5, s3 +; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; CHECK-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1] +; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] offset:-8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_wbinvl1_vol +; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: s_endpgm entry: %i = add nsw i32 %a, -1 %i.2 = sext i32 %i to i64 %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2 %i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr - %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.4, double %c) #8 + %0 = atomicrmw fadd ptr %i.4, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } @@ -75,7 +86,7 @@ entry: %i.2 = sext i32 %i to i64 %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2 %i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr - %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %i.4, double %c) #8 + %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %i.4, double %c) #1 ret void } @@ -99,27 +110,43 @@ entry: %i.2 = sext i32 %i to i64 %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2 %i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr - %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %i.4, double %c) #8 + %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %i.4, double %c) #1 ret void } define protected 
amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, double %c, ptr %d) { ; CHECK-LABEL: InferMixed: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 ; CHECK-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x3c ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_mov_b64 s[0:1], exec ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 ; CHECK-NEXT: v_mov_b32_e32 v0, s8 ; CHECK-NEXT: v_mov_b32_e32 v1, s9 -; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; CHECK-NEXT: s_addc_u32 s1, s5, s1 ; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] -; CHECK-NEXT: global_atomic_add_f64 v4, v[2:3], s[0:1] offset:-7 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: buffer_wbinvl1_vol +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[8:9], vcc +; CHECK-NEXT: s_cbranch_execz .LBB4_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s2, s[2:3], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[2:3], s[2:3], 3 +; CHECK-NEXT: s_add_u32 s2, s4, s2 +; CHECK-NEXT: s_addc_u32 s3, s5, s3 +; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; CHECK-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1] +; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] offset:-7 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_wbinvl1_vol +; CHECK-NEXT: .LBB4_2: ; CHECK-NEXT: s_endpgm entry: %i = add nsw i32 %a, -1 @@ -127,13 +154,13 @@ entry: %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2 br label %bb1 -bb1: +bb1: ; preds = %entry %i.7 = ptrtoint ptr addrspace(1) %i.3 to i64 %i.8 = add nsw i64 %i.7, 1 %i.9 = inttoptr i64 %i.8 to ptr addrspace(1) - %i.10 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %d, double %c) #23 + %0 = atomicrmw fadd ptr %d, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 %i.11 = addrspacecast ptr addrspace(1) %i.9 to ptr - %i.12 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.11, double %c) #23 + %1 = atomicrmw fadd ptr %i.11, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } @@ -158,10 +185,21 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] ; CHECK-NEXT: s_cbranch_vccnz .LBB5_1 ; CHECK-NEXT: ; %bb.2: ; %bb1 -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: s_mov_b64 s[0:1], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB5_4 +; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; CHECK-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1] ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_wbinvl1_vol +; CHECK-NEXT: .LBB5_4: ; CHECK-NEXT: s_endpgm entry: %i = add nsw i32 %a, -1 @@ -170,7 +208,7 @@ entry: %i.4 = ptrtoint ptr addrspace(1) %i.3 to i64 br label %bb0 -bb0: +bb0: ; preds = %bb0, %entry %phi = phi ptr addrspace(1) 
[ %i.3, %entry ], [ %i.9, %bb0 ] %i.7 = ptrtoint ptr addrspace(1) %phi to i64 %i.8 = sub nsw i64 %i.7, 1 @@ -178,12 +216,13 @@ bb0: %i.9 = inttoptr i64 %i.7 to ptr addrspace(1) br i1 %cmp2, label %bb1, label %bb0 -bb1: +bb1: ; preds = %bb0 %i.10 = addrspacecast ptr addrspace(1) %i.9 to ptr - %i.11 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.10, double %c) #23 + %0 = atomicrmw fadd ptr %i.10, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } +attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +attributes #1 = { mustprogress nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" } -attributes #8 = { argmemonly mustprogress nounwind willreturn "target-cpu"="gfx90a" } - +!0 = !{}
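
For out-of-tree IR that still calls the removed intrinsics, the updated tests above all follow one replacement pattern. A minimal sketch under the same assumptions the tests make (agent syncscope, seq_cst ordering, and the opt-out metadata; the function names below are illustrative placeholders, not part of this patch):

define amdgpu_ps void @fadd_v2f16_sketch(ptr addrspace(1) %ptr, <2 x half> %data) {
  ; was: %r = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
  %r = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
  ret void
}

define amdgpu_ps void @fadd_f32_sketch(ptr addrspace(1) %ptr, float %data) {
  ; was: %r = call float @llvm.amdgcn.global.atomic.fadd.f32.p1(ptr addrspace(1) %ptr, float %data)
  ; The f32 tests above additionally carry !amdgpu.ignore.denormal.mode,
  ; waiving the denormal-flushing behavior of the hardware instruction.
  %r = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
  ret void
}

!0 = !{}

Without !amdgpu.no.fine.grained.memory, the backend may expand the atomicrmw to a compare-and-swap loop rather than selecting the native global_atomic_add_f32 / global_atomic_pk_add_f16 instructions, which is what the metadata in the rewritten tests preserves.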