diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 50179c1ceddb4..2738eb77b675a 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2966,11 +2966,6 @@ def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic; -def int_amdgcn_global_atomic_fmax : AMDGPUAtomicRtn; -def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn; -def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn; - defset list AMDGPUMFMAIntrinsics90A = { def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 3753509f9aa71..215bfc8c6cfe3 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1041,14 +1041,17 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, break; // No other 'amdgcn.atomic.*' } - if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") || - Name.starts_with("ds.fmax") || - Name.starts_with("global.atomic.fadd") || - Name.starts_with("flat.atomic.fadd")) { - // Replaced with atomicrmw fadd/fmin/fmax, so there's no new - // declaration. - NewFn = nullptr; - return true; + if (Name.consume_front("ds.") || Name.consume_front("global.atomic.") || + Name.consume_front("flat.atomic.")) { + if (Name.starts_with("fadd") || + // FIXME: We should also remove fmin.num and fmax.num intrinsics. + (Name.starts_with("fmin") && !Name.starts_with("fmin.num")) || + (Name.starts_with("fmax") && !Name.starts_with("fmax.num"))) { + // Replaced with atomicrmw fadd/fmin/fmax, so there's no new + // declaration. + NewFn = nullptr; + return true; + } } if (Name.starts_with("ldexp.")) { @@ -4218,7 +4221,11 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI, .StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap) .StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap) .StartsWith("global.atomic.fadd", AtomicRMWInst::FAdd) - .StartsWith("flat.atomic.fadd", AtomicRMWInst::FAdd); + .StartsWith("flat.atomic.fadd", AtomicRMWInst::FAdd) + .StartsWith("global.atomic.fmin", AtomicRMWInst::FMin) + .StartsWith("flat.atomic.fmin", AtomicRMWInst::FMin) + .StartsWith("global.atomic.fmax", AtomicRMWInst::FMax) + .StartsWith("flat.atomic.fmax", AtomicRMWInst::FMax); unsigned NumOperands = CI->getNumOperands(); if (NumOperands < 3) // Malformed bitcode. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index aa5b151adef3a..09987a6504b9d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -618,10 +618,6 @@ multiclass local_addr_space_atomic_op { } } -defm int_amdgcn_flat_atomic_fmin : noret_op; -defm int_amdgcn_flat_atomic_fmax : noret_op; -defm int_amdgcn_global_atomic_fmin : noret_op; -defm int_amdgcn_global_atomic_fmax : noret_op; defm int_amdgcn_global_atomic_csub : noret_op; defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op; defm int_amdgcn_flat_atomic_fmin_num : noret_op; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index f597c1ae68a17..32dfbc98df581 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4913,12 +4913,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case Intrinsic::amdgcn_global_atomic_csub: - case Intrinsic::amdgcn_global_atomic_fmin: - case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_fmax_num: - case Intrinsic::amdgcn_flat_atomic_fmin: - case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_atomic_cond_sub_u32: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 2cd5fb2b94285..60fa2adc62dc8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -239,13 +239,9 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index d348166c2d9a0..0a2d4e6494305 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1045,8 +1045,6 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl &OpIndexes, switch (IID) { case Intrinsic::amdgcn_is_shared: case Intrinsic::amdgcn_is_private: - case Intrinsic::amdgcn_flat_atomic_fmax: - case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin_num: OpIndexes.push_back(0); @@ -1106,8 +1104,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy}, {NewV, MaskOp}); } - case Intrinsic::amdgcn_flat_atomic_fmax: - case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin_num: { Type *DestTy = II->getType(); diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 6b5e47902c5a5..a9ab0c5a453e8 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1604,15 +1604,11 @@ let OtherPredicates = [isGFX12Plus] in { let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>; } let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in { defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>; defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>; -defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>; -defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>; } let OtherPredicates = [isGFX12Only] in { @@ -1642,13 +1638,6 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_globa let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>; -defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>; -} - -let SubtargetPredicate = HasAtomicFMinFMaxF64FlatInsts in { -defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>; -defm : FlatAtomicIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>; } let OtherPredicates = [HasFlatBufferGlobalAtomicFaddF64Inst] in { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 6172687f4b4ab..bbdc006b9afcf 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1367,13 +1367,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MODereferenceable; return true; } - case Intrinsic::amdgcn_global_atomic_fmin: - case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: - case Intrinsic::amdgcn_flat_atomic_fmin: - case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_atomic_cond_sub_u32: { @@ -1485,14 +1481,10 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_ds_consume: case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: - case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmax_num: - case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_csub: - case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_global_atomic_fmax_num: - case Intrinsic::amdgcn_global_atomic_fmin: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_global_load_tr_b64: @@ -9397,12 +9389,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, DAG.setNodeMemRefs(NewNode, {MemRef}); return SDValue(NewNode, 0); } - case Intrinsic::amdgcn_global_atomic_fmin: - case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_fmax_num: - case Intrinsic::amdgcn_flat_atomic_fmin: - case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmax_num: { MemSDNode *M = cast(Op); @@ -9413,16 +9401,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, }; unsigned Opcode = 0; switch (IntrID) { - case Intrinsic::amdgcn_global_atomic_fmin: case Intrinsic::amdgcn_global_atomic_fmin_num: - case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmin_num: { Opcode = ISD::ATOMIC_LOAD_FMIN; break; } - case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_global_atomic_fmax_num: - case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmax_num: { Opcode = ISD::ATOMIC_LOAD_FMAX; break; diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll index 87ca1e3a617ed..3e28cd050fc88 100644 --- a/llvm/test/Bitcode/amdgcn-atomic.ll +++ b/llvm/test/Bitcode/amdgcn-atomic.ll @@ -354,6 +354,70 @@ define float @upgrade_amdgcn_global_atomic_fadd_f32_p1_f32(ptr addrspace(1) %ptr ret float %result } +declare float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr nocapture, float) #0 + +define float @upgrade_amdgcn_flat_atomic_fmin_f32_p0_f32(ptr %ptr, float %data) { + ; CHECK: %{{.+}} = atomicrmw fmin ptr %ptr, float %data syncscope("agent") seq_cst, align 4, !noalias.addrspace !0, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + %result = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %ptr, float %data) + ret float %result +} + +declare float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) nocapture, float) #0 + +define float @upgrade_amdgcn_global_atomic_fmin_f32_p1_f32(ptr addrspace(1) %ptr, float %data) { + ; CHECK: %{{.+}} = atomicrmw fmin ptr addrspace(1) %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + %result = call float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) %ptr, float %data) + ret float %result +} + +declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr nocapture, double) #0 + +define double @upgrade_amdgcn_flat_atomic_fmin_f64_p0_f64(ptr %ptr, double %data) { + ; CHECK: %{{.+}} = atomicrmw fmin ptr %ptr, double %data syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + %result = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) + ret double %result +} + +declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) nocapture, double) #0 + +define double @upgrade_amdgcn_global_atomic_fmin_f64_p1_f64(ptr addrspace(1) %ptr, double %data) { + ; CHECK: %{{.+}} = atomicrmw fmin ptr addrspace(1) %ptr, double %data syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + %result = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) + ret double %result +} + +declare float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr nocapture, float) #0 + +define float @upgrade_amdgcn_flat_atomic_fmax_f32_p0_f32(ptr %ptr, float %data) { + ; CHECK: %{{.+}} = atomicrmw fmax ptr %ptr, float %data syncscope("agent") seq_cst, align 4, !noalias.addrspace !0, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + %result = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %ptr, float %data) + ret float %result +} + +declare float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) nocapture, float) #0 + +define float @upgrade_amdgcn_global_atomic_fmax_f32_p1_f32(ptr addrspace(1) %ptr, float %data) { + ; CHECK: %{{.+}} = atomicrmw fmax ptr addrspace(1) %ptr, float %data syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + %result = call float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) %ptr, float %data) + ret float %result +} + +declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr nocapture, double) #0 + +define double @upgrade_amdgcn_flat_atomic_fmax_f64_p0_f64(ptr %ptr, double %data) { + ; CHECK: %{{.+}} = atomicrmw fmax ptr %ptr, double %data syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + %result = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) + ret double %result +} + +declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) nocapture, double) #0 + +define double @upgrade_amdgcn_global_atomic_fmax_f64_p1_f64(ptr addrspace(1) %ptr, double %data) { + ; CHECK: %{{.+}} = atomicrmw fmax ptr addrspace(1) %ptr, double %data syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + %result = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) + ret double %result +} + attributes #0 = { argmemonly nounwind willreturn } ; CHECK: !0 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index eb39ca2d7daa7..92ce2af47e22a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -14,10 +14,6 @@ declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32 declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg) declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32 immarg) -declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) -declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) -declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) -declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: @@ -1015,52 +1011,6 @@ main_body: ret void } -define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) { -; GFX90A-LABEL: global_atomic_fmin_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: global_atomic_fmin_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5] -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) { -; GFX90A-LABEL: global_atomic_fmax_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: global_atomic_fmax_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5] -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body @@ -1070,7 +1020,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB38_2 +; GFX90A-NEXT: s_cbranch_execz .LBB36_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1083,7 +1033,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB38_2: +; GFX90A-NEXT: .LBB36_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: @@ -1094,7 +1044,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB38_2 +; GFX940-NEXT: s_cbranch_execz .LBB36_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1106,7 +1056,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: .LBB38_2: +; GFX940-NEXT: .LBB36_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1122,7 +1072,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB39_2 +; GFX90A-NEXT: s_cbranch_execz .LBB37_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1133,7 +1083,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB39_2: +; GFX90A-NEXT: .LBB37_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: @@ -1144,7 +1094,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB39_2 +; GFX940-NEXT: s_cbranch_execz .LBB37_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1156,7 +1106,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB39_2: +; GFX940-NEXT: .LBB37_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1172,7 +1122,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB40_2 +; GFX90A-NEXT: s_cbranch_execz .LBB38_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1185,7 +1135,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB40_2: +; GFX90A-NEXT: .LBB38_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: @@ -1196,7 +1146,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB40_2 +; GFX940-NEXT: s_cbranch_execz .LBB38_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1208,7 +1158,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: .LBB40_2: +; GFX940-NEXT: .LBB38_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1224,7 +1174,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB41_2 +; GFX90A-NEXT: s_cbranch_execz .LBB39_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1235,7 +1185,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB41_2: +; GFX90A-NEXT: .LBB39_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: @@ -1246,7 +1196,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB41_2 +; GFX940-NEXT: s_cbranch_execz .LBB39_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1258,7 +1208,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB41_2: +; GFX940-NEXT: .LBB39_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1344,44 +1294,6 @@ main_body: ret double %ret } -define double @global_atomic_fmax_f64_rtn(ptr addrspace(1) %ptr, double %data) { -; GFX90A-LABEL: global_atomic_fmax_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_atomic_fmax_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - -define double @global_atomic_fmin_f64_rtn(ptr addrspace(1) %ptr, double %data) { -; GFX90A-LABEL: global_atomic_fmin_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_atomic_fmin_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body @@ -1391,7 +1303,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB47_2 +; GFX90A-NEXT: s_cbranch_execz .LBB43_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1402,7 +1314,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB47_2: +; GFX90A-NEXT: .LBB43_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: @@ -1413,7 +1325,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB47_2 +; GFX940-NEXT: s_cbranch_execz .LBB43_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1425,7 +1337,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB47_2: +; GFX940-NEXT: .LBB43_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1633,90 +1545,6 @@ main_body: ret void } -define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) { -; GFX90A-LABEL: flat_atomic_fmin_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: flat_atomic_fmin_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) - ret void -} - -define double @flat_atomic_fmin_f64_rtn(ptr %ptr, double %data) { -; GFX90A-LABEL: flat_atomic_fmin_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmin_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) - ret double %ret -} - -define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) { -; GFX90A-LABEL: flat_atomic_fmax_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: flat_atomic_fmax_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) - ret void -} - -define double @flat_atomic_fmax_f64_rtn(ptr %ptr, double %data) { -; GFX90A-LABEL: flat_atomic_fmax_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmax_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) - ret double %ret -} - define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body @@ -1726,7 +1554,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB59_2 +; GFX90A-NEXT: s_cbranch_execz .LBB51_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1736,7 +1564,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB59_2: +; GFX90A-NEXT: .LBB51_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: @@ -1747,7 +1575,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB59_2 +; GFX940-NEXT: s_cbranch_execz .LBB51_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1757,7 +1585,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB59_2: +; GFX940-NEXT: .LBB51_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1773,7 +1601,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB60_2 +; GFX90A-NEXT: s_cbranch_execz .LBB52_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1783,7 +1611,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB60_2: +; GFX90A-NEXT: .LBB52_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: @@ -1794,7 +1622,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB60_2 +; GFX940-NEXT: s_cbranch_execz .LBB52_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1804,7 +1632,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB60_2: +; GFX940-NEXT: .LBB52_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -1820,7 +1648,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB61_2 +; GFX90A-NEXT: s_cbranch_execz .LBB53_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1830,7 +1658,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX90A-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB61_2: +; GFX90A-NEXT: .LBB53_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: @@ -1841,7 +1669,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB61_2 +; GFX940-NEXT: s_cbranch_execz .LBB53_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1] @@ -1851,7 +1679,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX940-NEXT: v_mov_b32_e32 v2, s2 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB61_2: +; GFX940-NEXT: .LBB53_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll index de14d64dbf7e9..017a1f047bb5f 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -508,7 +508,8 @@ define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %f64 = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %p, double 1.0) + + %f64 = atomicrmw fmin ptr addrspace(1) %p, double 1.0 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %n32 = fptoui double %f64 to i32 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 @@ -533,7 +534,7 @@ define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm - %f64 = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %p, double 1.0) + %f64 = atomicrmw fmax ptr addrspace(1) %p, double 1.0 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0 %n32 = fptoui double %f64 to i32 %n64 = zext i32 %n32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0 @@ -905,8 +906,6 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fmax(ptr addrspace(8) %rs ret void } -declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1), double) -declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1), double) declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i32) declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32) declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub.i32(i32, ptr addrspace(8), i32, i32, i32) @@ -923,3 +922,5 @@ declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i32(i32, i32, ptr addrspa declare float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32) declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double, ptr addrspace(8), i32, i32, i32) declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32) + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-flat-atomics-f64.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-flat-atomics-f64.ll deleted file mode 100644 index 8633a3965259b..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-flat-atomics-f64.ll +++ /dev/null @@ -1,51 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG -; RUN: llc < %s -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL - -declare double @llvm.amdgcn.flat.atomic.fmin.f64.p1.f64(ptr %ptr, double %data) -declare double @llvm.amdgcn.flat.atomic.fmax.f64.p1.f64(ptr %ptr, double %data) - -define amdgpu_cs void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) { -; GFX10-LABEL: flat_atomic_fmin_f64_noret: -; GFX10: ; %bb.0: -; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] -; GFX10-NEXT: s_endpgm - %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p1.f64(ptr %ptr, double %data) - ret void -} - -define amdgpu_cs void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) { -; GFX10-LABEL: flat_atomic_fmax_f64_noret: -; GFX10: ; %bb.0: -; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3] -; GFX10-NEXT: s_endpgm - %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p1.f64(ptr %ptr, double %data) - ret void -} - -define amdgpu_cs void @flat_atomic_fmin_f64_rtn(ptr %ptr, double %data, ptr %out) { -; GFX10-LABEL: flat_atomic_fmin_f64_rtn: -; GFX10: ; %bb.0: -; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_store_dwordx2 v[4:5], v[0:1] -; GFX10-NEXT: s_endpgm - %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p1.f64(ptr %ptr, double %data) - store double %ret, ptr %out - ret void -} - -define amdgpu_cs void @flat_atomic_fmax_f64_rtn(ptr %ptr, double %data, ptr %out) { -; GFX10-LABEL: flat_atomic_fmax_f64_rtn: -; GFX10: ; %bb.0: -; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_store_dwordx2 v[4:5], v[0:1] -; GFX10-NEXT: s_endpgm - %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p1.f64(ptr %ptr, double %data) - store double %ret, ptr %out - ret void -} -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10-GISEL: {{.*}} -; GFX10-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-flat-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-flat-atomics.ll deleted file mode 100644 index 1d2e3fc636f44..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-flat-atomics.ll +++ /dev/null @@ -1,83 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG -; RUN: llc < %s -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG -; RUN: llc < %s -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL -; RUN: llc < %s -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL - -declare float @llvm.amdgcn.flat.atomic.fmin.f32.p1.f32(ptr %ptr, float %data) -declare float @llvm.amdgcn.flat.atomic.fmax.f32.p1.f32(ptr %ptr, float %data) - -define amdgpu_cs void @flat_atomic_fmin_f32_noret(ptr %ptr, float %data) { -; GFX10-LABEL: flat_atomic_fmin_f32_noret: -; GFX10: ; %bb.0: -; GFX10-NEXT: flat_atomic_fmin v[0:1], v2 -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: flat_atomic_fmin_f32_noret: -; GFX11: ; %bb.0: -; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 -; GFX11-NEXT: s_endpgm - %ret = call float @llvm.amdgcn.flat.atomic.fmin.f32.p1.f32(ptr %ptr, float %data) - ret void -} - -define amdgpu_cs void @flat_atomic_fmax_f32_noret(ptr %ptr, float %data) { -; GFX10-LABEL: flat_atomic_fmax_f32_noret: -; GFX10: ; %bb.0: -; GFX10-NEXT: flat_atomic_fmax v[0:1], v2 -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: flat_atomic_fmax_f32_noret: -; GFX11: ; %bb.0: -; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2 -; GFX11-NEXT: s_endpgm - %ret = call float @llvm.amdgcn.flat.atomic.fmax.f32.p1.f32(ptr %ptr, float %data) - ret void -} - -define amdgpu_cs float @flat_atomic_fmin_f32_rtn(ptr %ptr, float %data, ptr %out) { -; GFX10-LABEL: flat_atomic_fmin_f32_rtn: -; GFX10: ; %bb.0: -; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_store_dword v[3:4], v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: flat_atomic_fmin_f32_rtn: -; GFX11: ; %bb.0: -; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_store_b32 v[3:4], v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ; return to shader part epilog - %ret = call float @llvm.amdgcn.flat.atomic.fmin.f32.p1.f32(ptr %ptr, float %data) - store float %ret, ptr %out - ret float %ret -} - -define amdgpu_cs float @flat_atomic_fmax_f32_rtn(ptr %ptr, float %data, ptr %out) { -; GFX10-LABEL: flat_atomic_fmax_f32_rtn: -; GFX10: ; %bb.0: -; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: flat_store_dword v[3:4], v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: flat_atomic_fmax_f32_rtn: -; GFX11: ; %bb.0: -; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 glc -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: flat_store_b32 v[3:4], v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ; return to shader part epilog - %ret = call float @llvm.amdgcn.flat.atomic.fmax.f32.p1.f32(ptr %ptr, float %data) - store float %ret, ptr %out - ret float %ret -} -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10-GISEL: {{.*}} -; GFX10-SDAG: {{.*}} -; GFX11-GISEL: {{.*}} -; GFX11-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-f64.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-f64.ll deleted file mode 100644 index bb06ee3165e3a..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-f64.ll +++ /dev/null @@ -1,51 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG -; RUN: llc < %s -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL - -declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) -declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - -define amdgpu_cs void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) { -; GFX10-LABEL: global_atomic_fmin_f64_noret: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[2:3], off -; GFX10-NEXT: s_endpgm - %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_cs void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) { -; GFX10-LABEL: global_atomic_fmax_f64_noret: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off -; GFX10-NEXT: s_endpgm - %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_cs void @global_atomic_fmin_f64_rtn(ptr addrspace(1) %ptr, double %data, ptr addrspace(1) %out) { -; GFX10-LABEL: global_atomic_fmin_f64_rtn: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: s_endpgm - %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - store double %ret, ptr addrspace(1) %out - ret void -} - -define amdgpu_cs void @global_atomic_fmax_f64_rtn(ptr addrspace(1) %ptr, double %data, ptr addrspace(1) %out) { -; GFX10-LABEL: global_atomic_fmax_f64_rtn: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: s_endpgm - %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - store double %ret, ptr addrspace(1) %out - ret void -} -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10-GISEL: {{.*}} -; GFX10-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics.ll deleted file mode 100644 index 699bb8b41b69d..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics.ll +++ /dev/null @@ -1,87 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG -; RUN: llc < %s -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG -; RUN: llc < %s -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL -; RUN: llc < %s -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL - -declare float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) %ptr, float %data) -declare float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - -define amdgpu_cs void @global_atomic_fmin_f32_noret(ptr addrspace(1) %ptr, float %data) { -; GFX10-LABEL: global_atomic_fmin_f32_noret: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fmin_f32_noret: -; GFX11: ; %bb.0: -; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm - %ret = call float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -define amdgpu_cs void @global_atomic_fmax_f32_noret(ptr addrspace(1) %ptr, float %data) { -; GFX10-LABEL: global_atomic_fmax_f32_noret: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fmax_f32_noret: -; GFX11: ; %bb.0: -; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm - %ret = call float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -define amdgpu_cs void @global_atomic_fmax_f32_rtn(ptr addrspace(1) %ptr, float %data, ptr addrspace(1) %out) { -; GFX10-LABEL: global_atomic_fmax_f32_rtn: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v[3:4], v0, off -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fmax_f32_rtn: -; GFX11: ; %bb.0: -; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b32 v[3:4], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm - %ret = call float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - store float %ret, ptr addrspace(1) %out - ret void -} - -define amdgpu_cs void @global_atomic_fmin_f32_rtn(ptr addrspace(1) %ptr, float %data, ptr addrspace(1) %out) { -; GFX10-LABEL: global_atomic_fmin_f32_rtn: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v[3:4], v0, off -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fmin_f32_rtn: -; GFX11: ; %bb.0: -; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b32 v[3:4], v0, off -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm - %ret = call float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - store float %ret, ptr addrspace(1) %out - ret void -} -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10-GISEL: {{.*}} -; GFX10-SDAG: {{.*}} -; GFX11-GISEL: {{.*}} -; GFX11-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 957c10ddf85e5..e45b5cb30ab89 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -14,10 +14,6 @@ declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32 declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg) declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32 immarg) -declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) -declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) -declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) -declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1) define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { @@ -1016,56 +1012,6 @@ main_body: ret void } -define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) { -; GFX90A-LABEL: global_atomic_fmin_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s7 -; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: global_atomic_fmin_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5] -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - -define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) { -; GFX90A-LABEL: global_atomic_fmax_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s6 -; GFX90A-NEXT: v_mov_b32_e32 v1, s7 -; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: global_atomic_fmax_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s6 -; GFX940-NEXT: v_mov_b32_e32 v1, s7 -; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5] -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret void -} - define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body @@ -1265,44 +1211,6 @@ main_body: ret double %ret } -define double @global_atomic_fmax_f64_rtn(ptr addrspace(1) %ptr, double %data) { -; GFX90A-LABEL: global_atomic_fmax_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_atomic_fmax_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - -define double @global_atomic_fmin_f64_rtn(ptr addrspace(1) %ptr, double %data) { -; GFX90A-LABEL: global_atomic_fmin_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: global_atomic_fmin_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) - ret double %ret -} - define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body @@ -1313,7 +1221,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc @@ -1323,7 +1231,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; @@ -1524,7 +1432,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 @@ -1535,7 +1443,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; @@ -1555,98 +1463,6 @@ main_body: ret void } -define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) { -; GFX90A-LABEL: flat_atomic_fmin_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s5 -; GFX90A-NEXT: v_mov_b32_e32 v2, s6 -; GFX90A-NEXT: v_mov_b32_e32 v3, s7 -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: flat_atomic_fmin_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) - ret void -} - -define double @flat_atomic_fmin_f64_rtn(ptr %ptr, double %data) { -; GFX90A-LABEL: flat_atomic_fmin_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmin_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) - ret double %ret -} - -define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) { -; GFX90A-LABEL: flat_atomic_fmax_f64_noret: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s5 -; GFX90A-NEXT: v_mov_b32_e32 v2, s6 -; GFX90A-NEXT: v_mov_b32_e32 v3, s7 -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: flat_atomic_fmax_f64_noret: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s4 -; GFX940-NEXT: v_mov_b32_e32 v1, s5 -; GFX940-NEXT: v_mov_b32_e32 v2, s6 -; GFX940-NEXT: v_mov_b32_e32 v3, s7 -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) - ret void -} - -define double @flat_atomic_fmax_f64_rtn(ptr %ptr, double %data) { -; GFX90A-LABEL: flat_atomic_fmax_f64_rtn: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmax_f64_rtn: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] -main_body: - %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) - ret double %ret -} - define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, double %data) { ; GFX90A-LABEL: local_atomic_fadd_f64_noret: ; GFX90A: ; %bb.0: ; %main_body @@ -1843,178 +1659,6 @@ main_body: ret double %ret } -define double @flat_atomic_fmin_f64_intrinsic_rtn__posoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__posoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 511 - %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) - ret double %ret -} - -define double @flat_atomic_fmin_f64_intrinsic_rtn__negoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__negoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmin_f64_intrinsic_rtn__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -511 - %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %gep, double %data) - ret double %ret -} - -define void @flat_atomic_fmin_f64_intrinsic_noret__posoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_noret__posoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmin_f64_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 511 - %unused = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) - ret void -} - -define void @flat_atomic_fmin_f64_intrinsic_noret__negoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fmin_f64_intrinsic_noret__negoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmin_f64_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -511 - %unused = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %gep, double %data) - ret void -} - -define double @flat_atomic_fmax_f64_intrinsic_rtn__posoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fmax_f64_intrinsic_rtn__posoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmax_f64_intrinsic_rtn__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 511 - %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) - ret double %ret -} - -define double @flat_atomic_fmax_f64_intrinsic_rtn__negoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fmax_f64_intrinsic_rtn__negoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmax_f64_intrinsic_rtn__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -511 - %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %gep, double %data) - ret double %ret -} - -define void @flat_atomic_fmax_f64_intrinsic_noret__posoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fmax_f64_intrinsic_noret__posoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmax_f64_intrinsic_noret__posoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 511 - %unused = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) - ret void -} - -define void @flat_atomic_fmax_f64_intrinsic_noret__negoffset(ptr %ptr, double %data) #1 { -; GFX90A-LABEL: flat_atomic_fmax_f64_intrinsic_noret__negoffset: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: flat_atomic_fmax_f64_intrinsic_noret__negoffset: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff008, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -511 - %unused = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %gep, double %data) - ret void -} - attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" } attributes #1 = { nounwind } attributes #2 = { "denormal-fp-math"="ieee,ieee" } diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll deleted file mode 100644 index 3d529a2c6ef69..0000000000000 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat-fadd-fmin-fmax-intrinsics.ll +++ /dev/null @@ -1,224 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s | FileCheck %s - -declare float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %ptr, float %data) -declare float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %ptr, float %data) - -define amdgpu_kernel void @flat_atomic_fadd_f32_p1(ptr addrspace(1) %ptr, float %data) { -; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p1 -; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]], float [[DATA:%.*]]) { -; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p1.f32(ptr addrspace(1) [[PTR]], float [[DATA]]) -; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p1.f32(ptr addrspace(1) [[PTR]], float [[DATA]]) -; CHECK-NEXT: ret void -; - %cast = addrspacecast ptr addrspace(1) %ptr to ptr - %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) - %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) - ret void -} - -define amdgpu_kernel void @flat_atomic_fadd_f32_p2(ptr addrspace(2) %ptr, float %data) { -; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p2 -; CHECK-SAME: (ptr addrspace(2) [[PTR:%.*]], float [[DATA:%.*]]) { -; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(2) [[PTR]] to ptr -; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr [[CAST]], float [[DATA]]) -; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr [[CAST]], float [[DATA]]) -; CHECK-NEXT: ret void -; - %cast = addrspacecast ptr addrspace(2) %ptr to ptr - %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) - %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) - ret void -} - -define amdgpu_kernel void @flat_atomic_fadd_f32_p3(ptr addrspace(3) %ptr, float %data) { -; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p3 -; CHECK-SAME: (ptr addrspace(3) [[PTR:%.*]], float [[DATA:%.*]]) { -; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr -; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr [[CAST]], float [[DATA]]) -; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr [[CAST]], float [[DATA]]) -; CHECK-NEXT: ret void -; - %cast = addrspacecast ptr addrspace(3) %ptr to ptr - %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) - %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) - ret void -} - -define amdgpu_kernel void @flat_atomic_fadd_f32_p4(ptr addrspace(4) %ptr, float %data) { -; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p4 -; CHECK-SAME: (ptr addrspace(4) [[PTR:%.*]], float [[DATA:%.*]]) { -; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p4.f32(ptr addrspace(4) [[PTR]], float [[DATA]]) -; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p4.f32(ptr addrspace(4) [[PTR]], float [[DATA]]) -; CHECK-NEXT: ret void -; - %cast = addrspacecast ptr addrspace(4) %ptr to ptr - %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) - %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) - ret void -} - -define amdgpu_kernel void @flat_atomic_fadd_f32_p5(ptr addrspace(5) %ptr, float %data) { -; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p5 -; CHECK-SAME: (ptr addrspace(5) [[PTR:%.*]], float [[DATA:%.*]]) { -; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr -; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr [[CAST]], float [[DATA]]) -; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr [[CAST]], float [[DATA]]) -; CHECK-NEXT: ret void -; - %cast = addrspacecast ptr addrspace(5) %ptr to ptr - %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) - %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) - ret void -} - -define amdgpu_kernel void @flat_atomic_fadd_f32_p6(ptr addrspace(6) %ptr, float %data) { -; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p6 -; CHECK-SAME: (ptr addrspace(6) [[PTR:%.*]], float [[DATA:%.*]]) { -; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p6.f32(ptr addrspace(6) [[PTR]], float [[DATA]]) -; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p6.f32(ptr addrspace(6) [[PTR]], float [[DATA]]) -; CHECK-NEXT: ret void -; - %cast = addrspacecast ptr addrspace(6) %ptr to ptr - %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) - %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) - ret void -} - -define amdgpu_kernel void @flat_atomic_fadd_f32_p7(ptr addrspace(7) %ptr, float %data) { -; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p7 -; CHECK-SAME: (ptr addrspace(7) [[PTR:%.*]], float [[DATA:%.*]]) { -; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(7) [[PTR]] to ptr -; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr [[CAST]], float [[DATA]]) -; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr [[CAST]], float [[DATA]]) -; CHECK-NEXT: ret void -; - %cast = addrspacecast ptr addrspace(7) %ptr to ptr - %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) - %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) - ret void -} - -define amdgpu_kernel void @flat_atomic_fadd_f32_p99(ptr addrspace(99) %ptr, float %data) { -; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f32_p99 -; CHECK-SAME: (ptr addrspace(99) [[PTR:%.*]], float [[DATA:%.*]]) { -; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.amdgcn.flat.atomic.fmax.f32.p99.f32(ptr addrspace(99) [[PTR]], float [[DATA]]) -; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.amdgcn.flat.atomic.fmin.f32.p99.f32(ptr addrspace(99) [[PTR]], float [[DATA]]) -; CHECK-NEXT: ret void -; - %cast = addrspacecast ptr addrspace(99) %ptr to ptr - %max = call float @llvm.amdgcn.flat.atomic.fmax.f32.p0.f32(ptr %cast, float %data) - %min = call float @llvm.amdgcn.flat.atomic.fmin.f32.p0.f32(ptr %cast, float %data) - ret void -} - -declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) -declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) - -define amdgpu_kernel void @flat_atomic_fadd_f64_p1(ptr addrspace(1) %ptr, double %data) { -; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p1 -; CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]], double [[DATA:%.*]]) { -; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p1.f64(ptr addrspace(1) [[PTR]], double [[DATA]]) -; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p1.f64(ptr addrspace(1) [[PTR]], double [[DATA]]) -; CHECK-NEXT: ret void -; - %cast = addrspacecast ptr addrspace(1) %ptr to ptr - %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) - %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) - ret void -} - -define amdgpu_kernel void @flat_atomic_fadd_f64_p2(ptr addrspace(2) %ptr, double %data) { -; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p2 -; CHECK-SAME: (ptr addrspace(2) [[PTR:%.*]], double [[DATA:%.*]]) { -; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(2) [[PTR]] to ptr -; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr [[CAST]], double [[DATA]]) -; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr [[CAST]], double [[DATA]]) -; CHECK-NEXT: ret void -; - %cast = addrspacecast ptr addrspace(2) %ptr to ptr - %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) - %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) - ret void -} - -define amdgpu_kernel void @flat_atomic_fadd_f64_p3(ptr addrspace(3) %ptr, double %data) { -; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p3 -; CHECK-SAME: (ptr addrspace(3) [[PTR:%.*]], double [[DATA:%.*]]) { -; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr -; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr [[CAST]], double [[DATA]]) -; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr [[CAST]], double [[DATA]]) -; CHECK-NEXT: ret void -; - %cast = addrspacecast ptr addrspace(3) %ptr to ptr - %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) - %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) - ret void -} - -define amdgpu_kernel void @flat_atomic_fadd_f64_p4(ptr addrspace(4) %ptr, double %data) { -; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p4 -; CHECK-SAME: (ptr addrspace(4) [[PTR:%.*]], double [[DATA:%.*]]) { -; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p4.f64(ptr addrspace(4) [[PTR]], double [[DATA]]) -; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p4.f64(ptr addrspace(4) [[PTR]], double [[DATA]]) -; CHECK-NEXT: ret void -; - %cast = addrspacecast ptr addrspace(4) %ptr to ptr - %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) - %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) - ret void -} - -define amdgpu_kernel void @flat_atomic_fadd_f64_p5(ptr addrspace(5) %ptr, double %data) { -; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p5 -; CHECK-SAME: (ptr addrspace(5) [[PTR:%.*]], double [[DATA:%.*]]) { -; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr -; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr [[CAST]], double [[DATA]]) -; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr [[CAST]], double [[DATA]]) -; CHECK-NEXT: ret void -; - %cast = addrspacecast ptr addrspace(5) %ptr to ptr - %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) - %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) - ret void -} - -define amdgpu_kernel void @flat_atomic_fadd_f64_p6(ptr addrspace(6) %ptr, double %data) { -; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p6 -; CHECK-SAME: (ptr addrspace(6) [[PTR:%.*]], double [[DATA:%.*]]) { -; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p6.f64(ptr addrspace(6) [[PTR]], double [[DATA]]) -; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p6.f64(ptr addrspace(6) [[PTR]], double [[DATA]]) -; CHECK-NEXT: ret void -; - %cast = addrspacecast ptr addrspace(6) %ptr to ptr - %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) - %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) - ret void -} - -define amdgpu_kernel void @flat_atomic_fadd_f64_p7(ptr addrspace(7) %ptr, double %data) { -; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p7 -; CHECK-SAME: (ptr addrspace(7) [[PTR:%.*]], double [[DATA:%.*]]) { -; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(7) [[PTR]] to ptr -; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr [[CAST]], double [[DATA]]) -; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr [[CAST]], double [[DATA]]) -; CHECK-NEXT: ret void -; - %cast = addrspacecast ptr addrspace(7) %ptr to ptr - %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) - %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) - ret void -} - -define amdgpu_kernel void @flat_atomic_fadd_f64_p99(ptr addrspace(99) %ptr, double %data) { -; CHECK-LABEL: define amdgpu_kernel void @flat_atomic_fadd_f64_p99 -; CHECK-SAME: (ptr addrspace(99) [[PTR:%.*]], double [[DATA:%.*]]) { -; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.amdgcn.flat.atomic.fmax.f64.p99.f64(ptr addrspace(99) [[PTR]], double [[DATA]]) -; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.amdgcn.flat.atomic.fmin.f64.p99.f64(ptr addrspace(99) [[PTR]], double [[DATA]]) -; CHECK-NEXT: ret void -; - %cast = addrspacecast ptr addrspace(99) %ptr to ptr - %max = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %cast, double %data) - %min = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %cast, double %data) - ret void -} diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index 57e6fdb35113e..a0856ac9127e6 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -1,9 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s -declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr nocapture, double) #0 -declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr nocapture, double) #0 - define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-LABEL: InferNothing: ; CHECK: ; %bb.0: ; %entry @@ -66,54 +63,6 @@ entry: ret void } -define protected amdgpu_kernel void @InferFmax(i32 %a, ptr addrspace(1) %b, double %c) { -; CHECK-LABEL: InferFmax: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CHECK-NEXT: s_add_u32 s0, s4, s0 -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 -; CHECK-NEXT: s_addc_u32 s1, s5, s1 -; CHECK-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1] offset:-8 -; CHECK-NEXT: s_endpgm -entry: - %i = add nsw i32 %a, -1 - %i.2 = sext i32 %i to i64 - %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2 - %i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr - %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %i.4, double %c) #1 - ret void -} - -define protected amdgpu_kernel void @InferFmin(i32 %a, ptr addrspace(1) %b, double %c) { -; CHECK-LABEL: InferFmin: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s0, s[2:3], 0x24 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s1, s0, 31 -; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 -; CHECK-NEXT: s_add_u32 s0, s4, s0 -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 -; CHECK-NEXT: s_addc_u32 s1, s5, s1 -; CHECK-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1] offset:-8 -; CHECK-NEXT: s_endpgm -entry: - %i = add nsw i32 %a, -1 - %i.2 = sext i32 %i to i64 - %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2 - %i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr - %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %i.4, double %c) #1 - ret void -} - define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, double %c, ptr %d) { ; CHECK-LABEL: InferMixed: ; CHECK: ; %bb.0: ; %entry @@ -131,7 +80,7 @@ define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, dou ; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_and_saveexec_b64 s[8:9], vcc -; CHECK-NEXT: s_cbranch_execz .LBB4_2 +; CHECK-NEXT: s_cbranch_execz .LBB2_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: s_load_dword s2, s[2:3], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 @@ -146,7 +95,7 @@ define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, dou ; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] offset:-7 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: .LBB4_2: +; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: s_endpgm entry: %i = add nsw i32 %a, -1 @@ -180,17 +129,17 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; CHECK-NEXT: .LBB5_1: ; %bb0 +; CHECK-NEXT: .LBB3_1: ; %bb0 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] -; CHECK-NEXT: s_cbranch_vccnz .LBB5_1 +; CHECK-NEXT: s_cbranch_vccnz .LBB3_1 ; CHECK-NEXT: ; %bb.2: ; %bb1 ; CHECK-NEXT: s_mov_b64 s[0:1], exec ; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc -; CHECK-NEXT: s_cbranch_execz .LBB5_4 +; CHECK-NEXT: s_cbranch_execz .LBB3_4 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] ; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 @@ -199,7 +148,7 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: .LBB5_4: +; CHECK-NEXT: .LBB3_4: ; CHECK-NEXT: s_endpgm entry: %i = add nsw i32 %a, -1