From 8f757504c4fcbbe8a80c1a81bf59bf10b03b873c Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 10 Jun 2024 12:29:34 +0200 Subject: [PATCH 1/3] AtomicExpand: Fix creating invalid ptrmask for fat pointers The ptrmask intrinsic requires the integer mask to be the index size, not the pointer size. --- llvm/lib/CodeGen/AtomicExpandPass.cpp | 2 +- .../AtomicExpand/AMDGPU/expand-atomic-i16.ll | 104 ++++++++++++++ .../AtomicExpand/AMDGPU/expand-atomic-i8.ll | 104 ++++++++++++++ .../AMDGPU/expand-atomic-rmw-fadd.ll | 130 ++++++++++++++++++ 4 files changed, 339 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index d2b756e82964e..7728cc50fc9f9 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -765,7 +765,7 @@ static PartwordMaskValues createMaskInstrs(IRBuilderBase &Builder, assert(ValueSize < MinWordSize); PointerType *PtrTy = cast<PointerType>(Addr->getType()); - IntegerType *IntTy = DL.getIntPtrType(Ctx, PtrTy->getAddressSpace()); + IntegerType *IntTy = DL.getIndexType(Ctx, PtrTy->getAddressSpace()); Value *PtrLSB; if (AddrAlign < MinWordSize) { diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll index 0acb8f8d0fcf6..b8196cfcc3510 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll @@ -1262,6 +1262,110 @@ define bfloat @test_atomicrmw_xchg_bf16_global_agent_align4(ptr addrspace(1) %pt ret bfloat %res } +define i16 @test_atomicrmw_xchg_i16_buffer_fat_agent(ptr addrspace(7) %ptr, i16 %value) { +; CHECK-LABEL: @test_atomicrmw_xchg_i16_buffer_fat_agent( +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[PTR:%.*]], i32 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[PTR]] to i32 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] +; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(7) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP5]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP7:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP6]] syncscope("agent") seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; CHECK-NEXT: ret i16 [[EXTRACTED]] +; + %res = atomicrmw xchg ptr addrspace(7) %ptr, i16 %value syncscope("agent") seq_cst + ret i16 %res +} + +define i16 @test_atomicrmw_xchg_i16_buffer_fat_agent_align4(ptr addrspace(7) %ptr, i16 %value) { +; CHECK-LABEL: @test_atomicrmw_xchg_i16_buffer_fat_agent_align4( +; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(7) [[PTR:%.*]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[LOADED]], -65536 +; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = 
cmpxchg ptr addrspace(7) [[PTR]], i32 [[LOADED]], i32 [[TMP4]] syncscope("agent") seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; CHECK-NEXT: ret i16 [[EXTRACTED]] +; + %res = atomicrmw xchg ptr addrspace(7) %ptr, i16 %value syncscope("agent") seq_cst, align 4 + ret i16 %res +} + +define i16 @test_atomicrmw_add_i16_buffer_fat_agent(ptr addrspace(7) %ptr, i16 %value) { +; CHECK-LABEL: @test_atomicrmw_add_i16_buffer_fat_agent( +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[PTR:%.*]], i32 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[PTR]] to i32 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] +; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(7) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; 
CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; CHECK-NEXT: ret i16 [[EXTRACTED]] +; + %res = atomicrmw add ptr addrspace(7) %ptr, i16 %value syncscope("agent") seq_cst + ret i16 %res +} + +define i16 @test_atomicrmw_add_i16_buffer_fat_agent_align4(ptr addrspace(7) %ptr, i16 %value) { +; CHECK-LABEL: @test_atomicrmw_add_i16_buffer_fat_agent_align4( +; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(7) [[PTR:%.*]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = add i32 [[LOADED]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[NEW]], 65535 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[LOADED]], -65536 +; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[LOADED]], i32 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; CHECK-NEXT: ret i16 [[EXTRACTED]] +; + %res = atomicrmw add ptr addrspace(7) %ptr, i16 %value syncscope("agent") seq_cst, align 4 + ret i16 %res +} + !0 = !{} !1 = !{!"foo", !"bar"} !2 = !{!3} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll index 97651c8d23a1e..590ee63001615 100644 --- 
a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll @@ -1608,3 +1608,107 @@ define i8 @test_atomicrmw_dec_i8_flat_agent_align4(ptr %ptr, i8 %value) { %res = atomicrmw udec_wrap ptr %ptr, i8 %value syncscope("agent") seq_cst, align 4 ret i8 %res } + +define i8 @test_atomicrmw_xchg_i8_buffer_fat_agent(ptr addrspace(7) %ptr, i8 %value) { +; CHECK-LABEL: @test_atomicrmw_xchg_i8_buffer_fat_agent( +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[PTR:%.*]], i32 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[PTR]] to i32 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(7) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP5]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP7:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP6]] syncscope("agent") seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; CHECK-NEXT: ret i8 [[EXTRACTED]] +; + %res = atomicrmw xchg ptr 
addrspace(7) %ptr, i8 %value syncscope("agent") seq_cst + ret i8 %res +} + +define i8 @test_atomicrmw_xchg_i8_buffer_fat_agent_align4(ptr addrspace(7) %ptr, i8 %value) { +; CHECK-LABEL: @test_atomicrmw_xchg_i8_buffer_fat_agent_align4( +; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(7) [[PTR:%.*]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[LOADED]], -256 +; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[LOADED]], i32 [[TMP4]] syncscope("agent") seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i8 +; CHECK-NEXT: ret i8 [[EXTRACTED]] +; + %res = atomicrmw xchg ptr addrspace(7) %ptr, i8 %value syncscope("agent") seq_cst, align 4 + ret i8 %res +} + +define i8 @test_atomicrmw_add_i8_buffer_fat_agent(ptr addrspace(7) %ptr, i8 %value) { +; CHECK-LABEL: @test_atomicrmw_add_i8_buffer_fat_agent( +; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[PTR:%.*]], i32 -4) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[PTR]] to i32 +; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]] +; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr 
addrspace(7) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]] +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 +; CHECK-NEXT: ret i8 [[EXTRACTED]] +; + %res = atomicrmw add ptr addrspace(7) %ptr, i8 %value syncscope("agent") seq_cst + ret i8 %res +} + +define i8 @test_atomicrmw_add_i8_buffer_fat_agent_align4(ptr addrspace(7) %ptr, i8 %value) { +; CHECK-LABEL: @test_atomicrmw_add_i8_buffer_fat_agent_align4( +; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[VALUE:%.*]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(7) [[PTR:%.*]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = add i32 [[LOADED]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[NEW]], 255 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[LOADED]], -256 +; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[LOADED]], i32 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 4 +; CHECK-NEXT: 
[[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i8 +; CHECK-NEXT: ret i8 [[EXTRACTED]] +; + %res = atomicrmw add ptr addrspace(7) %ptr, i8 %value syncscope("agent") seq_cst, align 4 + ret i8 %res +} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll index 17318b2c62ca8..34c6cdfc8d9c1 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -4669,6 +4669,136 @@ define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr, ret void } +define half @buffer_atomicrmw_fadd_f16_agent(ptr addrspace(7) %ptr, half %f) { +; ALL-LABEL: @buffer_atomicrmw_fadd_f16_agent( +; ALL-NEXT: [[P:%.*]] = getelementptr half, ptr addrspace(7) [[PTR:%.*]], i32 4 +; ALL-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[P]], i32 -4) +; ALL-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[P]] to i32 +; ALL-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; ALL-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; ALL-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] +; ALL-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; ALL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(7) [[ALIGNEDADDR]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; ALL-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half +; ALL-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[F:%.*]] +; 
ALL-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16 +; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; ALL-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; ALL-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; ALL-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 +; ALL-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half +; ALL-NEXT: ret half [[TMP7]] +; + %p = getelementptr half, ptr addrspace(7) %ptr, i32 4 + %fadd = atomicrmw fadd ptr addrspace(7) %p, half %f syncscope("agent") seq_cst + ret half %fadd +} + +define half @buffer_atomicrmw_fadd_f16_align4_agent(ptr addrspace(7) %ptr, half %f) { +; ALL-LABEL: @buffer_atomicrmw_fadd_f16_align4_agent( +; ALL-NEXT: [[P:%.*]] = getelementptr half, ptr addrspace(7) [[PTR:%.*]], i32 4 +; ALL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(7) [[P]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 +; ALL-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half +; ALL-NEXT: [[NEW:%.*]] = fadd half [[TMP2]], [[F:%.*]] +; ALL-NEXT: [[TMP3:%.*]] = bitcast half [[NEW]] to i16 +; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 +; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 +; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] +; ALL-NEXT: 
[[TMP4:%.*]] = cmpxchg ptr addrspace(7) [[P]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; ALL-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half +; ALL-NEXT: ret half [[TMP5]] +; + %p = getelementptr half, ptr addrspace(7) %ptr, i32 4 + %fadd = atomicrmw fadd ptr addrspace(7) %p, half %f syncscope("agent") seq_cst, align 4 + ret half %fadd +} + +define bfloat @buffer_atomicrmw_fadd_bf16_agent(ptr addrspace(7) %ptr, bfloat %f) { +; ALL-LABEL: @buffer_atomicrmw_fadd_bf16_agent( +; ALL-NEXT: [[P:%.*]] = getelementptr bfloat, ptr addrspace(7) [[PTR:%.*]], i32 4 +; ALL-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[P]], i32 -4) +; ALL-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[P]] to i32 +; ALL-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 +; ALL-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 +; ALL-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]] +; ALL-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 +; ALL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(7) [[ALIGNEDADDR]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] +; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 +; ALL-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat +; ALL-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[F:%.*]] +; ALL-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 +; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 +; ALL-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] +; ALL-NEXT: 
[[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]] +; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]] +; ALL-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]] +; ALL-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16 +; ALL-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat +; ALL-NEXT: ret bfloat [[TMP7]] +; + %p = getelementptr bfloat, ptr addrspace(7) %ptr, i32 4 + %fadd = atomicrmw fadd ptr addrspace(7) %p, bfloat %f syncscope("agent") seq_cst + ret bfloat %fadd +} + +define bfloat @buffer_atomicrmw_fadd_bf16_align4_agent(ptr addrspace(7) %ptr, bfloat %f) { +; ALL-LABEL: @buffer_atomicrmw_fadd_bf16_align4_agent( +; ALL-NEXT: [[P:%.*]] = getelementptr bfloat, ptr addrspace(7) [[PTR:%.*]], i32 4 +; ALL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(7) [[P]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16 +; ALL-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat +; ALL-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[F:%.*]] +; ALL-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16 +; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32 +; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536 +; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]] +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(7) [[P]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 
} [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16 +; ALL-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat +; ALL-NEXT: ret bfloat [[TMP5]] +; + %p = getelementptr bfloat, ptr addrspace(7) %ptr, i32 4 + %fadd = atomicrmw fadd ptr addrspace(7) %p, bfloat %f syncscope("agent") seq_cst, align 4 + ret bfloat %fadd +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { strictfp } From f9472f4b84ba08d7ce966b5018f46533243fb2b9 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 10 Jun 2024 12:48:06 +0200 Subject: [PATCH 2/3] AMDGPU: Add some codegen tests for ptrmask with fat pointers --- llvm/test/CodeGen/AMDGPU/ptrmask.ll | 80 +++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/ptrmask.ll b/llvm/test/CodeGen/AMDGPU/ptrmask.ll index 7062270678933..ff0b95fe9ad41 100644 --- a/llvm/test/CodeGen/AMDGPU/ptrmask.ll +++ b/llvm/test/CodeGen/AMDGPU/ptrmask.ll @@ -65,6 +65,86 @@ define amdgpu_ps ptr addrspace(3) @s_ptrmask_local_variable_i32(ptr addrspace(3) ret ptr addrspace(3) %masked } +define ptr addrspace(7) @v_ptrmask_buffer_fat_ptr_variable_i32(ptr addrspace(7) %ptr, i32 %mask) { +; GCN-LABEL: v_ptrmask_buffer_fat_ptr_variable_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, v4, v5 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_ptrmask_buffer_fat_ptr_variable_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: v_and_b32_e32 v4, v4, v5 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %masked = call ptr 
addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) %ptr, i32 %mask) + ret ptr addrspace(7) %masked +} + +define ptr addrspace(7) @v_ptrmask_buffer_fat_ptr_i32_neg8(ptr addrspace(7) %ptr) { +; GCN-LABEL: v_ptrmask_buffer_fat_ptr_i32_neg8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, -8, v4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_ptrmask_buffer_fat_ptr_i32_neg8: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: v_and_b32_e32 v4, -8, v4 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %masked = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) %ptr, i32 -8) + ret ptr addrspace(7) %masked +} + +define amdgpu_ps ptr addrspace(7) @s_ptrmask_buffer_fat_ptr_variable_i32(ptr addrspace(7) inreg %ptr, i32 inreg %mask) { +; GCN-LABEL: s_ptrmask_buffer_fat_ptr_variable_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s1, s3 +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_and_b32 s4, s6, s7 +; GCN-NEXT: s_mov_b32 s2, s8 +; GCN-NEXT: s_mov_b32 s3, s5 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_ptrmask_buffer_fat_ptr_variable_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s8, s4 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_and_b32 s4, s6, s7 +; GFX10PLUS-NEXT: s_mov_b32 s2, s8 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: ; return to shader part epilog + %masked = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) %ptr, i32 %mask) + ret ptr addrspace(7) %masked +} + +define amdgpu_ps ptr addrspace(7) @s_ptrmask_buffer_fat_ptr_i32_neg8(ptr addrspace(7) inreg %ptr) { +; GCN-LABEL: s_ptrmask_buffer_fat_ptr_i32_neg8: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s7, s4 +; GCN-NEXT: s_mov_b32 s1, s3 +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_and_b32 s4, s6, -8 +; GCN-NEXT: s_mov_b32 s2, s7 +; GCN-NEXT: s_mov_b32 s3, s5 +; GCN-NEXT: ; return 
to shader part epilog +; +; GFX10PLUS-LABEL: s_ptrmask_buffer_fat_ptr_i32_neg8: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s7, s4 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_and_b32 s4, s6, -8 +; GFX10PLUS-NEXT: s_mov_b32 s2, s7 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: ; return to shader part epilog + %masked = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) %ptr, i32 -8) + ret ptr addrspace(7) %masked +} + declare ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3), i32) #0 declare ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1), i64) #0 From 1b0e2746fbaac2cc630152b2c8da02f1a5d96f33 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 10 Jun 2024 12:55:09 +0200 Subject: [PATCH 3/3] AMDGPU: Add more tests for ptrmask with fat pointers --- llvm/test/CodeGen/AMDGPU/ptrmask.ll | 74 +++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/ptrmask.ll b/llvm/test/CodeGen/AMDGPU/ptrmask.ll index ff0b95fe9ad41..8594549318dda 100644 --- a/llvm/test/CodeGen/AMDGPU/ptrmask.ll +++ b/llvm/test/CodeGen/AMDGPU/ptrmask.ll @@ -145,6 +145,80 @@ define amdgpu_ps ptr addrspace(7) @s_ptrmask_buffer_fat_ptr_i32_neg8(ptr addrspa ret ptr addrspace(7) %masked } +define ptr addrspace(8) @v_ptrmask_buffer_resource_variable_i128(ptr addrspace(8) %ptr, i128 %mask) { +; GCN-LABEL: v_ptrmask_buffer_resource_variable_i128: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, v1, v5 +; GCN-NEXT: v_and_b32_e32 v0, v0, v4 +; GCN-NEXT: v_and_b32_e32 v3, v3, v7 +; GCN-NEXT: v_and_b32_e32 v2, v2, v6 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_ptrmask_buffer_resource_variable_i128: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX10PLUS-NEXT: v_and_b32_e32 v1, v1, v5 +; GFX10PLUS-NEXT: v_and_b32_e32 v2, v2, v6 +; 
GFX10PLUS-NEXT: v_and_b32_e32 v3, v3, v7 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %masked = call ptr addrspace(8) @llvm.ptrmask.p8.i128(ptr addrspace(8) %ptr, i128 %mask) + ret ptr addrspace(8) %masked +} + +define ptr addrspace(8) @v_ptrmask_buffer_resource_variable_i128_neg8(ptr addrspace(8) %ptr) { +; GCN-LABEL: v_ptrmask_buffer_resource_variable_i128_neg8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, -8, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_ptrmask_buffer_resource_variable_i128_neg8: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: v_and_b32_e32 v0, -8, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %masked = call ptr addrspace(8) @llvm.ptrmask.p8.i128(ptr addrspace(8) %ptr, i128 -8) + ret ptr addrspace(8) %masked +} + +define amdgpu_ps ptr addrspace(8) @s_ptrmask_buffer_resource_variable_i128(ptr addrspace(8) inreg %ptr, i128 inreg %mask) { +; GCN-LABEL: s_ptrmask_buffer_resource_variable_i128: +; GCN: ; %bb.0: +; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[6:7] +; GCN-NEXT: s_and_b64 s[2:3], s[4:5], s[8:9] +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_ptrmask_buffer_resource_variable_i128: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[2:3], s[6:7] +; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[4:5], s[8:9] +; GFX10PLUS-NEXT: ; return to shader part epilog + %masked = call ptr addrspace(8) @llvm.ptrmask.p8.i128(ptr addrspace(8) %ptr, i128 %mask) + ret ptr addrspace(8) %masked +} + +define amdgpu_ps ptr addrspace(8) @s_ptrmask_buffer_resource_variable_i128_neg8(ptr addrspace(8) inreg %ptr) { +; GCN-LABEL: s_ptrmask_buffer_resource_variable_i128_neg8: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s1, s3 +; GCN-NEXT: s_and_b32 s0, s2, -8 +; GCN-NEXT: s_mov_b32 s2, s4 +; GCN-NEXT: s_mov_b32 s3, s5 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_ptrmask_buffer_resource_variable_i128_neg8: +; 
GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_and_b32 s0, s2, -8 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: ; return to shader part epilog + %masked = call ptr addrspace(8) @llvm.ptrmask.p8.i128(ptr addrspace(8) %ptr, i128 -8) + ret ptr addrspace(8) %masked +} + declare ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3), i32) #0 declare ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1), i64) #0