[AMDGPU] Add target intrinsic for s_buffer_prefetch_data #107293
@llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-amdgpu

Author: Stanislav Mekhanoshin (rampitec)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/107293.diff

16 Files Affected:
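A rough usage sketch of the new builtin (the function name and shape are illustrative, not from this patch; it assumes a gfx12 target where __amdgpu_buffer_rsrc_t and the builtin are available — the OpenCL tests in the diff show the exact expected form):

// Prefetch the first `len` bytes described by a buffer resource.
// The offset operand must be a compile-time constant; `len` may be a
// uniform scalar value or a constant.
void prefetch_buffer_head(__amdgpu_buffer_rsrc_t rsrc, unsigned int len) {
  __builtin_amdgcn_s_buffer_prefetch_data(rsrc, /*offset=*/0, len);
}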
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index ab29ef38f7792f..77595e1b701336 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -448,6 +448,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vi", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_buffer_prefetch_data, "vQbIiUi", "nc", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl
index cd6bfbe647ff36..5d86a9b369429f 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl
@@ -22,3 +22,8 @@ kernel void builtins_amdgcn_s_barrier_signal_isfirst_err(global int* in, global
__builtin_amdgcn_s_barrier_wait(-1);
*out = *in;
}
+
+void test_s_buffer_prefetch_data(__amdgpu_buffer_rsrc_t rsrc, unsigned int off)
+{
+ __builtin_amdgcn_s_buffer_prefetch_data(rsrc, off, 31); // expected-error {{'__builtin_amdgcn_s_buffer_prefetch_data' must be a constant integer}}
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
index d9ec258e644c9d..8251ab336fe35b 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -256,4 +256,21 @@ void test_s_ttracedata_imm()
__builtin_amdgcn_s_ttracedata_imm(1);
}
-
+// CHECK-LABEL: @test_s_buffer_prefetch_data(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
+// CHECK-NEXT: [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr addrspace(5) [[RSRC_ADDR]], align 16
+// CHECK-NEXT: store i32 [[LEN:%.*]], ptr addrspace(5) [[LEN_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[RSRC_ADDR]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[LEN_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) [[TMP0]], i32 128, i32 [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[RSRC_ADDR]], align 16
+// CHECK-NEXT: call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) [[TMP2]], i32 0, i32 31)
+// CHECK-NEXT: ret void
+//
+void test_s_buffer_prefetch_data(__amdgpu_buffer_rsrc_t rsrc, unsigned int len)
+{
+ __builtin_amdgcn_s_buffer_prefetch_data(rsrc, 128, len);
+ __builtin_amdgcn_s_buffer_prefetch_data(rsrc, 0, 31);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index dc13a35c66f9ab..57b87f6d29a5f4 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1723,6 +1723,15 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_ptr_buffer_load_lds : AMDGPUStructPtrBufferLoadLDS;
+def int_amdgcn_s_buffer_prefetch_data : DefaultAttrsIntrinsic <
+ [],
+ [AMDGPUBufferRsrcTy, // rsrc(SGPR)
+ llvm_i32_ty, // offset (imm)
+ llvm_i32_ty], // len (SGPR/imm)
+ [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<1>>], "", [SDNPMemOperand]>,
+ AMDGPURsrcIntrinsic<0>,
+ ClangBuiltin<"__builtin_amdgcn_s_buffer_prefetch_data">;
+
} // defset AMDGPUBufferIntrinsics
// Uses that do not set the done bit should set IntrWriteMem on the
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 118271af879937..278d3536add916 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -296,6 +296,7 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SBYTE, SIsbuffer_load_byte>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
+def : GINodeEquiv<G_AMDGPU_S_BUFFER_PREFETCH, SIsbuffer_prefetch>;
class GISelSop2Pat <
SDPatternOperator node,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 015dbc79ef9e4d..81852f6a130584 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5545,6 +5545,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
+ NODE_NAME_CASE(SBUFFER_PREFETCH_DATA)
NODE_NAME_CASE(BUFFER_STORE)
NODE_NAME_CASE(BUFFER_STORE_BYTE)
NODE_NAME_CASE(BUFFER_STORE_SHORT)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index dd9d97bd593bda..18b5c388f32932 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -589,6 +589,7 @@ enum NodeType : unsigned {
SBUFFER_LOAD_UBYTE,
SBUFFER_LOAD_SHORT,
SBUFFER_LOAD_USHORT,
+ SBUFFER_PREFETCH_DATA,
BUFFER_STORE,
BUFFER_STORE_BYTE,
BUFFER_STORE_SHORT,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3fcb364fc2c536..4dfd3f087c1ae4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5237,7 +5237,8 @@ getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
- std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
+ std::optional<uint64_t> OffsetVal =
+ Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
if (!OffsetVal)
return {};
@@ -5541,7 +5542,12 @@ void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
- MIB.addImm(MI.getOperand(OpIdx).getImm());
+ const MachineOperand &Op = MI.getOperand(OpIdx);
+ int64_t Imm;
+ if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
+ MIB.addImm(Imm);
+ else
+ MIB.addImm(Op.getImm());
}
void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 3f6486d44f0ee5..56f4efda7925f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -6797,6 +6797,18 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
return true;
}
+bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
+ MachineIRBuilder &B = Helper.MIRBuilder;
+ GISelChangeObserver &Observer = Helper.Observer;
+ Observer.changingInstr(MI);
+ MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
+ MI.removeOperand(0); // Remove intrinsic ID
+ castBufferRsrcArgToV4I32(MI, B, 0);
+ Observer.changedInstr(MI);
+ return true;
+}
+
// TODO: Move to selection
bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
MachineRegisterInfo &MRI,
@@ -7485,6 +7497,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
return legalizeLaneOp(Helper, MI, IntrID);
+ case Intrinsic::amdgcn_s_buffer_prefetch_data:
+ return legalizeSBufferPrefetch(Helper, MI);
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index a815e87a7da35f..84470dc75b60ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -227,6 +227,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const;
+ bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const;
+
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 4737a322c255f4..3c0b57f16d1b76 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3101,6 +3101,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyMappingSBufferLoad(B, OpdMapper);
return;
}
+ case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
+ constrainOpWithReadfirstlane(B, MI, 0);
+ constrainOpWithReadfirstlane(B, MI, 2);
+ return;
case AMDGPU::G_INTRINSIC:
case AMDGPU::G_INTRINSIC_CONVERGENT: {
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
@@ -4454,6 +4458,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
break;
}
+ case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
+ OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ break;
case AMDGPU::G_INTRINSIC:
case AMDGPU::G_INTRINSIC_CONVERGENT: {
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 81b52935ddf397..64bf82653144ac 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1210,9 +1210,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = RsrcArg;
}
- auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
- if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
- Info.flags |= MachineMemOperand::MOVolatile;
+ bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
+ if (!IsSPrefetch) {
+ auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
+ if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
+ Info.flags |= MachineMemOperand::MOVolatile;
+ }
+
Info.flags |= MachineMemOperand::MODereferenceable;
if (ME.onlyReadsMemory()) {
if (RsrcIntr->IsImage) {
@@ -1251,16 +1255,18 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOStore;
} else {
- // Atomic or NoReturn Sampler
+ // Atomic, NoReturn Sampler or prefetch
Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
ISD::INTRINSIC_W_CHAIN;
- Info.flags |= MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable;
+ Info.flags |=
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
+
+ if (!IsSPrefetch)
+ Info.flags |= MachineMemOperand::MOStore;
switch (IntrID) {
default:
- if (RsrcIntr->IsImage && BaseOpcode->NoReturn) {
+ if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
// Fake memory access type for no return sampler intrinsics
Info.memVT = MVT::i32;
} else {
@@ -9921,6 +9927,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
return SDValue(NewMI, 0);
}
+ case Intrinsic::amdgcn_s_buffer_prefetch_data: {
+ SDValue Ops[] = {
+ Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
+ Op.getOperand(3), // offset
+ Op.getOperand(4), // length
+ };
+
+ MemSDNode *M = cast<MemSDNode>(Op);
+ return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
+ Op->getVTList(), Ops, M->getMemoryVT(),
+ M->getMemOperand());
+ }
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index faa8ca282e7ab8..2ccc16a8f2685e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -312,6 +312,14 @@ class isIntType<ValueType SrcVT> {
bit ret = !and(SrcVT.isInteger, !ne(SrcVT.Value, i1.Value));
}
+def SDTSBufferPrefetch : SDTypeProfile<0, 3,
+ [SDTCisVT<0, v4i32>, // rsrc
+ SDTCisVT<1, i32>, // offset(imm)
+ SDTCisVT<2, i32>]>; // length
+
+def SIsbuffer_prefetch : SDNode<"AMDGPUISD::SBUFFER_PREFETCH_DATA", SDTSBufferPrefetch,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+
//===----------------------------------------------------------------------===//
// SDNodes PatFrags for loads/stores with a glue input.
// This is for SDNodes and PatFrag for local loads and stores to
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 69e1b9a38324f2..fe04f52ef9108e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3978,6 +3978,16 @@ def G_AMDGPU_S_BUFFER_LOAD_UBYTE : SBufferLoadInstruction;
def G_AMDGPU_S_BUFFER_LOAD_SSHORT : SBufferLoadInstruction;
def G_AMDGPU_S_BUFFER_LOAD_USHORT : SBufferLoadInstruction;
+class SBufferPrefetchInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins type0:$rsrc, untyped_imm_0:$offset, type1:$len);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+def G_AMDGPU_S_BUFFER_PREFETCH : SBufferPrefetchInstruction;
+
def G_AMDGPU_S_MUL_U64_U32 : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src0, type0:$src1);
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 9fc570bb85f24e..bb160ffff7c6b6 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -1152,6 +1152,18 @@ multiclass SMPrefetchPat<string type, TImmLeaf cache_type> {
defm : SMPrefetchPat<"INST", i32imm_zero>;
defm : SMPrefetchPat<"DATA", i32imm_one>;
+let SubtargetPredicate = isGFX12Plus in {
+ def : GCNPat <
+ (SIsbuffer_prefetch v4i32:$sbase, (SMRDBufferImm i32:$offset), (i32 SReg_32:$len)),
+ (S_BUFFER_PREFETCH_DATA SReg_128:$sbase, i32imm:$offset, $len, 0)
+ >;
+
+ def : GCNPat <
+ (SIsbuffer_prefetch v4i32:$sbase, (SMRDBufferImm i32:$offset), imm:$len),
+ (S_BUFFER_PREFETCH_DATA SReg_128:$sbase, i32imm:$offset, (i32 SGPR_NULL), (as_i8timm $len))
+ >;
+} // End let SubtargetPredicate = isGFX12Plus
+
//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.prefetch.data.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.prefetch.data.ll
new file mode 100644
index 00000000000000..88b6df43152352
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.prefetch.data.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GCN %s
+
+declare void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) %rsrc, i32 %offset, i32 %len)
+
+define amdgpu_ps void @buffer_prefetch_data_imm_offset_sgpr_len(ptr addrspace(8) inreg %rsrc, i32 inreg %len) {
+; GCN-LABEL: buffer_prefetch_data_imm_offset_sgpr_len:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_buffer_prefetch_data s[0:3], 0x80, s4, 0
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) inreg %rsrc, i32 128, i32 %len)
+ ret void
+}
+
+define amdgpu_ps void @buffer_prefetch_data_imm_offset_imm_len(ptr addrspace(8) inreg %rsrc) {
+; GCN-LABEL: buffer_prefetch_data_imm_offset_imm_len:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_buffer_prefetch_data s[0:3], 0x0, null, 31
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) inreg %rsrc, i32 0, i32 31)
+ ret void
+}
+
+define amdgpu_ps void @buffer_prefetch_data_imm_offset_vgpr_len(ptr addrspace(8) inreg %rsrc, i32 %len) {
+; GCN-LABEL: buffer_prefetch_data_imm_offset_vgpr_len:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_readfirstlane_b32 s4, v0
+; GCN-NEXT: s_buffer_prefetch_data s[0:3], 0x80, s4, 0
+; GCN-NEXT: s_endpgm
+entry:
+ tail call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) inreg %rsrc, i32 128, i32 %len)
+ ret void
+}
// For non-global address space preserve the chain and remove the call.
if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
  return Op.getOperand(0);
return Op;
I'd expect the private/local cases to be an error. Also collectFlatAddressOperands should handle this, to get the flat->global optimization in InferAddressSpaces
Are you sure you want this? Prefetch is a hint; I did not want a hard error, just to drop the nonsensical case. After a round of inlining and address-space inference the address may turn out to be non-global, but should that fail to compile?
Besides, that code is not part of this change.
Infer can just not do the change if the resulting address space isn't global
In the context of this change, what do I have to do? This is code the change does not touch.
I understand that this change is very similar to the previous one, but the key word here is buffer. It has nothing to do with InferAddressSpaces.
I think the parent needs some revision for global/flat/infer handling
Do you like this more? #107624
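For the parent (flat-pointer) prefetch this thread refers to, a minimal sketch of what the suggested collectFlatAddressOperands handling could look like, assuming the parent intrinsic is llvm.amdgcn.s.prefetch.data with the pointer as operand 0; this illustrates the reviewer's suggestion and is not code from either patch (existing cases in the switch are omitted):

// AMDGPUTargetTransformInfo.cpp (sketch only)
bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::amdgcn_s_prefetch_data:
    // Expose the flat pointer operand so InferAddressSpaces can rewrite a
    // flat prefetch into a global one when the address space is provable;
    // rewriteIntrinsicWithAddressSpace could then drop the call for address
    // spaces the instruction cannot prefetch.
    OpIndexes.push_back(0);
    return true;
  default:
    return false;
  }
}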