[NVPTX] Remove nvvm.ldg.global.* intrinsics #112834

Merged (2 commits, Oct 27, 2024)
47 changes: 30 additions & 17 deletions clang/lib/CodeGen/CGBuiltin.cpp
@@ -20485,8 +20485,8 @@ static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) {
 #undef MMA_VARIANTS_B1_XOR
 }

-static Value *MakeLdgLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
-                         const CallExpr *E) {
+static Value *MakeLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
+                      const CallExpr *E) {
   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
   QualType ArgType = E->getArg(0)->getType();
   clang::CharUnits Align = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
@@ -20496,6 +20496,21 @@ static Value *MakeLdgLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
       {Ptr, ConstantInt::get(CGF.Builder.getInt32Ty(), Align.getQuantity())});
 }

+static Value *MakeLdg(CodeGenFunction &CGF, const CallExpr *E) {
+  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
+  QualType ArgType = E->getArg(0)->getType();
+  clang::CharUnits AlignV = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
+  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());
+
+  // Use addrspace(1) for NVPTX ADDRESS_SPACE_GLOBAL
+  auto *ASC = CGF.Builder.CreateAddrSpaceCast(Ptr, CGF.Builder.getPtrTy(1));
+  auto *LD = CGF.Builder.CreateAlignedLoad(ElemTy, ASC, AlignV.getAsAlign());
+  MDNode *MD = MDNode::get(CGF.Builder.getContext(), {});
+  LD->setMetadata(LLVMContext::MD_invariant_load, MD);
+
+  return LD;
+}
+
 static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF,
                                const CallExpr *E) {
   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
@@ -20529,9 +20544,11 @@ static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
     return nullptr;
   }

-  if (IntrinsicID == Intrinsic::nvvm_ldg_global_f ||
-      IntrinsicID == Intrinsic::nvvm_ldu_global_f)
-    return MakeLdgLdu(IntrinsicID, CGF, E);
+  if (BuiltinID == NVPTX::BI__nvvm_ldg_h || BuiltinID == NVPTX::BI__nvvm_ldg_h2)
+    return MakeLdg(CGF, E);
+
+  if (IntrinsicID == Intrinsic::nvvm_ldu_global_f)
+    return MakeLdu(IntrinsicID, CGF, E);

   SmallVector<Value *, 16> Args;
   auto *F = CGF.CGM.getIntrinsic(IntrinsicID);
@@ -20668,16 +20685,15 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
   case NVPTX::BI__nvvm_ldg_ul2:
   case NVPTX::BI__nvvm_ldg_ull:
   case NVPTX::BI__nvvm_ldg_ull2:
-    // PTX Interoperability section 2.2: "For a vector with an even number of
-    // elements, its alignment is set to number of elements times the alignment
-    // of its member: n*alignof(t)."
-    return MakeLdgLdu(Intrinsic::nvvm_ldg_global_i, *this, E);
   case NVPTX::BI__nvvm_ldg_f:
   case NVPTX::BI__nvvm_ldg_f2:
   case NVPTX::BI__nvvm_ldg_f4:
   case NVPTX::BI__nvvm_ldg_d:
   case NVPTX::BI__nvvm_ldg_d2:
-    return MakeLdgLdu(Intrinsic::nvvm_ldg_global_f, *this, E);
+    // PTX Interoperability section 2.2: "For a vector with an even number of
+    // elements, its alignment is set to number of elements times the alignment
+    // of its member: n*alignof(t)."
+    return MakeLdg(*this, E);

   case NVPTX::BI__nvvm_ldu_c:
   case NVPTX::BI__nvvm_ldu_sc:
@@ -20708,13 +20724,13 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
   case NVPTX::BI__nvvm_ldu_ul2:
   case NVPTX::BI__nvvm_ldu_ull:
   case NVPTX::BI__nvvm_ldu_ull2:
-    return MakeLdgLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
+    return MakeLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
   case NVPTX::BI__nvvm_ldu_f:
   case NVPTX::BI__nvvm_ldu_f2:
   case NVPTX::BI__nvvm_ldu_f4:
   case NVPTX::BI__nvvm_ldu_d:
   case NVPTX::BI__nvvm_ldu_d2:
-    return MakeLdgLdu(Intrinsic::nvvm_ldu_global_f, *this, E);
+    return MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E);

   case NVPTX::BI__nvvm_atom_cta_add_gen_i:
   case NVPTX::BI__nvvm_atom_cta_add_gen_l:
@@ -21188,14 +21204,11 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
     return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16x2, BuiltinID, E,
                         *this);
   case NVPTX::BI__nvvm_ldg_h:
-    return MakeHalfType(Intrinsic::nvvm_ldg_global_f, BuiltinID, E, *this);
   case NVPTX::BI__nvvm_ldg_h2:
-    return MakeHalfType(Intrinsic::nvvm_ldg_global_f, BuiltinID, E, *this);
+    return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this);
   case NVPTX::BI__nvvm_ldu_h:
+  case NVPTX::BI__nvvm_ldu_h2:
     return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
-  case NVPTX::BI__nvvm_ldu_h2: {
-    return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
-  }
   case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
     return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
                        Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
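Note (editorial, not part of the diff): a minimal sketch of the IR the new MakeLdg path produces for a scalar `__nvvm_ldg_f(p)` call, assuming `p` is a generic (addrspace(0)) pointer; the value names and the empty metadata node `!0` are illustrative:

    ; CUDA: float v = __nvvm_ldg_f(p);
    %cast = addrspacecast ptr %p to ptr addrspace(1)   ; cast to the NVPTX global space
    %v = load float, ptr addrspace(1) %cast, align 4, !invariant.load !0
    ...
    !0 = !{}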
4 changes: 2 additions & 2 deletions clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
@@ -52,8 +52,8 @@ typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
 // CHECK: call <2 x half> @llvm.nvvm.fmax.ftz.xorsign.abs.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}})
 // CHECK: call <2 x half> @llvm.nvvm.fmax.nan.xorsign.abs.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}})
 // CHECK: call <2 x half> @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}})
-// CHECK: call half @llvm.nvvm.ldg.global.f.f16.p0(ptr {{.*}}, i32 2)
-// CHECK: call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr {{.*}}, i32 4)
+// CHECK: load half, ptr addrspace(1) {{.*}}, align 2, !invariant.load
+// CHECK: load <2 x half>, ptr addrspace(1) {{.*}}, align 4, !invariant.load
 // CHECK: call half @llvm.nvvm.ldu.global.f.f16.p0(ptr {{.*}}, i32 2)
 // CHECK: call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p0(ptr {{.*}}, i32 4)
 __device__ void nvvm_native_half_types(void *a, void*b, void*c, __fp16* out) {
4 changes: 2 additions & 2 deletions clang/test/CodeGen/builtins-nvptx-native-half-type.c
@@ -177,9 +177,9 @@ typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));

 // CHECK-LABEL: nvvm_ldg_native_half_types
 __device__ void nvvm_ldg_native_half_types(const void *p) {
-  // CHECK: call half @llvm.nvvm.ldg.global.f.f16.p0
+  // CHECK: load half, ptr addrspace(1) {{.*}}, align 2, !invariant.load
   __nvvm_ldg_h((const __fp16 *)p);
-  // CHECK: call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0
+  // CHECK: load <2 x half>, ptr addrspace(1) {{.*}}, align 4, !invariant.load
   __nvvm_ldg_h2((const __fp16v2 *)p);
 }

72 changes: 36 additions & 36 deletions clang/test/CodeGen/builtins-nvptx.c
@@ -598,33 +598,33 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df,

 // CHECK-LABEL: nvvm_ldg
 __device__ void nvvm_ldg(const void *p) {
-  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
-  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
-  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
+  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
+  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
+  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
   __nvvm_ldg_c((const char *)p);
   __nvvm_ldg_uc((const unsigned char *)p);
   __nvvm_ldg_sc((const signed char *)p);

-  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
-  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
+  // CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
+  // CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
   __nvvm_ldg_s((const short *)p);
   __nvvm_ldg_us((const unsigned short *)p);

-  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
-  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
+  // CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
   __nvvm_ldg_i((const int *)p);
   __nvvm_ldg_ui((const unsigned int *)p);

-  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
-  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
-  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
-  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
+  // LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   __nvvm_ldg_l((const long *)p);
   __nvvm_ldg_ul((const unsigned long *)p);

-  // CHECK: call float @llvm.nvvm.ldg.global.f.f32.p0(ptr {{%[0-9]+}}, i32 4)
+  // CHECK: load float, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
   __nvvm_ldg_f((const float *)p);
-  // CHECK: call double @llvm.nvvm.ldg.global.f.f64.p0(ptr {{%[0-9]+}}, i32 8)
+  // CHECK: load double, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   __nvvm_ldg_d((const double *)p);

   // In practice, the pointers we pass to __ldg will be aligned as appropriate
@@ -636,79 +636,79 @@ __device__ void nvvm_ldg(const void *p) {
   // elements, its alignment is set to number of elements times the alignment of
   // its member: n*alignof(t)."

-  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
-  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
-  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
+  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
+  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
+  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
   typedef char char2 __attribute__((ext_vector_type(2)));
   typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
   typedef signed char schar2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_c2((const char2 *)p);
   __nvvm_ldg_uc2((const uchar2 *)p);
   __nvvm_ldg_sc2((const schar2 *)p);

-  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
-  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
-  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
+  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
   typedef char char4 __attribute__((ext_vector_type(4)));
   typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
   typedef signed char schar4 __attribute__((ext_vector_type(4)));
   __nvvm_ldg_c4((const char4 *)p);
   __nvvm_ldg_uc4((const uchar4 *)p);
   __nvvm_ldg_sc4((const schar4 *)p);

-  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
-  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
+  // CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
   typedef short short2 __attribute__((ext_vector_type(2)));
   typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_s2((const short2 *)p);
   __nvvm_ldg_us2((const ushort2 *)p);

-  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
-  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
+  // CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   typedef short short4 __attribute__((ext_vector_type(4)));
   typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
   __nvvm_ldg_s4((const short4 *)p);
   __nvvm_ldg_us4((const ushort4 *)p);

-  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
-  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
+  // CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   typedef int int2 __attribute__((ext_vector_type(2)));
   typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_i2((const int2 *)p);
   __nvvm_ldg_ui2((const uint2 *)p);

-  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
-  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
+  // CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
+  // CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef int int4 __attribute__((ext_vector_type(4)));
   typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
   __nvvm_ldg_i4((const int4 *)p);
   __nvvm_ldg_ui4((const uint4 *)p);

-  // LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
-  // LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
-  // LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
-  // LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
+  // LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
+  // LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef long long2 __attribute__((ext_vector_type(2)));
   typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_l2((const long2 *)p);
   __nvvm_ldg_ul2((const ulong2 *)p);

-  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
-  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
+  // CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
+  // CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef long long longlong2 __attribute__((ext_vector_type(2)));
   typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_ll2((const longlong2 *)p);
   __nvvm_ldg_ull2((const ulonglong2 *)p);

-  // CHECK: call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr {{%[0-9]+}}, i32 8)
+  // CHECK: load <2 x float>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   typedef float float2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_f2((const float2 *)p);

-  // CHECK: call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0(ptr {{%[0-9]+}}, i32 16)
+  // CHECK: load <4 x float>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef float float4 __attribute__((ext_vector_type(4)));
   __nvvm_ldg_f4((const float4 *)p);

-  // CHECK: call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr {{%[0-9]+}}, i32 16)
+  // CHECK: load <2 x double>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef double double2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_d2((const double2 *)p);
 }
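For context (an assumption about existing backend behavior, not something this diff asserts): loads of the shape the updated CHECK lines match, addrspace(1) plus !invariant.load, are what the NVPTX backend already selects to the non-coherent ld.global.nc instruction, so the emitted PTX should be unchanged. A hedged sketch with illustrative names:

    ; an invariant load from the global address space...
    %v = load i32, ptr addrspace(1) %gp, align 4, !invariant.load !0
    ; ...is expected to select to PTX roughly like: ld.global.nc.u32 %r1, [%rd1];
    !0 = !{}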
9 changes: 8 additions & 1 deletion llvm/docs/ReleaseNotes.md
@@ -93,7 +93,14 @@ Changes to the LLVM IR
   * `llvm.nvvm.ptr.shared.to.gen`
   * `llvm.nvvm.ptr.constant.to.gen`
   * `llvm.nvvm.ptr.local.to.gen`

+* Remove the following intrinsics, which can be replaced with a load from
+  addrspace(1) with `!invariant.load` metadata:
+
+  * `llvm.nvvm.ldg.global.i`
+  * `llvm.nvvm.ldg.global.f`
+  * `llvm.nvvm.ldg.global.p`
+
 * Operand bundle values can now be metadata strings.

 Changes to LLVM infrastructure
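A migration sketch for out-of-tree IR producers (hand-written for illustration, not taken from the release notes; %p and !0 are hypothetical names):

    ; Before: alignment was carried as the trailing i32 operand.
    %v = call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr %p, i32 4)

    ; After: an invariant load from the global address space.
    %g = addrspacecast ptr %p to ptr addrspace(1)
    %v = load i32, ptr addrspace(1) %g, align 4, !invariant.load !0
    !0 = !{}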
18 changes: 3 additions & 15 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -42,6 +42,9 @@
 // * llvm.nvvm.ptr.shared.to.gen --> ibid.
 // * llvm.nvvm.ptr.constant.to.gen --> ibid.
 // * llvm.nvvm.ptr.local.to.gen --> ibid.
+// * llvm.nvvm.ldg.global.i --> load addrspace(1) !invariant.load
+// * llvm.nvvm.ldg.global.f --> ibid.
+// * llvm.nvvm.ldg.global.p --> ibid.

 def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
 def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
@@ -1595,21 +1598,6 @@ def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty],
   [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldu.global.p">;

-// Generated within nvvm. Use for ldg on sm_35 or later. Second arg is the
-// pointer's alignment.
-def int_nvvm_ldg_global_i : Intrinsic<[llvm_anyint_ty],
-  [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
-  "llvm.nvvm.ldg.global.i">;
-def int_nvvm_ldg_global_f : Intrinsic<[llvm_anyfloat_ty],
-  [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
-  "llvm.nvvm.ldg.global.f">;
-def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty],
-  [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
-  "llvm.nvvm.ldg.global.p">;

 // Used in nvvm internally to help address space opt and ptx code generation
 // This is for params that are passed to kernel functions by pointer by-val.
 def int_nvvm_ptr_gen_to_param: Intrinsic<[llvm_anyptr_ty],
14 changes: 14 additions & 0 deletions llvm/lib/IR/AutoUpgrade.cpp
@@ -37,6 +37,7 @@
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/CommandLine.h"
@@ -1301,6 +1302,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
(Name.consume_front("local") || Name.consume_front("shared") ||
Name.consume_front("global") || Name.consume_front("constant")) &&
Name.starts_with(".to.gen");
else if (Name.consume_front("ldg.global."))
// nvvm.ldg.global.{i,p,f}
Expand = (Name.starts_with("i.") || Name.starts_with("f.") ||
Name.starts_with("p."));
else
Expand = false;

@@ -2363,6 +2368,15 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
Name.consume_front("constant")) &&
Name.starts_with(".to.gen"))) {
Rep = Builder.CreateAddrSpaceCast(CI->getArgOperand(0), CI->getType());
} else if (Name.consume_front("ldg.global")) {
Value *Ptr = CI->getArgOperand(0);
Align PtrAlign = cast<ConstantInt>(CI->getArgOperand(1))->getAlignValue();
// Use addrspace(1) for NVPTX ADDRESS_SPACE_GLOBAL
Value *ASC = Builder.CreateAddrSpaceCast(Ptr, Builder.getPtrTy(1));
Instruction *LD = Builder.CreateAlignedLoad(CI->getType(), ASC, PtrAlign);
MDNode *MD = MDNode::get(Builder.getContext(), {});
LD->setMetadata(LLVMContext::MD_invariant_load, MD);
return LD;
} else {
Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
if (IID != Intrinsic::not_intrinsic &&
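To illustrate the upgrade path above (a sketch; the vector type and value names are hypothetical), old bitcode such as

    %v = call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0(ptr %p, i32 16)

should be rewritten by upgradeNVVMIntrinsicCall into

    %asc = addrspacecast ptr %p to ptr addrspace(1)
    %v = load <4 x float>, ptr addrspace(1) %asc, align 16, !invariant.load !0
    !0 = !{}

where the old i32 alignment operand becomes the load's align attribute via getAlignValue().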