
Conversation

AlexMaclean (Member)

Remove these intrinsics which can be better represented by load instructions with !invariant.load metadata:

  • llvm.nvvm.ldg.global.i
  • llvm.nvvm.ldg.global.f
  • llvm.nvvm.ldg.global.p
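
For reference, the replacement is an addrspacecast of the incoming pointer to the global address space followed by an ordinary aligned load carrying !invariant.load metadata. A minimal IR sketch of the before/after (value names and the !0 metadata id are illustrative, not taken from the patch):

  ; before: the alignment is passed as the second intrinsic operand
  %v = call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr %p, i32 4)

  ; after: plain load from addrspace(1), marked invariant
  %g = addrspacecast ptr %p to ptr addrspace(1)
  %v = load i32, ptr addrspace(1) %g, align 4, !invariant.load !0

  ; module-level empty metadata node referenced above
  !0 = !{}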

AlexMaclean self-assigned this Oct 18, 2024
llvmbot added the clang, clang:codegen, backend:NVPTX, and llvm:ir labels Oct 18, 2024
llvmbot (Member) commented Oct 18, 2024

@llvm/pr-subscribers-clang

Author: Alex MacLean (AlexMaclean)

Changes

Remove these intrinsics which can be better represented by load instructions with !invariant.load metadata:

  • llvm.nvvm.ldg.global.i
  • llvm.nvvm.ldg.global.f
  • llvm.nvvm.ldg.global.p

Patch is 40.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112834.diff

10 Files Affected:

  • (modified) clang/lib/CodeGen/CGBuiltin.cpp (+29-16)
  • (modified) clang/test/CodeGen/builtins-nvptx-native-half-type-native.c (+2-2)
  • (modified) clang/test/CodeGen/builtins-nvptx-native-half-type.c (+2-2)
  • (modified) clang/test/CodeGen/builtins-nvptx.c (+36-36)
  • (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+3-15)
  • (modified) llvm/lib/IR/AutoUpgrade.cpp (+14)
  • (modified) llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (+69-120)
  • (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+2-53)
  • (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.h (-2)
  • (modified) llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll (+31)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index f6d7db2c204c12..3b42977b578e15 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -20473,7 +20473,7 @@ static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) {
 #undef MMA_VARIANTS_B1_XOR
 }
 
-static Value *MakeLdgLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
+static Value *MakeLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
                          const CallExpr *E) {
   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
   QualType ArgType = E->getArg(0)->getType();
@@ -20484,6 +20484,21 @@ static Value *MakeLdgLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
       {Ptr, ConstantInt::get(CGF.Builder.getInt32Ty(), Align.getQuantity())});
 }
 
+static Value *MakeLdg(CodeGenFunction &CGF, const CallExpr *E) {
+  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
+  QualType ArgType = E->getArg(0)->getType();
+  clang::CharUnits AlignV = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
+  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());
+
+  // Use addrspace(1) for NVPTX ADDRESS_SPACE_GLOBAL
+  auto *ASC = CGF.Builder.CreateAddrSpaceCast(Ptr, CGF.Builder.getPtrTy(1));
+  auto *LD = CGF.Builder.CreateAlignedLoad(ElemTy, ASC, AlignV.getAsAlign());
+  MDNode *MD = MDNode::get(CGF.Builder.getContext(), {});
+  LD->setMetadata(LLVMContext::MD_invariant_load, MD);
+
+  return LD;
+}
+
 static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF,
                                const CallExpr *E) {
   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
@@ -20517,9 +20532,11 @@ static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
     return nullptr;
   }
 
-  if (IntrinsicID == Intrinsic::nvvm_ldg_global_f ||
-      IntrinsicID == Intrinsic::nvvm_ldu_global_f)
-    return MakeLdgLdu(IntrinsicID, CGF, E);
+  if (BuiltinID == NVPTX::BI__nvvm_ldg_h || BuiltinID == NVPTX::BI__nvvm_ldg_h2)
+    return MakeLdg(CGF, E);
+
+  if (IntrinsicID == Intrinsic::nvvm_ldu_global_f)
+    return MakeLdu(IntrinsicID, CGF, E);
 
   SmallVector<Value *, 16> Args;
   auto *F = CGF.CGM.getIntrinsic(IntrinsicID);
@@ -20656,16 +20673,15 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
   case NVPTX::BI__nvvm_ldg_ul2:
   case NVPTX::BI__nvvm_ldg_ull:
   case NVPTX::BI__nvvm_ldg_ull2:
-    // PTX Interoperability section 2.2: "For a vector with an even number of
-    // elements, its alignment is set to number of elements times the alignment
-    // of its member: n*alignof(t)."
-    return MakeLdgLdu(Intrinsic::nvvm_ldg_global_i, *this, E);
   case NVPTX::BI__nvvm_ldg_f:
   case NVPTX::BI__nvvm_ldg_f2:
   case NVPTX::BI__nvvm_ldg_f4:
   case NVPTX::BI__nvvm_ldg_d:
   case NVPTX::BI__nvvm_ldg_d2:
-    return MakeLdgLdu(Intrinsic::nvvm_ldg_global_f, *this, E);
+    // PTX Interoperability section 2.2: "For a vector with an even number of
+    // elements, its alignment is set to number of elements times the alignment
+    // of its member: n*alignof(t)."
+    return MakeLdg(*this, E);
 
   case NVPTX::BI__nvvm_ldu_c:
   case NVPTX::BI__nvvm_ldu_sc:
@@ -20696,13 +20712,13 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
   case NVPTX::BI__nvvm_ldu_ul2:
   case NVPTX::BI__nvvm_ldu_ull:
   case NVPTX::BI__nvvm_ldu_ull2:
-    return MakeLdgLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
+    return MakeLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
   case NVPTX::BI__nvvm_ldu_f:
   case NVPTX::BI__nvvm_ldu_f2:
   case NVPTX::BI__nvvm_ldu_f4:
   case NVPTX::BI__nvvm_ldu_d:
   case NVPTX::BI__nvvm_ldu_d2:
-    return MakeLdgLdu(Intrinsic::nvvm_ldu_global_f, *this, E);
+    return MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E);
 
   case NVPTX::BI__nvvm_atom_cta_add_gen_i:
   case NVPTX::BI__nvvm_atom_cta_add_gen_l:
@@ -21176,14 +21192,11 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
     return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16x2, BuiltinID, E,
                         *this);
   case NVPTX::BI__nvvm_ldg_h:
-    return MakeHalfType(Intrinsic::nvvm_ldg_global_f, BuiltinID, E, *this);
   case NVPTX::BI__nvvm_ldg_h2:
-    return MakeHalfType(Intrinsic::nvvm_ldg_global_f, BuiltinID, E, *this);
+    return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this);
   case NVPTX::BI__nvvm_ldu_h:
+  case NVPTX::BI__nvvm_ldu_h2:
     return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
-  case NVPTX::BI__nvvm_ldu_h2: {
-    return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
-  }
   case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
     return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
                        Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c b/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
index b594fc876d4b9e..035c4c6066be24 100644
--- a/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
+++ b/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
@@ -52,8 +52,8 @@ typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
 // CHECK: call <2 x half> @llvm.nvvm.fmax.ftz.xorsign.abs.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}})
 // CHECK: call <2 x half> @llvm.nvvm.fmax.nan.xorsign.abs.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}})
 // CHECK: call <2 x half> @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}})
-// CHECK: call half @llvm.nvvm.ldg.global.f.f16.p0(ptr {{.*}}, i32 2)
-// CHECK: call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr {{.*}}, i32 4)
+// CHECK: load half, ptr addrspace(1) {{.*}}, align 2, !invariant.load
+// CHECK: load <2 x half>, ptr addrspace(1) {{.*}}, align 4, !invariant.load
 // CHECK: call half @llvm.nvvm.ldu.global.f.f16.p0(ptr {{.*}}, i32 2)
 // CHECK: call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p0(ptr {{.*}}, i32 4)
 __device__ void nvvm_native_half_types(void *a, void*b, void*c, __fp16* out) {
diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type.c b/clang/test/CodeGen/builtins-nvptx-native-half-type.c
index 4aeae953bc1622..511497702ff7f9 100644
--- a/clang/test/CodeGen/builtins-nvptx-native-half-type.c
+++ b/clang/test/CodeGen/builtins-nvptx-native-half-type.c
@@ -177,9 +177,9 @@ typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
 
 // CHECK-LABEL: nvvm_ldg_native_half_types
 __device__ void nvvm_ldg_native_half_types(const void *p) {
-  // CHECK: call half @llvm.nvvm.ldg.global.f.f16.p0
+  // CHECK: load half, ptr addrspace(1) {{.*}}, align 2, !invariant.load
   __nvvm_ldg_h((const __fp16 *)p);
-  // CHECK: call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0
+  // CHECK: load <2 x half>, ptr addrspace(1) {{.*}}, align 4, !invariant.load
   __nvvm_ldg_h2((const __fp16v2 *)p);
 }
 
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index 0d0e3ecdb90c9e..3406cbdde2bf88 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -598,33 +598,33 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df,
 
 // CHECK-LABEL: nvvm_ldg
 __device__ void nvvm_ldg(const void *p) {
-  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
-  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
-  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
+  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
+  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
+  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
   __nvvm_ldg_c((const char *)p);
   __nvvm_ldg_uc((const unsigned char *)p);
   __nvvm_ldg_sc((const signed char *)p);
 
-  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
-  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
+  // CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
+  // CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
   __nvvm_ldg_s((const short *)p);
   __nvvm_ldg_us((const unsigned short *)p);
 
-  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
-  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
+  // CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
   __nvvm_ldg_i((const int *)p);
   __nvvm_ldg_ui((const unsigned int *)p);
 
-  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
-  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
-  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
-  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
+  // LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   __nvvm_ldg_l((const long *)p);
   __nvvm_ldg_ul((const unsigned long *)p);
 
-  // CHECK: call float @llvm.nvvm.ldg.global.f.f32.p0(ptr {{%[0-9]+}}, i32 4)
+  // CHECK: load float, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
   __nvvm_ldg_f((const float *)p);
-  // CHECK: call double @llvm.nvvm.ldg.global.f.f64.p0(ptr {{%[0-9]+}}, i32 8)
+  // CHECK: load double, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   __nvvm_ldg_d((const double *)p);
 
   // In practice, the pointers we pass to __ldg will be aligned as appropriate
@@ -636,9 +636,9 @@ __device__ void nvvm_ldg(const void *p) {
   // elements, its alignment is set to number of elements times the alignment of
   // its member: n*alignof(t)."
 
-  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
-  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
-  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
+  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
+  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
+  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
   typedef char char2 __attribute__((ext_vector_type(2)));
   typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
   typedef signed char schar2 __attribute__((ext_vector_type(2)));
@@ -646,9 +646,9 @@ __device__ void nvvm_ldg(const void *p) {
   __nvvm_ldg_uc2((const uchar2 *)p);
   __nvvm_ldg_sc2((const schar2 *)p);
 
-  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
-  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
-  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
+  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
   typedef char char4 __attribute__((ext_vector_type(4)));
   typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
   typedef signed char schar4 __attribute__((ext_vector_type(4)));
@@ -656,59 +656,59 @@ __device__ void nvvm_ldg(const void *p) {
   __nvvm_ldg_uc4((const uchar4 *)p);
   __nvvm_ldg_sc4((const schar4 *)p);
 
-  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
-  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
+  // CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
   typedef short short2 __attribute__((ext_vector_type(2)));
   typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_s2((const short2 *)p);
   __nvvm_ldg_us2((const ushort2 *)p);
 
-  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
-  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
+  // CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   typedef short short4 __attribute__((ext_vector_type(4)));
   typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
   __nvvm_ldg_s4((const short4 *)p);
   __nvvm_ldg_us4((const ushort4 *)p);
 
-  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
-  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
+  // CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   typedef int int2 __attribute__((ext_vector_type(2)));
   typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_i2((const int2 *)p);
   __nvvm_ldg_ui2((const uint2 *)p);
 
-  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
-  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
+  // CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
+  // CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef int int4 __attribute__((ext_vector_type(4)));
   typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
   __nvvm_ldg_i4((const int4 *)p);
   __nvvm_ldg_ui4((const uint4 *)p);
 
-  // LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
-  // LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
-  // LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
-  // LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
+  // LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
+  // LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef long long2 __attribute__((ext_vector_type(2)));
   typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_l2((const long2 *)p);
   __nvvm_ldg_ul2((const ulong2 *)p);
 
-  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
-  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
+  // CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
+  // CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef long long longlong2 __attribute__((ext_vector_type(2)));
   typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_ll2((const longlong2 *)p);
   __nvvm_ldg_ull2((const ulonglong2 *)p);
 
-  // CHECK: call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr {{%[0-9]+}}, i32 8)
+  // CHECK: load <2 x float>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   typedef float float2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_f2((const float2 *)p);
 
-  // CHECK: call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0(ptr {{%[0-9]+}}, i32 16)
+  // CHECK: load <4 x float>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef float float4 __attribute__((ext_vector_type(4)));
   __nvvm_ldg_f4((const float4 *)p);
 
-  // CHECK: call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr {{%[0-9]+}}, i32 16)
+  // CHECK: load <2 x double>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef double double2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_d2((const double2 *)p);
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 7b8ffe417fccdb..3cc45adb198e26 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -42,6 +42,9 @@
 //   * llvm.nvvm.ptr.shared.to.gen   --> ibid.
 //   * llvm.nvvm.ptr.constant.to.gen --> ibid.
 //   * llvm.nvvm.ptr.local.to.gen    --> ibid.
+//   * llvm.nvvm.ldg.global.i        --> load addrspace(1) !load.invariant
+//   * llvm.nvvm.ldg.global.f        --> ibid.
+//   * llvm.nvvm.ldg.global.p        --> ibid.
 
 def llvm_global_ptr_ty  : LLVMQualPointerType<1>;  // (global)ptr
 def llvm_shared_ptr_ty  : LLVMQualPointerType<3>;  // (shared)ptr
@@ -1595,21 +1598,6 @@ def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty],
   [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldu.global.p">;
 
-// Generated within nvvm. Use for ldg on sm_35 or later.  Second arg is the
-// pointer's alignment.
-def int_nvvm_ldg_global_i : Intrinsic<[llvm_anyint_ty],
-  [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
-  "llvm.nvvm.ldg.global.i">;
-def int_nvvm_ldg_global_f : Intrinsic<[llvm_anyfloat_ty],
-  [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
-  "llvm.nvvm.ldg.global.f">;
-def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty],
-  [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
-  "llvm.nvvm.ldg.global.p">;
-
 // Used in nvvm internally to help address space opt and ptx code generation
 // This is for params that are passed to kernel functions by pointer by-val.
 def int_nvvm_ptr_gen_to_param: Intrinsic<[llvm_anyptr_ty],
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index bb03c9290e4cf4..73882fbc7a251a 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -37,6 +37,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/CommandLine.h"
@@ -1301,6 +1302,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
             (Name.consume_front("local") || Name.consume_front("shared") ||
              Name.consume_front("global") || Name.consume_front("constant")) &&
             Name.starts_with(".to.gen");
+      else if (Name.consume_front("ldg.global."))
+        // nvvm.ldg.global.{i,p,f}
+        Expand = (Name.starts_with("i.") || Name.starts_with("f.") ||
+                  Name.starts_with("p."));
       else
         Expand = false;
 
@@ -2363,6 +2368,15 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
                Name.consume_front("constant")) &&
               Name.starts_with(".to.gen"))) {
     Rep = Builder.CreateAddrSpaceCast(CI->getArgOperand(0), CI->getType());
+  } else if (Name.consume_front("ldg.global")) {
+    Value *Ptr = CI->getArgOperand(0);
+    Align PtrAlign = cast<ConstantInt>(CI->getArgOperand(1))->getAlignValue();
+    // Use addrspace(1) for NVPTX ADDRESS_SPACE_GLOBAL
+    Value *ASC = Builder.CreateAddrSpaceCast(Ptr, Builder.getPtrTy(1));
+    Instruction *LD = Builder.CreateAlignedLoad(CI->getType(), ASC, PtrAlign);
+    MDNode *MD = MDNode::get(Builder.getContext(), {});
+    LD->setMetadata(LLVMContext::MD_invariant_load, MD);
+    return LD;
   } else {
     Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
     if (IID != Intrinsic::not_intrinsic &&
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 7f942de74bdcc9..4d2e7fb373de5d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -126,8 +126,6 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
     if (tryLoadVector(N))
       return;
     break;
-  case NVPTXISD::LDGV2:
-  case NVPTXISD::LDGV4:
   case NVPTXISD::LDUV2:
   case NVPTXISD::LDUV4:
     if (tryLDGLDU(N))
@@ -550,9 +548,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
   switch (IID) {
   default:
     return false;
-  case Intrinsic::nvvm_ldg_global_f:
-  case Intrinsic::nvvm_ldg_global_i:
-  case Intrinsic::nvvm_ldg_glo...
[truncated]

llvmbot (Member) commented Oct 18, 2024

@llvm/pr-subscribers-clang-codegen

Author: Alex MacLean (AlexMaclean)

Changes

Remove these intrinsics which can be better represented by load instructions with !invariant.load metadata:

  • llvm.nvvm.ldg.global.i
  • llvm.nvvm.ldg.global.f
  • llvm.nvvm.ldg.global.p

llvmbot (Member) commented Oct 18, 2024

@llvm/pr-subscribers-llvm-ir

Author: Alex MacLean (AlexMaclean)

Changes

Remove these intrinsics which can be better represented by load instructions with !invariant.load metadata:

  • llvm.nvvm.ldg.global.i
  • llvm.nvvm.ldg.global.f
  • llvm.nvvm.ldg.global.p

Patch is 40.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112834.diff

10 Files Affected:

  • (modified) clang/lib/CodeGen/CGBuiltin.cpp (+29-16)
  • (modified) clang/test/CodeGen/builtins-nvptx-native-half-type-native.c (+2-2)
  • (modified) clang/test/CodeGen/builtins-nvptx-native-half-type.c (+2-2)
  • (modified) clang/test/CodeGen/builtins-nvptx.c (+36-36)
  • (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+3-15)
  • (modified) llvm/lib/IR/AutoUpgrade.cpp (+14)
  • (modified) llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (+69-120)
  • (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+2-53)
  • (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.h (-2)
  • (modified) llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll (+31)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index f6d7db2c204c12..3b42977b578e15 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -20473,7 +20473,7 @@ static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) {
 #undef MMA_VARIANTS_B1_XOR
 }
 
-static Value *MakeLdgLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
+static Value *MakeLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
                          const CallExpr *E) {
   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
   QualType ArgType = E->getArg(0)->getType();
@@ -20484,6 +20484,21 @@ static Value *MakeLdgLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
       {Ptr, ConstantInt::get(CGF.Builder.getInt32Ty(), Align.getQuantity())});
 }
 
+static Value *MakeLdg(CodeGenFunction &CGF, const CallExpr *E) {
+  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
+  QualType ArgType = E->getArg(0)->getType();
+  clang::CharUnits AlignV = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
+  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());
+
+  // Use addrspace(1) for NVPTX ADDRESS_SPACE_GLOBAL
+  auto *ASC = CGF.Builder.CreateAddrSpaceCast(Ptr, CGF.Builder.getPtrTy(1));
+  auto *LD = CGF.Builder.CreateAlignedLoad(ElemTy, ASC, AlignV.getAsAlign());
+  MDNode *MD = MDNode::get(CGF.Builder.getContext(), {});
+  LD->setMetadata(LLVMContext::MD_invariant_load, MD);
+
+  return LD;
+}
+
 static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF,
                                const CallExpr *E) {
   Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
@@ -20517,9 +20532,11 @@ static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
     return nullptr;
   }
 
-  if (IntrinsicID == Intrinsic::nvvm_ldg_global_f ||
-      IntrinsicID == Intrinsic::nvvm_ldu_global_f)
-    return MakeLdgLdu(IntrinsicID, CGF, E);
+  if (BuiltinID == NVPTX::BI__nvvm_ldg_h || BuiltinID == NVPTX::BI__nvvm_ldg_h2)
+    return MakeLdg(CGF, E);
+
+  if (IntrinsicID == Intrinsic::nvvm_ldu_global_f)
+    return MakeLdu(IntrinsicID, CGF, E);
 
   SmallVector<Value *, 16> Args;
   auto *F = CGF.CGM.getIntrinsic(IntrinsicID);
@@ -20656,16 +20673,15 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
   case NVPTX::BI__nvvm_ldg_ul2:
   case NVPTX::BI__nvvm_ldg_ull:
   case NVPTX::BI__nvvm_ldg_ull2:
-    // PTX Interoperability section 2.2: "For a vector with an even number of
-    // elements, its alignment is set to number of elements times the alignment
-    // of its member: n*alignof(t)."
-    return MakeLdgLdu(Intrinsic::nvvm_ldg_global_i, *this, E);
   case NVPTX::BI__nvvm_ldg_f:
   case NVPTX::BI__nvvm_ldg_f2:
   case NVPTX::BI__nvvm_ldg_f4:
   case NVPTX::BI__nvvm_ldg_d:
   case NVPTX::BI__nvvm_ldg_d2:
-    return MakeLdgLdu(Intrinsic::nvvm_ldg_global_f, *this, E);
+    // PTX Interoperability section 2.2: "For a vector with an even number of
+    // elements, its alignment is set to number of elements times the alignment
+    // of its member: n*alignof(t)."
+    return MakeLdg(*this, E);
 
   case NVPTX::BI__nvvm_ldu_c:
   case NVPTX::BI__nvvm_ldu_sc:
@@ -20696,13 +20712,13 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
   case NVPTX::BI__nvvm_ldu_ul2:
   case NVPTX::BI__nvvm_ldu_ull:
   case NVPTX::BI__nvvm_ldu_ull2:
-    return MakeLdgLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
+    return MakeLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
   case NVPTX::BI__nvvm_ldu_f:
   case NVPTX::BI__nvvm_ldu_f2:
   case NVPTX::BI__nvvm_ldu_f4:
   case NVPTX::BI__nvvm_ldu_d:
   case NVPTX::BI__nvvm_ldu_d2:
-    return MakeLdgLdu(Intrinsic::nvvm_ldu_global_f, *this, E);
+    return MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E);
 
   case NVPTX::BI__nvvm_atom_cta_add_gen_i:
   case NVPTX::BI__nvvm_atom_cta_add_gen_l:
@@ -21176,14 +21192,11 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
     return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16x2, BuiltinID, E,
                         *this);
   case NVPTX::BI__nvvm_ldg_h:
-    return MakeHalfType(Intrinsic::nvvm_ldg_global_f, BuiltinID, E, *this);
   case NVPTX::BI__nvvm_ldg_h2:
-    return MakeHalfType(Intrinsic::nvvm_ldg_global_f, BuiltinID, E, *this);
+    return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this);
   case NVPTX::BI__nvvm_ldu_h:
+  case NVPTX::BI__nvvm_ldu_h2:
     return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
-  case NVPTX::BI__nvvm_ldu_h2: {
-    return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
-  }
   case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
     return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
                        Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c b/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
index b594fc876d4b9e..035c4c6066be24 100644
--- a/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
+++ b/clang/test/CodeGen/builtins-nvptx-native-half-type-native.c
@@ -52,8 +52,8 @@ typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
 // CHECK: call <2 x half> @llvm.nvvm.fmax.ftz.xorsign.abs.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}})
 // CHECK: call <2 x half> @llvm.nvvm.fmax.nan.xorsign.abs.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}})
 // CHECK: call <2 x half> @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16x2(<2 x half> {{.*}}, <2 x half> {{.*}})
-// CHECK: call half @llvm.nvvm.ldg.global.f.f16.p0(ptr {{.*}}, i32 2)
-// CHECK: call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr {{.*}}, i32 4)
+// CHECK: load half, ptr addrspace(1) {{.*}}, align 2, !invariant.load
+// CHECK: load <2 x half>, ptr addrspace(1) {{.*}}, align 4, !invariant.load
 // CHECK: call half @llvm.nvvm.ldu.global.f.f16.p0(ptr {{.*}}, i32 2)
 // CHECK: call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p0(ptr {{.*}}, i32 4)
 __device__ void nvvm_native_half_types(void *a, void*b, void*c, __fp16* out) {
diff --git a/clang/test/CodeGen/builtins-nvptx-native-half-type.c b/clang/test/CodeGen/builtins-nvptx-native-half-type.c
index 4aeae953bc1622..511497702ff7f9 100644
--- a/clang/test/CodeGen/builtins-nvptx-native-half-type.c
+++ b/clang/test/CodeGen/builtins-nvptx-native-half-type.c
@@ -177,9 +177,9 @@ typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
 
 // CHECK-LABEL: nvvm_ldg_native_half_types
 __device__ void nvvm_ldg_native_half_types(const void *p) {
-  // CHECK: call half @llvm.nvvm.ldg.global.f.f16.p0
+  // CHECK: load half, ptr addrspace(1) {{.*}}, align 2, !invariant.load
   __nvvm_ldg_h((const __fp16 *)p);
-  // CHECK: call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0
+  // CHECK: load <2 x half>, ptr addrspace(1) {{.*}}, align 4, !invariant.load
   __nvvm_ldg_h2((const __fp16v2 *)p);
 }
 
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index 0d0e3ecdb90c9e..3406cbdde2bf88 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -598,33 +598,33 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df,
 
 // CHECK-LABEL: nvvm_ldg
 __device__ void nvvm_ldg(const void *p) {
-  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
-  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
-  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
+  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
+  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
+  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
   __nvvm_ldg_c((const char *)p);
   __nvvm_ldg_uc((const unsigned char *)p);
   __nvvm_ldg_sc((const signed char *)p);
 
-  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
-  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
+  // CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
+  // CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
   __nvvm_ldg_s((const short *)p);
   __nvvm_ldg_us((const unsigned short *)p);
 
-  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
-  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
+  // CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
   __nvvm_ldg_i((const int *)p);
   __nvvm_ldg_ui((const unsigned int *)p);
 
-  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
-  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
-  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
-  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
+  // LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   __nvvm_ldg_l((const long *)p);
   __nvvm_ldg_ul((const unsigned long *)p);
 
-  // CHECK: call float @llvm.nvvm.ldg.global.f.f32.p0(ptr {{%[0-9]+}}, i32 4)
+  // CHECK: load float, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
   __nvvm_ldg_f((const float *)p);
-  // CHECK: call double @llvm.nvvm.ldg.global.f.f64.p0(ptr {{%[0-9]+}}, i32 8)
+  // CHECK: load double, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   __nvvm_ldg_d((const double *)p);
 
   // In practice, the pointers we pass to __ldg will be aligned as appropriate
@@ -636,9 +636,9 @@ __device__ void nvvm_ldg(const void *p) {
   // elements, its alignment is set to number of elements times the alignment of
   // its member: n*alignof(t)."
 
-  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
-  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
-  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
+  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
+  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
+  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
   typedef char char2 __attribute__((ext_vector_type(2)));
   typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
   typedef signed char schar2 __attribute__((ext_vector_type(2)));
@@ -646,9 +646,9 @@ __device__ void nvvm_ldg(const void *p) {
   __nvvm_ldg_uc2((const uchar2 *)p);
   __nvvm_ldg_sc2((const schar2 *)p);
 
-  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
-  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
-  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
+  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
   typedef char char4 __attribute__((ext_vector_type(4)));
   typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
   typedef signed char schar4 __attribute__((ext_vector_type(4)));
@@ -656,59 +656,59 @@ __device__ void nvvm_ldg(const void *p) {
   __nvvm_ldg_uc4((const uchar4 *)p);
   __nvvm_ldg_sc4((const schar4 *)p);
 
-  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
-  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
+  // CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
+  // CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
   typedef short short2 __attribute__((ext_vector_type(2)));
   typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_s2((const short2 *)p);
   __nvvm_ldg_us2((const ushort2 *)p);
 
-  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
-  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
+  // CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   typedef short short4 __attribute__((ext_vector_type(4)));
   typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
   __nvvm_ldg_s4((const short4 *)p);
   __nvvm_ldg_us4((const ushort4 *)p);
 
-  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
-  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
+  // CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   typedef int int2 __attribute__((ext_vector_type(2)));
   typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_i2((const int2 *)p);
   __nvvm_ldg_ui2((const uint2 *)p);
 
-  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
-  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
+  // CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
+  // CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef int int4 __attribute__((ext_vector_type(4)));
   typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
   __nvvm_ldg_i4((const int4 *)p);
   __nvvm_ldg_ui4((const uint4 *)p);
 
-  // LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
-  // LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
-  // LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
-  // LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
+  // LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
+  // LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
+  // LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef long long2 __attribute__((ext_vector_type(2)));
   typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_l2((const long2 *)p);
   __nvvm_ldg_ul2((const ulong2 *)p);
 
-  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
-  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
+  // CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
+  // CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef long long longlong2 __attribute__((ext_vector_type(2)));
   typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_ll2((const longlong2 *)p);
   __nvvm_ldg_ull2((const ulonglong2 *)p);
 
-  // CHECK: call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr {{%[0-9]+}}, i32 8)
+  // CHECK: load <2 x float>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
   typedef float float2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_f2((const float2 *)p);
 
-  // CHECK: call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0(ptr {{%[0-9]+}}, i32 16)
+  // CHECK: load <4 x float>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef float float4 __attribute__((ext_vector_type(4)));
   __nvvm_ldg_f4((const float4 *)p);
 
-  // CHECK: call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr {{%[0-9]+}}, i32 16)
+  // CHECK: load <2 x double>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
   typedef double double2 __attribute__((ext_vector_type(2)));
   __nvvm_ldg_d2((const double2 *)p);
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 7b8ffe417fccdb..3cc45adb198e26 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -42,6 +42,9 @@
 //   * llvm.nvvm.ptr.shared.to.gen   --> ibid.
 //   * llvm.nvvm.ptr.constant.to.gen --> ibid.
 //   * llvm.nvvm.ptr.local.to.gen    --> ibid.
+//   * llvm.nvvm.ldg.global.i        --> load addrspace(1) !load.invariant
+//   * llvm.nvvm.ldg.global.f        --> ibid.
+//   * llvm.nvvm.ldg.global.p        --> ibid.
 
 def llvm_global_ptr_ty  : LLVMQualPointerType<1>;  // (global)ptr
 def llvm_shared_ptr_ty  : LLVMQualPointerType<3>;  // (shared)ptr
@@ -1595,21 +1598,6 @@ def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty],
   [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
   "llvm.nvvm.ldu.global.p">;
 
-// Generated within nvvm. Use for ldg on sm_35 or later.  Second arg is the
-// pointer's alignment.
-def int_nvvm_ldg_global_i : Intrinsic<[llvm_anyint_ty],
-  [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
-  "llvm.nvvm.ldg.global.i">;
-def int_nvvm_ldg_global_f : Intrinsic<[llvm_anyfloat_ty],
-  [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
-  "llvm.nvvm.ldg.global.f">;
-def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty],
-  [llvm_anyptr_ty, llvm_i32_ty],
-  [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture<ArgIndex<0>>],
-  "llvm.nvvm.ldg.global.p">;
-
 // Used in nvvm internally to help address space opt and ptx code generation
 // This is for params that are passed to kernel functions by pointer by-val.
 def int_nvvm_ptr_gen_to_param: Intrinsic<[llvm_anyptr_ty],
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index bb03c9290e4cf4..73882fbc7a251a 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -37,6 +37,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/CommandLine.h"
@@ -1301,6 +1302,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
             (Name.consume_front("local") || Name.consume_front("shared") ||
              Name.consume_front("global") || Name.consume_front("constant")) &&
             Name.starts_with(".to.gen");
+      else if (Name.consume_front("ldg.global."))
+        // nvvm.ldg.global.{i,p,f}
+        Expand = (Name.starts_with("i.") || Name.starts_with("f.") ||
+                  Name.starts_with("p."));
       else
         Expand = false;
 
@@ -2363,6 +2368,15 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
                Name.consume_front("constant")) &&
               Name.starts_with(".to.gen"))) {
     Rep = Builder.CreateAddrSpaceCast(CI->getArgOperand(0), CI->getType());
+  } else if (Name.consume_front("ldg.global")) {
+    Value *Ptr = CI->getArgOperand(0);
+    Align PtrAlign = cast<ConstantInt>(CI->getArgOperand(1))->getAlignValue();
+    // Use addrspace(1) for NVPTX ADDRESS_SPACE_GLOBAL
+    Value *ASC = Builder.CreateAddrSpaceCast(Ptr, Builder.getPtrTy(1));
+    Instruction *LD = Builder.CreateAlignedLoad(CI->getType(), ASC, PtrAlign);
+    MDNode *MD = MDNode::get(Builder.getContext(), {});
+    LD->setMetadata(LLVMContext::MD_invariant_load, MD);
+    return LD;
   } else {
     Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
     if (IID != Intrinsic::not_intrinsic &&
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 7f942de74bdcc9..4d2e7fb373de5d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -126,8 +126,6 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
     if (tryLoadVector(N))
       return;
     break;
-  case NVPTXISD::LDGV2:
-  case NVPTXISD::LDGV4:
   case NVPTXISD::LDUV2:
   case NVPTXISD::LDUV4:
     if (tryLDGLDU(N))
@@ -550,9 +548,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
   switch (IID) {
   default:
     return false;
-  case Intrinsic::nvvm_ldg_global_f:
-  case Intrinsic::nvvm_ldg_global_i:
-  case Intrinsic::nvvm_ldg_glo...
[truncated]
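For orientation, here is a minimal before/after IR sketch of the AutoUpgrade path introduced above (the function names and value names are illustrative, not taken from the patch's tests):

declare i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr, i32)

define i32 @before(ptr %p) {
  ; Old form: the alignment is carried as the second intrinsic argument.
  %v = call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr %p, i32 4)
  ret i32 %v
}

; After auto-upgrade, the call is rewritten to an addrspacecast plus an
; aligned load from the global address space carrying !invariant.load:
define i32 @after(ptr %p) {
  %g = addrspacecast ptr %p to ptr addrspace(1)
  %v = load i32, ptr addrspace(1) %g, align 4, !invariant.load !0
  ret i32 %v
}

!0 = !{}

This mirrors the coverage added in llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll.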

@llvmbot
Member

llvmbot commented Oct 18, 2024

@llvm/pr-subscribers-backend-nvptx


github-actions bot commented Oct 18, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

Member

@Artem-B Artem-B left a comment


LGTM. Please add a note about the intrinsic removal/deprecation to the release notes.

@AlexMaclean AlexMaclean force-pushed the dev/amaclean/upstream-ldg-remove branch from 5e16b79 to 07b4ef6 on October 22, 2024 05:46
@AlexMaclean AlexMaclean merged commit fb33af0 into llvm:main Oct 27, 2024
9 checks passed
@llvm-ci
Collaborator

llvm-ci commented Oct 28, 2024

LLVM Buildbot has detected a new failure on builder openmp-s390x-linux running on systemz-1 while building clang,llvm at step 6 "test-openmp".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/88/builds/3935

Here is the relevant piece of the build log for reference:
Step 6 (test-openmp) failure: 1200 seconds without output running [b'ninja', b'-j 4', b'check-openmp'], attempting to kill
...
PASS: ompd-test :: openmp_examples/example_3.c (439 of 449)
PASS: ompd-test :: openmp_examples/example_4.c (440 of 449)
PASS: ompd-test :: openmp_examples/example_5.c (441 of 449)
PASS: ompd-test :: openmp_examples/example_task.c (442 of 449)
UNSUPPORTED: ompd-test :: openmp_examples/ompd_bt.c (443 of 449)
PASS: ompd-test :: openmp_examples/fibonacci.c (444 of 449)
UNSUPPORTED: ompd-test :: openmp_examples/ompd_parallel.c (445 of 449)
PASS: ompd-test :: openmp_examples/parallel.c (446 of 449)
PASS: ompd-test :: openmp_examples/nested.c (447 of 449)
PASS: ompd-test :: openmp_examples/ompd_icvs.c (448 of 449)
command timed out: 1200 seconds without output running [b'ninja', b'-j 4', b'check-openmp'], attempting to kill
process killed by signal 9
program finished with exit code -1
elapsedTime=1327.211968

@llvm-ci
Collaborator

llvm-ci commented Oct 28, 2024

LLVM Buildbot has detected a new failure on builder sanitizer-aarch64-linux-bootstrap-asan running on sanitizer-buildbot8 while building clang,llvm at step 2 "annotate".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/24/builds/2313

Here is the relevant piece of the build log for reference:
Step 2 (annotate) failure: 'python ../sanitizer_buildbot/sanitizers/zorg/buildbot/builders/sanitizers/buildbot_selector.py' (failure)
...
llvm-lit: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using lld-link: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/lld-link
llvm-lit: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using ld64.lld: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld64.lld
llvm-lit: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using wasm-ld: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/wasm-ld
llvm-lit: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using ld.lld: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld
llvm-lit: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using lld-link: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/lld-link
llvm-lit: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using ld64.lld: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld64.lld
llvm-lit: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using wasm-ld: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/wasm-ld
llvm-lit: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/main.py:72: note: The test suite configuration requested an individual test timeout of 0 seconds but a timeout of 900 seconds was requested on the command line. Forcing timeout to be 900 seconds.
-- Testing: 83149 of 83150 tests, 48 workers --
Testing:  0.. 10.. 20.. 30.. 40.. 50.. 60.. 70.. 80.. 90.
FAIL: lld :: ELF/allow-shlib-undefined.s (80867 of 83149)
******************** TEST 'lld :: ELF/allow-shlib-undefined.s' FAILED ********************
Exit Code: 1

Command Output (stderr):
--
RUN: at line 3: rm -rf /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/Output/allow-shlib-undefined.s.tmp && split-file /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/Output/allow-shlib-undefined.s.tmp && cd /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/Output/allow-shlib-undefined.s.tmp
+ rm -rf /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/Output/allow-shlib-undefined.s.tmp
+ split-file /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/Output/allow-shlib-undefined.s.tmp
+ cd /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/Output/allow-shlib-undefined.s.tmp
RUN: at line 4: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 main.s -o main.o
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 main.s -o main.o
RUN: at line 5: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 def.s -o def.o
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 def.s -o def.o
RUN: at line 6: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 def-hidden.s -o def-hidden.o
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 def-hidden.s -o def-hidden.o
RUN: at line 7: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 ref.s -o ref.o
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 ref.s -o ref.o
RUN: at line 8: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o && /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld -shared a.o -o a.so
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld -shared a.o -o a.so
RUN: at line 9: cp a.so b.so
+ cp a.so b.so
RUN: at line 10: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 empty.s -o empty.o && /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld -shared empty.o -o empty.so
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 empty.s -o empty.o
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld -shared empty.o -o empty.so
RUN: at line 12: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld --allow-shlib-undefined main.o a.so -o /dev/null
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld --allow-shlib-undefined main.o a.so -o /dev/null
RUN: at line 13: not /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld --no-allow-shlib-undefined main.o a.so -o /dev/null 2>&1 | /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/FileCheck /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s
+ not /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld --no-allow-shlib-undefined main.o a.so -o /dev/null
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/FileCheck /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s
RUN: at line 15: not /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld main.o a.so -o /dev/null 2>&1 | /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/FileCheck /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/FileCheck /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s
+ not /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld main.o a.so -o /dev/null
RUN: at line 16: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld main.o a.so --noinhibit-exec -o /dev/null 2>&1 | /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/FileCheck /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s --check-prefix=WARN
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld main.o a.so --noinhibit-exec -o /dev/null
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/FileCheck /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s --check-prefix=WARN
RUN: at line 17: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld main.o a.so --warn-unresolved-symbols -o /dev/null 2>&1 | /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/FileCheck /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s --check-prefix=WARN
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld main.o a.so --warn-unresolved-symbols -o /dev/null
Step 10 (stage2/asan check) failure: stage2/asan check (failure)
...
llvm-lit: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using lld-link: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/lld-link
llvm-lit: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using ld64.lld: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld64.lld
llvm-lit: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using wasm-ld: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/wasm-ld
llvm-lit: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using ld.lld: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld
llvm-lit: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using lld-link: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/lld-link
llvm-lit: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using ld64.lld: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld64.lld
llvm-lit: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/llvm/config.py:506: note: using wasm-ld: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/wasm-ld
llvm-lit: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/llvm/utils/lit/lit/main.py:72: note: The test suite configuration requested an individual test timeout of 0 seconds but a timeout of 900 seconds was requested on the command line. Forcing timeout to be 900 seconds.
-- Testing: 83149 of 83150 tests, 48 workers --
Testing:  0.. 10.. 20.. 30.. 40.. 50.. 60.. 70.. 80.. 90.
FAIL: lld :: ELF/allow-shlib-undefined.s (80867 of 83149)
******************** TEST 'lld :: ELF/allow-shlib-undefined.s' FAILED ********************
Exit Code: 1

Command Output (stderr):
--
RUN: at line 3: rm -rf /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/Output/allow-shlib-undefined.s.tmp && split-file /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/Output/allow-shlib-undefined.s.tmp && cd /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/Output/allow-shlib-undefined.s.tmp
+ rm -rf /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/Output/allow-shlib-undefined.s.tmp
+ split-file /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/Output/allow-shlib-undefined.s.tmp
+ cd /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/tools/lld/test/ELF/Output/allow-shlib-undefined.s.tmp
RUN: at line 4: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 main.s -o main.o
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 main.s -o main.o
RUN: at line 5: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 def.s -o def.o
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 def.s -o def.o
RUN: at line 6: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 def-hidden.s -o def-hidden.o
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 def-hidden.s -o def-hidden.o
RUN: at line 7: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 ref.s -o ref.o
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 ref.s -o ref.o
RUN: at line 8: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o && /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld -shared a.o -o a.so
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld -shared a.o -o a.so
RUN: at line 9: cp a.so b.so
+ cp a.so b.so
RUN: at line 10: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 empty.s -o empty.o && /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld -shared empty.o -o empty.so
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/llvm-mc -filetype=obj -triple=x86_64 empty.s -o empty.o
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld -shared empty.o -o empty.so
RUN: at line 12: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld --allow-shlib-undefined main.o a.so -o /dev/null
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld --allow-shlib-undefined main.o a.so -o /dev/null
RUN: at line 13: not /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld --no-allow-shlib-undefined main.o a.so -o /dev/null 2>&1 | /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/FileCheck /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s
+ not /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld --no-allow-shlib-undefined main.o a.so -o /dev/null
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/FileCheck /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s
RUN: at line 15: not /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld main.o a.so -o /dev/null 2>&1 | /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/FileCheck /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/FileCheck /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s
+ not /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld main.o a.so -o /dev/null
RUN: at line 16: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld main.o a.so --noinhibit-exec -o /dev/null 2>&1 | /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/FileCheck /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s --check-prefix=WARN
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld main.o a.so --noinhibit-exec -o /dev/null
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/FileCheck /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s --check-prefix=WARN
RUN: at line 17: /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld main.o a.so --warn-unresolved-symbols -o /dev/null 2>&1 | /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/FileCheck /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm-project/lld/test/ELF/allow-shlib-undefined.s --check-prefix=WARN
+ /home/b/sanitizer-aarch64-linux-bootstrap-asan/build/llvm_build_asan/bin/ld.lld main.o a.so --warn-unresolved-symbols -o /dev/null

NoumanAmir657 pushed a commit to NoumanAmir657/llvm-project that referenced this pull request Nov 4, 2024
Remove these intrinsics which can be better represented by load
instructions with `!invariant.load` metadata:

- llvm.nvvm.ldg.global.i
- llvm.nvvm.ldg.global.f
- llvm.nvvm.ldg.global.p
vchuravy added a commit to JuliaLang/julia that referenced this pull request Feb 13, 2025
Other backends (in this case NVPTX) require that `invariant.load`
metadata is maintained to generate non-coherent loads.

Currently, we unconditionally strip that metadata from all loads,
since our other uses of it may have become invalid.

x-ref: llvm/llvm-project#112834 JuliaGPU/CUDA.jl#2531
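
For context, a minimal IR sketch (a hypothetical kernel, not taken from the Julia patch) of why the metadata matters to the NVPTX backend: only the load that keeps !invariant.load is eligible to be selected as a non-coherent ld.global.nc.

define float @kernel(ptr addrspace(1) %p) {
  ; Metadata kept: NVPTX may lower this to ld.global.nc.
  %a = load float, ptr addrspace(1) %p, align 4, !invariant.load !0
  ; Metadata stripped: only a regular ld.global can be emitted.
  %b = load float, ptr addrspace(1) %p, align 4
  %s = fadd float %a, %b
  ret float %s
}

!0 = !{}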
vchuravy added a commit to JuliaLang/julia that referenced this pull request Feb 18, 2025
Other backends (in this case NVPTX) require that `invariant.load`
metadata is maintained to generate non-coherent loads.

Currently, we unconditionally strip that metadata from all loads,
since our other uses of it may have become invalid.

x-ref: llvm/llvm-project#112834
JuliaGPU/CUDA.jl#2531

---------

Co-authored-by: Gabriel Baraldi <[email protected]>
KristofferC pushed a commit to JuliaLang/julia that referenced this pull request Feb 21, 2025
Other backends (in this case NVPTX) require that `invariant.load`
metadata is maintained to generate non-coherent loads.

Currently, we unconditionally strip that metadata from all loads,
since our other uses of it may have become invalid.

x-ref: llvm/llvm-project#112834
JuliaGPU/CUDA.jl#2531

---------

Co-authored-by: Gabriel Baraldi <[email protected]>
(cherry picked from commit 29da86b)
KristofferC pushed a commit to JuliaLang/julia that referenced this pull request Mar 11, 2025
Other backends (in this case NVPTX) require that `invariant.load`
metadata is maintained to generate non-coherent loads.

Currently, we unconditionally strip that metadata from all loads,
since our other uses of it may have become invalid.

x-ref: llvm/llvm-project#112834
JuliaGPU/CUDA.jl#2531

---------

Co-authored-by: Gabriel Baraldi <[email protected]>
(cherry picked from commit 29da86b)
KristofferC pushed a commit to JuliaLang/julia that referenced this pull request Mar 11, 2025
Other backends (in this case NVPTX) require that `invariant.load`
metadata is maintained to generate non-coherent loads.

Currently, we unconditionally strip that metadata from all loads,
since our other uses of it may have become invalid.

x-ref: llvm/llvm-project#112834
JuliaGPU/CUDA.jl#2531

---------

Co-authored-by: Gabriel Baraldi <[email protected]>
(cherry picked from commit 29da86b)
KristofferC pushed a commit to JuliaLang/julia that referenced this pull request Mar 25, 2025
Other backends (in this case NVPTX) require that `invariant.load`
metadata is maintained to generate non-coherent loads.

Currently, we unconditionally strip that metadata from all loads,
since our other uses of it may have become invalid.

x-ref: llvm/llvm-project#112834
JuliaGPU/CUDA.jl#2531

---------

Co-authored-by: Gabriel Baraldi <[email protected]>
(cherry picked from commit 29da86b)
modularbot pushed a commit to modular/modular that referenced this pull request Apr 29, 2025
…C (#50241)

llvm/llvm-project#112834 removes some of the ldg
functions and simplifies them. This does the same and unblocks the llvm
bump.

Closes KERN-733

LINALG_ORIG_REV_ID: 62e96df7f8430b7cd495a997d1db4a14255192b9
lriggs1311 pushed a commit to modular/modular that referenced this pull request Apr 29, 2025
…C (#50241)

llvm/llvm-project#112834 removes some of the ldg
functions and simplifies them. This does the same and unblocks the llvm
bump.

Closes KERN-733

LINALG_ORIG_REV_ID: 62e96df7f8430b7cd495a997d1db4a14255192b9
modularbot pushed a commit to modular/modular that referenced this pull request May 1, 2025
…C (#50241)

llvm/llvm-project#112834 removes some of the ldg
functions and simplifies them. This does the same and unblocks the llvm
bump.

Closes KERN-733

GPU_TEST_ORIG_REV_ID: 62e96df7f8430b7cd495a997d1db4a14255192b9
lriggs1311 pushed a commit to modular/modular that referenced this pull request May 1, 2025
…C (#50241)

llvm/llvm-project#112834 removes some of the ldg
functions and simplifies them. This does the same and unblocks the llvm
bump.

Closes KERN-733

GPU_TEST_ORIG_REV_ID: 62e96df7f8430b7cd495a997d1db4a14255192b9
KristofferC pushed a commit to JuliaLang/julia that referenced this pull request Jun 5, 2025
Other backends (in this case NVPTX) require that `invariant.load`
metadata is maintained to generate non-coherent loads.

Currently, we unconditionally strip that metadata from all loads,
since our other uses of it may have become invalid.

x-ref: llvm/llvm-project#112834
JuliaGPU/CUDA.jl#2531

---------

Co-authored-by: Gabriel Baraldi <[email protected]>
(cherry picked from commit 29da86b)
KristofferC pushed a commit to JuliaLang/julia that referenced this pull request Jul 3, 2025
Other backends (in this case NVPTX) require that `invariant.load`
metadata is maintained to generate non-coherent loads.

Currently, we unconditionally strip that metadata from all loads,
since our other uses of it may have become invalid.

x-ref: llvm/llvm-project#112834
JuliaGPU/CUDA.jl#2531

---------

Co-authored-by: Gabriel Baraldi <[email protected]>
(cherry picked from commit 29da86b)