From 4c3c3a27fe537828ab0d8eb2b227016f3214c6ff Mon Sep 17 00:00:00 2001
From: AdUhTkJm <2292398666@qq.com>
Date: Tue, 11 Mar 2025 08:04:45 +0000
Subject: [PATCH] [CIR][CUDA] Register __global__ functions

---
 .../clang/CIR/Dialect/IR/CIRDataLayout.h      |  11 +
 clang/lib/CIR/Dialect/IR/CIRDataLayout.cpp    |  12 +
 .../Dialect/Transforms/LoweringPrepare.cpp    | 213 +++++++++++++++---
 clang/test/CIR/CodeGen/CUDA/registration.cu   |  82 ++++---
 4 files changed, 254 insertions(+), 64 deletions(-)
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h b/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h
index 89a724594081..9db2dc568f9f 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h
+++ b/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h
@@ -14,6 +14,7 @@
 
 #include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/IR/BuiltinOps.h"
+#include "clang/CIR/Dialect/IR/CIRAttrs.h"
 #include "clang/CIR/Dialect/IR/CIRTypes.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Support/Alignment.h"
@@ -35,6 +36,8 @@ class CIRDataLayout {
   // The StructType -> StructLayout map.
   mutable void *LayoutMap = nullptr;
 
+  TypeSizeInfoAttr typeSizeInfo;
+
 public:
   mlir::DataLayout layout;
 
@@ -106,6 +109,14 @@ class CIRDataLayout {
         cir::IntType::get(Ty.getContext(), getPointerTypeSizeInBits(Ty), false);
     return IntTy;
   }
+
+  mlir::Type getIntType(mlir::MLIRContext *ctx) const {
+    return typeSizeInfo.getIntType(ctx);
+  }
+
+  mlir::Type getCharType(mlir::MLIRContext *ctx) const {
+    return typeSizeInfo.getCharType(ctx);
+  }
 };
 
 /// Used to lazily calculate structure layout information for a target machine,
diff --git a/clang/lib/CIR/Dialect/IR/CIRDataLayout.cpp b/clang/lib/CIR/Dialect/IR/CIRDataLayout.cpp
index 9add51a35c53..0ccebdccf6a7 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDataLayout.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDataLayout.cpp
@@ -1,4 +1,5 @@
 #include "clang/CIR/Dialect/IR/CIRDataLayout.h"
+#include "clang/CIR/Dialect/IR/CIRDialect.h"
 #include "clang/CIR/Dialect/IR/CIRTypes.h"
 #include "clang/CIR/MissingFeatures.h"
 #include "llvm/IR/DataLayout.h"
@@ -112,6 +113,17 @@ class StructLayoutMap {
 
 CIRDataLayout::CIRDataLayout(mlir::ModuleOp modOp) : layout{modOp} {
   reset(modOp.getDataLayoutSpec());
+  if (auto attr = modOp->getAttr(cir::CIRDialect::getTypeSizeInfoAttrName()))
+    typeSizeInfo = mlir::cast<TypeSizeInfoAttr>(attr);
+  else {
+    // Generate default size information.
+    auto voidPtrTy = PointerType::get(VoidType::get(modOp->getContext()));
+    llvm::TypeSize ptrSize = getTypeSizeInBits(voidPtrTy);
+    typeSizeInfo =
+        TypeSizeInfoAttr::get(modOp->getContext(),
+                              /*char_size=*/8, /*int_size=*/32,
+                              /*size_t_size=*/ptrSize.getFixedValue());
+  }
 }
 
 void CIRDataLayout::reset(mlir::DataLayoutSpecInterface spec) {
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 00852064b5c0..2cac74fb9308 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -123,13 +123,16 @@ struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
   /// CUDA related
   /// ------------
 
-  // Maps CUDA device stub name to kernel name.
-  llvm::DenseMap<llvm::StringRef, std::string> cudaKernelMap;
+  // Maps CUDA kernel name to device stub function.
+  llvm::StringMap<FuncOp> cudaKernelMap;
 
   void buildCUDAModuleCtor();
   void buildCUDAModuleDtor();
   std::optional<FuncOp> buildCUDARegisterGlobals();
 
+  void buildCUDARegisterGlobalFunctions(cir::CIRBaseBuilderTy &builder,
+                                        FuncOp regGlobalFunc);
+
   ///
   /// AST related
   /// -----------
@@ -185,6 +188,18 @@ struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
   /// List of annotations in the module
   llvm::SmallVector<mlir::Attribute, 4> globalAnnotations;
 };
+
+std::string getCUDAPrefix(clang::ASTContext *astCtx) {
+  if (astCtx->getLangOpts().HIP)
+    return "hip";
+  return "cuda";
+}
+
+std::string addUnderscoredPrefix(llvm::StringRef cudaPrefix,
+                                 llvm::StringRef cudaFunctionName) {
+  return ("__" + cudaPrefix + cudaFunctionName).str();
+}
+
 } // namespace
 
 GlobalOp LoweringPreparePass::buildRuntimeVariable(
@@ -983,6 +998,11 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   if (astCtx->getLangOpts().GPURelocatableDeviceCode)
     llvm_unreachable("NYI");
 
+  // For CUDA without -fgpu-rdc, it's safe to stop generating ctor
+  // if there's nothing to register.
+  if (cudaKernelMap.empty())
+    return;
+
   // There's no device-side binary, so no need to proceed for CUDA.
   // HIP has to create an external symbol in this case, which is NYI.
   auto cudaBinaryHandleAttr =
@@ -995,18 +1015,14 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   std::string cudaGPUBinaryName =
       cast<CUDABinaryHandleAttr>(cudaBinaryHandleAttr).getName();
 
-  llvm::StringRef prefix = "cuda";
-
   constexpr unsigned cudaFatMagic = 0x466243b1;
   constexpr unsigned hipFatMagic = 0x48495046; // "HIPF"
 
+  auto cudaPrefix = getCUDAPrefix(astCtx);
+
   const unsigned fatMagic =
       astCtx->getLangOpts().HIP ? hipFatMagic : cudaFatMagic;
 
-  auto addUnderscoredPrefix = [&](llvm::StringRef name) -> std::string {
-    return ("__" + prefix + name).str();
-  };
-
   // MAC OS X needs special care, but we haven't supported that in CIR yet.
   assert(!cir::MissingFeatures::checkMacOSXTriple());
 
@@ -1015,15 +1031,11 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
 
   mlir::Location loc = theModule.getLoc();
 
-  // Extract types from the module.
-  auto typeSizesAttr = cast<TypeSizeInfoAttr>(
-      theModule->getAttr(CIRDialect::getTypeSizeInfoAttrName()));
-
   auto voidTy = VoidType::get(&getContext());
   auto voidPtrTy = PointerType::get(voidTy);
   auto voidPtrPtrTy = PointerType::get(voidPtrTy);
-  auto intTy = typeSizesAttr.getIntType(&getContext());
-  auto charTy = typeSizesAttr.getCharType(&getContext());
+  auto intTy = datalayout->getIntType(&getContext());
+  auto charTy = datalayout->getCharType(&getContext());
 
   // Read the GPU binary and create a constant array for it.
   llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> cudaGPUBinaryOrErr =
@@ -1046,7 +1058,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
 
   // OG gives an empty name to this global constant,
   // which is not allowed in CIR.
-  std::string fatbinStrName = addUnderscoredPrefix("_fatbin_str");
+  std::string fatbinStrName = addUnderscoredPrefix(cudaPrefix, "_fatbin_str");
   GlobalOp fatbinStr = builder.create<GlobalOp>(
       loc, fatbinStrName, fatbinType, /*isConstant=*/true,
       /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage);
@@ -1064,59 +1076,186 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
       &getContext(), {intTy, intTy, voidPtrTy, voidPtrTy}, /*packed=*/false,
       /*padded=*/false, StructType::RecordKind::Struct);
 
-  std::string fatbinWrapperName = addUnderscoredPrefix("_fatbin_wrapper");
+  std::string fatbinWrapperName =
+      addUnderscoredPrefix(cudaPrefix, "_fatbin_wrapper");
   GlobalOp fatbinWrapper = builder.create<GlobalOp>(
-      loc, fatbinWrapperName, fatbinWrapperType, /*isConstant=*/false,
+      loc, fatbinWrapperName, fatbinWrapperType, /*isConstant=*/true,
       /*linkage=*/cir::GlobalLinkageKind::InternalLinkage);
   fatbinWrapper.setPrivate();
   fatbinWrapper.setSection(fatbinSectionName);
 
   auto magicInit = IntAttr::get(intTy, fatMagic);
   auto versionInit = IntAttr::get(intTy, 1);
-  // `fatbinInit` is only a placeholder. The value will be initialized at the
-  // beginning of module ctor.
-  auto fatbinInit = builder.getConstNullPtrAttr(voidPtrTy);
+  auto fatbinStrSymbol =
+      mlir::FlatSymbolRefAttr::get(fatbinStr.getSymNameAttr());
+  auto fatbinInit = GlobalViewAttr::get(voidPtrTy, fatbinStrSymbol);
   auto unusedInit = builder.getConstNullPtrAttr(voidPtrTy);
   fatbinWrapper.setInitialValueAttr(cir::ConstStructAttr::get(
       fatbinWrapperType,
       ArrayAttr::get(&getContext(),
                      {magicInit, versionInit, fatbinInit, unusedInit})));
 
+  // GPU fat binary handle is also a global variable in OG.
+  std::string gpubinHandleName =
+      addUnderscoredPrefix(cudaPrefix, "_gpubin_handle");
+  auto gpubinHandle = builder.create<GlobalOp>(
+      loc, gpubinHandleName, voidPtrPtrTy,
+      /*isConstant=*/false, /*linkage=*/GlobalLinkageKind::InternalLinkage);
+  gpubinHandle.setInitialValueAttr(builder.getConstNullPtrAttr(voidPtrPtrTy));
+  gpubinHandle.setPrivate();
+
   // Declare this function:
   //    void **__{cuda|hip}RegisterFatBinary(void *);
 
-  std::string regFuncName = addUnderscoredPrefix("RegisterFatBinary");
+  std::string regFuncName =
+      addUnderscoredPrefix(cudaPrefix, "RegisterFatBinary");
   auto regFuncType = FuncType::get({voidPtrTy}, voidPtrPtrTy);
   auto regFunc = buildRuntimeFunction(builder, regFuncName, loc, regFuncType);
 
   // Create the module constructor.
 
-  std::string moduleCtorName = addUnderscoredPrefix("_module_ctor");
+  std::string moduleCtorName = addUnderscoredPrefix(cudaPrefix, "_module_ctor");
   auto moduleCtor = buildRuntimeFunction(builder, moduleCtorName, loc,
                                          FuncType::get({}, voidTy),
                                          GlobalLinkageKind::InternalLinkage);
   globalCtorList.push_back(GlobalCtorAttr::get(&getContext(), moduleCtorName));
   builder.setInsertionPointToStart(moduleCtor.addEntryBlock());
 
-  auto wrapper = builder.createGetGlobal(fatbinWrapper);
-  // Put fatbinStr inside fatbinWrapper.
-  mlir::Value fatbinStrValue = builder.createGetGlobal(fatbinStr);
-  mlir::Value fatbinField = builder.createGetMemberOp(loc, wrapper, "", 2);
-  builder.createStore(loc, fatbinStrValue, fatbinField);
-
   // Register binary with CUDA runtime. This is substantially different in
   // default mode vs. separate compilation.
   // Corresponding code:
   //     gpuBinaryHandle = __cudaRegisterFatBinary(&fatbinWrapper);
+  auto wrapper = builder.createGetGlobal(fatbinWrapper);
   auto fatbinVoidPtr = builder.createBitcast(wrapper, voidPtrTy);
-  builder.createCallOp(loc, regFunc, fatbinVoidPtr);
+  auto gpuBinaryHandleCall = builder.createCallOp(loc, regFunc, fatbinVoidPtr);
+  auto gpuBinaryHandle = gpuBinaryHandleCall.getResult();
+  // Store the value back to the global `__cuda_gpubin_handle`.
+  auto gpuBinaryHandleGlobal = builder.createGetGlobal(gpubinHandle);
+  builder.createStore(loc, gpuBinaryHandle, gpuBinaryHandleGlobal);
+
+  // Generate __cuda_register_globals and call it.
+  std::optional<FuncOp> regGlobal = buildCUDARegisterGlobals();
+  if (regGlobal) {
+    builder.createCallOp(loc, *regGlobal, gpuBinaryHandle);
+  }
 
-  // This is currently incomplete.
-  // TODO(cir): create __cuda_register_globals(), and call it here.
+  // From CUDA 10.1 onwards, we must call this function to end registration:
+  //      void __cudaRegisterFatBinaryEnd(void **fatbinHandle);
+  // This is CUDA-specific, so no need to use `addUnderscoredPrefix`.
+  if (clang::CudaFeatureEnabled(
+          astCtx->getTargetInfo().getSDKVersion(),
+          clang::CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
+    cir::CIRBaseBuilderTy globalBuilder(getContext());
+    globalBuilder.setInsertionPointToStart(theModule.getBody());
+    FuncOp endFunc =
+        buildRuntimeFunction(globalBuilder, "__cudaRegisterFatBinaryEnd", loc,
+                             FuncType::get({voidPtrPtrTy}, voidTy));
+    builder.createCallOp(loc, endFunc, gpuBinaryHandle);
+  }
 
   builder.create<cir::ReturnOp>(loc);
 }
 
+std::optional<FuncOp> LoweringPreparePass::buildCUDARegisterGlobals() {
+  // There is nothing to register.
+  if (cudaKernelMap.empty())
+    return {};
+
+  cir::CIRBaseBuilderTy builder(getContext());
+  builder.setInsertionPointToStart(theModule.getBody());
+
+  auto loc = theModule.getLoc();
+  auto cudaPrefix = getCUDAPrefix(astCtx);
+
+  auto voidTy = VoidType::get(&getContext());
+  auto voidPtrPtrTy = PointerType::get(PointerType::get(voidTy));
+
+  // Create the function:
+  //      void __cuda_register_globals(void **fatbinHandle)
+  std::string regGlobalFuncName =
+      addUnderscoredPrefix(cudaPrefix, "_register_globals");
+  auto regGlobalFuncTy = FuncType::get({voidPtrPtrTy}, voidTy);
+  FuncOp regGlobalFunc =
+      buildRuntimeFunction(builder, regGlobalFuncName, loc, regGlobalFuncTy,
+                           /*linkage=*/GlobalLinkageKind::InternalLinkage);
+  builder.setInsertionPointToStart(regGlobalFunc.addEntryBlock());
+
+  buildCUDARegisterGlobalFunctions(builder, regGlobalFunc);
+
+  // TODO(cir): registration for global variables.
+
+  builder.create<ReturnOp>(loc);
+  return regGlobalFunc;
+}
+
+void LoweringPreparePass::buildCUDARegisterGlobalFunctions(
+    cir::CIRBaseBuilderTy &builder, FuncOp regGlobalFunc) {
+  auto loc = theModule.getLoc();
+  auto cudaPrefix = getCUDAPrefix(astCtx);
+
+  auto voidTy = VoidType::get(&getContext());
+  auto voidPtrTy = PointerType::get(voidTy);
+  auto voidPtrPtrTy = PointerType::get(voidPtrTy);
+  auto intTy = datalayout->getIntType(&getContext());
+  auto charTy = datalayout->getCharType(&getContext());
+
+  // Extract the GPU binary handle argument.
+  mlir::Value fatbinHandle = *regGlobalFunc.args_begin();
+
+  cir::CIRBaseBuilderTy globalBuilder(getContext());
+  globalBuilder.setInsertionPointToStart(theModule.getBody());
+
+  // Declare CUDA internal functions:
+  // int __cudaRegisterFunction(
+  //   void **fatbinHandle,
+  //   const char *hostFunc,
+  //   char *deviceFunc,
+  //   const char *deviceName,
+  //   int threadLimit,
+  //   uint3 *tid, uint3 *bid, dim3 *bDim, dim3 *gDim,
+  //   int *wsize
+  // )
+  // OG doesn't care about the types at all. They're treated as void*.
+
+  FuncOp cudaRegisterFunction = buildRuntimeFunction(
+      globalBuilder, addUnderscoredPrefix(cudaPrefix, "RegisterFunction"), loc,
+      FuncType::get({voidPtrPtrTy, voidPtrTy, voidPtrTy, voidPtrTy, intTy,
+                     voidPtrTy, voidPtrTy, voidPtrTy, voidPtrTy, voidPtrTy},
+                    intTy));
+
+  auto makeConstantString = [&](llvm::StringRef str) -> GlobalOp {
+    auto strType = ArrayType::get(&getContext(), charTy, 1 + str.size());
+
+    auto tmpString = globalBuilder.create<GlobalOp>(
+        loc, (".str" + str).str(), strType, /*isConstant=*/true,
+        /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage);
+
+    // We must make the string zero-terminated.
+    tmpString.setInitialValueAttr(ConstArrayAttr::get(
+        strType, StringAttr::get(&getContext(), str + "\0")));
+    tmpString.setPrivate();
+    return tmpString;
+  };
+
+  auto cirNullPtr = builder.getNullPtr(voidPtrTy, loc);
+  for (auto kernelName : cudaKernelMap.keys()) {
+    FuncOp deviceStub = cudaKernelMap[kernelName];
+    GlobalOp deviceFuncStr = makeConstantString(kernelName);
+    mlir::Value deviceFunc = builder.createBitcast(
+        builder.createGetGlobal(deviceFuncStr), voidPtrTy);
+    mlir::Value hostFunc = builder.createBitcast(
+        builder.create<GetGlobalOp>(
+            loc, PointerType::get(deviceStub.getFunctionType()),
+            mlir::FlatSymbolRefAttr::get(deviceStub.getSymNameAttr())),
+        voidPtrTy);
+    builder.createCallOp(
+        loc, cudaRegisterFunction,
+        {fatbinHandle, hostFunc, deviceFunc, deviceFunc,
+         builder.create<ConstantOp>(loc, IntAttr::get(intTy, -1)), cirNullPtr,
+         cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr});
+  }
+}
+
 void LoweringPreparePass::lowerDynamicCastOp(DynamicCastOp op) {
   CIRBaseBuilderTy builder(getContext());
   builder.setInsertionPointAfter(op);
@@ -1378,11 +1517,10 @@ void LoweringPreparePass::runOnOp(Operation *op) {
       globalDtorList.push_back(globalDtor);
     }
     if (auto attr = fnOp.getExtraAttrs().getElements().get(
-            CIRDialect::getCUDABinaryHandleAttrName())) {
-      auto cudaBinaryAttr = dyn_cast<CUDABinaryHandleAttr>(attr);
-      std::string kernelName = cudaBinaryAttr.getName();
-      llvm::StringRef stubName = fnOp.getSymName();
-      cudaKernelMap[stubName] = kernelName;
+            CUDAKernelNameAttr::getMnemonic())) {
+      auto cudaBinaryAttr = dyn_cast<CUDAKernelNameAttr>(attr);
+      std::string kernelName = cudaBinaryAttr.getKernelName();
+      cudaKernelMap[kernelName] = fnOp;
     }
     if (std::optional<mlir::ArrayAttr> annotations = fnOp.getAnnotations())
       addGlobalAnnotations(fnOp, annotations.value());
@@ -1399,6 +1537,9 @@ void LoweringPreparePass::runOnOperation() {
     datalayout.emplace(theModule);
   }
 
+  auto typeSizeInfo = cast<TypeSizeInfoAttr>(
+      theModule->getAttr(CIRDialect::getTypeSizeInfoAttrName()));
+
   llvm::SmallVector<Operation *> opsToTransform;
 
   op->walk([&](Operation *op) {
diff --git a/clang/test/CIR/CodeGen/CUDA/registration.cu b/clang/test/CIR/CodeGen/CUDA/registration.cu
index 39e534a2fa97..4c80958efb0d 100644
--- a/clang/test/CIR/CodeGen/CUDA/registration.cu
+++ b/clang/test/CIR/CodeGen/CUDA/registration.cu
@@ -13,56 +13,82 @@
 // RUN:            %s -o %t.ll
 // RUN: FileCheck --check-prefix=LLVM-HOST --input-file=%t.ll %s
 
-// COM: OG doesn't emit anything if there is nothing to register.
-// COM: Here we still emit the template for test purposes,
-// COM: and the behaviour will be fixed later.
-
 // CIR-HOST: module @"{{.*}}" attributes {
 // CIR-HOST:   cir.cu.binary_handle = #cir.cu.binary_handle<{{.*}}.fatbin>,
 // CIR-HOST:   cir.global_ctors = [#cir.global_ctor<"__cuda_module_ctor", {{[0-9]+}}>]
 // CIR-HOST: }
 
+// CIR-HOST: cir.global "private" constant cir_private @".str_Z2fnv" =
+// CIR-HOST-SAME: #cir.const_array<"_Z2fnv", trailing_zeros>
+
+// COM: In OG this variable has an `unnamed_addr` attribute.
+// LLVM-HOST: @.str_Z2fnv = private constant [7 x i8] c"_Z2fnv\00"
+
+// The corresponding CIR test for these three variables are down below.
+// They are here because LLVM IR puts global variables at the front of file.
+
+// LLVM-HOST: @__cuda_fatbin_str = private constant [14 x i8] c"sample fatbin\0A", section ".nv_fatbin"
+// LLVM-HOST: @__cuda_fatbin_wrapper = internal constant {
+// LLVM-HOST:   i32 1180844977, i32 1, ptr @__cuda_fatbin_str, ptr null
+// LLVM-HOST: }
+// LLVM-HOST: @llvm.global_ctors = {{.*}}ptr @__cuda_module_ctor
+
+__global__ void fn() {}
+
+// CIR-HOST: cir.func internal private @__cuda_register_globals(%[[FatbinHandle:[a-zA-Z0-9]+]]{{.*}}) {
+// CIR-HOST:   %[[#NULL:]] = cir.const #cir.ptr<null>
+// CIR-HOST:   %[[#T1:]] = cir.get_global @".str_Z2fnv"
+// CIR-HOST:   %[[#DeviceFn:]] = cir.cast(bitcast, %[[#T1]]
+// CIR-HOST:   %[[#T2:]] = cir.get_global @_Z17__device_stub__fnv
+// CIR-HOST:   %[[#HostFn:]] = cir.cast(bitcast, %[[#T2]]
+// CIR-HOST:   %[[#MinusOne:]] = cir.const #cir.int<-1>
+// CIR-HOST:   cir.call @__cudaRegisterFunction(
+// CIR-HOST-SAME: %[[FatbinHandle]],
+// CIR-HOST-SAME: %[[#HostFn]],
+// CIR-HOST-SAME: %[[#DeviceFn]],
+// CIR-HOST-SAME: %[[#DeviceFn]],
+// CIR-HOST-SAME: %[[#MinusOne]],
+// CIR-HOST-SAME: %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]])
+// CIR-HOST: }
+
+// LLVM-HOST: define internal void @__cuda_register_globals(ptr %[[#LLVMFatbin:]]) {
+// LLVM-HOST:   call i32 @__cudaRegisterFunction(
+// LLVM-HOST-SAME: ptr %[[#LLVMFatbin]],
+// LLVM-HOST-SAME: ptr @_Z17__device_stub__fnv,
+// LLVM-HOST-SAME: ptr @.str_Z2fnv,
+// LLVM-HOST-SAME: ptr @.str_Z2fnv,
+// LLVM-HOST-SAME: i32 -1,
+// LLVM-HOST-SAME: ptr null, ptr null, ptr null, ptr null, ptr null)
+// LLVM-HOST: }
+
 // The content in const array should be the same as echoed above,
 // with a trailing line break ('\n', 0x0A).
 // CIR-HOST: cir.global "private" constant cir_private @__cuda_fatbin_str =
 // CIR-HOST-SAME: #cir.const_array<"sample fatbin\0A">
 // CIR-HOST-SAME: {{.*}}section = ".nv_fatbin"
 
-// LLVM-HOST: @__cuda_fatbin_str = private constant [14 x i8] c"sample fatbin\0A", section ".nv_fatbin"
-
 // The first value is CUDA file head magic number.
-// CIR-HOST: cir.global "private" internal @__cuda_fatbin_wrapper
+// CIR-HOST: cir.global "private" constant internal @__cuda_fatbin_wrapper
 // CIR-HOST: = #cir.const_struct<{
 // CIR-HOST:   #cir.int<1180844977> : !s32i,
 // CIR-HOST:   #cir.int<1> : !s32i,
-// CIR-HOST:   #cir.ptr<null> : !cir.ptr<!void>,
+// CIR-HOST:   #cir.global_view<@__cuda_fatbin_str> : !cir.ptr<!void>,
 // CIR-HOST:   #cir.ptr<null> : !cir.ptr<!void>
 // CIR-HOST: }>
 // CIR-HOST-SAME: {{.*}}section = ".nvFatBinSegment"
 
-// COM: @__cuda_fatbin_wrapper is constant for OG.
-// COM: However, as we don't have a way to put @__cuda_fatbin_str directly
-// COM: to its third field in Clang IR, we can't mark this variable as 
-// COM: constant: we need to initialize it later, at the beginning
-// COM: of @__cuda_module_ctor.
-
-// LLVM-HOST: @__cuda_fatbin_wrapper = internal global {
-// LLVM-HOST:   i32 1180844977, i32 1, ptr null, ptr null
-// LLVM-HOST: }
-
-// LLVM-HOST: @llvm.global_ctors = {{.*}}ptr @__cuda_module_ctor
-
 // CIR-HOST: cir.func private @__cudaRegisterFatBinary
 // CIR-HOST: cir.func {{.*}} @__cuda_module_ctor() {
-// CIR-HOST:   %[[#F0:]] = cir.get_global @__cuda_fatbin_wrapper
-// CIR-HOST:   %[[#F1:]] = cir.get_global @__cuda_fatbin_str
-// CIR-HOST:   %[[#F2:]] = cir.get_member %[[#F0]][2]
-// CIR-HOST:   %[[#F3:]] = cir.cast(bitcast, %[[#F2]]
-// CIR-HOST:   cir.store %[[#F1]], %[[#F3]]
-// CIR-HOST:   cir.call @__cudaRegisterFatBinary
+// CIR-HOST:   %[[#Fatbin:]] = cir.call @__cudaRegisterFatBinary
+// CIR-HOST:   %[[#FatbinGlobal:]] = cir.get_global @__cuda_gpubin_handle
+// CIR-HOST:   cir.store %[[#Fatbin]], %[[#FatbinGlobal]]
+// CIR-HOST:   cir.call @__cuda_register_globals
+// CIR-HOTS:   cir.call @__cudaRegisterFatBinaryEnd
 // CIR-HOST: }
 
 // LLVM-HOST: define internal void @__cuda_module_ctor() {
-// LLVM-HOST:   store ptr @__cuda_fatbin_str, ptr getelementptr {{.*}}, ptr @__cuda_fatbin_wrapper
-// LLVM-HOST:   call ptr @__cudaRegisterFatBinary(ptr @__cuda_fatbin_wrapper)
+// LLVM-HOST:  %[[#LLVMFatbin:]] = call ptr @__cudaRegisterFatBinary(ptr @__cuda_fatbin_wrapper)
+// LLVM-HOST:  store ptr %[[#LLVMFatbin]], ptr @__cuda_gpubin_handle
+// LLVM-HOST:  call void @__cuda_register_globals
+// LLVM-HOST:  call void @__cudaRegisterFatBinaryEnd
 // LLVM-HOST: }