From 4c3c3a27fe537828ab0d8eb2b227016f3214c6ff Mon Sep 17 00:00:00 2001 From: AdUhTkJm <2292398666@qq.com> Date: Tue, 11 Mar 2025 08:04:45 +0000 Subject: [PATCH] [CIR][CUDA] Register __global__ functions --- .../clang/CIR/Dialect/IR/CIRDataLayout.h | 11 + clang/lib/CIR/Dialect/IR/CIRDataLayout.cpp | 12 + .../Dialect/Transforms/LoweringPrepare.cpp | 213 +++++++++++++++--- clang/test/CIR/CodeGen/CUDA/registration.cu | 82 ++++--- 4 files changed, 254 insertions(+), 64 deletions(-) diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h b/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h index 89a724594081..9db2dc568f9f 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h +++ b/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h @@ -14,6 +14,7 @@ #include "mlir/Dialect/DLTI/DLTI.h" #include "mlir/IR/BuiltinOps.h" +#include "clang/CIR/Dialect/IR/CIRAttrs.h" #include "clang/CIR/Dialect/IR/CIRTypes.h" #include "llvm/IR/DataLayout.h" #include "llvm/Support/Alignment.h" @@ -35,6 +36,8 @@ class CIRDataLayout { // The StructType -> StructLayout map. mutable void *LayoutMap = nullptr; + TypeSizeInfoAttr typeSizeInfo; + public: mlir::DataLayout layout; @@ -106,6 +109,14 @@ class CIRDataLayout { cir::IntType::get(Ty.getContext(), getPointerTypeSizeInBits(Ty), false); return IntTy; } + + mlir::Type getIntType(mlir::MLIRContext *ctx) const { + return typeSizeInfo.getIntType(ctx); + } + + mlir::Type getCharType(mlir::MLIRContext *ctx) const { + return typeSizeInfo.getCharType(ctx); + } }; /// Used to lazily calculate structure layout information for a target machine, diff --git a/clang/lib/CIR/Dialect/IR/CIRDataLayout.cpp b/clang/lib/CIR/Dialect/IR/CIRDataLayout.cpp index 9add51a35c53..0ccebdccf6a7 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDataLayout.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDataLayout.cpp @@ -1,4 +1,5 @@ #include "clang/CIR/Dialect/IR/CIRDataLayout.h" +#include "clang/CIR/Dialect/IR/CIRDialect.h" #include "clang/CIR/Dialect/IR/CIRTypes.h" #include "clang/CIR/MissingFeatures.h" #include "llvm/IR/DataLayout.h" @@ -112,6 +113,17 @@ class StructLayoutMap { CIRDataLayout::CIRDataLayout(mlir::ModuleOp modOp) : layout{modOp} { reset(modOp.getDataLayoutSpec()); + if (auto attr = modOp->getAttr(cir::CIRDialect::getTypeSizeInfoAttrName())) + typeSizeInfo = mlir::cast(attr); + else { + // Generate default size information. + auto voidPtrTy = PointerType::get(VoidType::get(modOp->getContext())); + llvm::TypeSize ptrSize = getTypeSizeInBits(voidPtrTy); + typeSizeInfo = + TypeSizeInfoAttr::get(modOp->getContext(), + /*char_size=*/8, /*int_size=*/32, + /*size_t_size=*/ptrSize.getFixedValue()); + } } void CIRDataLayout::reset(mlir::DataLayoutSpecInterface spec) { diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index 00852064b5c0..2cac74fb9308 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -123,13 +123,16 @@ struct LoweringPreparePass : public LoweringPrepareBase { /// CUDA related /// ------------ - // Maps CUDA device stub name to kernel name. - llvm::DenseMap cudaKernelMap; + // Maps CUDA kernel name to device stub function. + llvm::StringMap cudaKernelMap; void buildCUDAModuleCtor(); void buildCUDAModuleDtor(); std::optional buildCUDARegisterGlobals(); + void buildCUDARegisterGlobalFunctions(cir::CIRBaseBuilderTy &builder, + FuncOp regGlobalFunc); + /// /// AST related /// ----------- @@ -185,6 +188,18 @@ struct LoweringPreparePass : public LoweringPrepareBase { /// List of annotations in the module llvm::SmallVector globalAnnotations; }; + +std::string getCUDAPrefix(clang::ASTContext *astCtx) { + if (astCtx->getLangOpts().HIP) + return "hip"; + return "cuda"; +} + +std::string addUnderscoredPrefix(llvm::StringRef cudaPrefix, + llvm::StringRef cudaFunctionName) { + return ("__" + cudaPrefix + cudaFunctionName).str(); +} + } // namespace GlobalOp LoweringPreparePass::buildRuntimeVariable( @@ -983,6 +998,11 @@ void LoweringPreparePass::buildCUDAModuleCtor() { if (astCtx->getLangOpts().GPURelocatableDeviceCode) llvm_unreachable("NYI"); + // For CUDA without -fgpu-rdc, it's safe to stop generating ctor + // if there's nothing to register. + if (cudaKernelMap.empty()) + return; + // There's no device-side binary, so no need to proceed for CUDA. // HIP has to create an external symbol in this case, which is NYI. auto cudaBinaryHandleAttr = @@ -995,18 +1015,14 @@ void LoweringPreparePass::buildCUDAModuleCtor() { std::string cudaGPUBinaryName = cast(cudaBinaryHandleAttr).getName(); - llvm::StringRef prefix = "cuda"; - constexpr unsigned cudaFatMagic = 0x466243b1; constexpr unsigned hipFatMagic = 0x48495046; // "HIPF" + auto cudaPrefix = getCUDAPrefix(astCtx); + const unsigned fatMagic = astCtx->getLangOpts().HIP ? hipFatMagic : cudaFatMagic; - auto addUnderscoredPrefix = [&](llvm::StringRef name) -> std::string { - return ("__" + prefix + name).str(); - }; - // MAC OS X needs special care, but we haven't supported that in CIR yet. assert(!cir::MissingFeatures::checkMacOSXTriple()); @@ -1015,15 +1031,11 @@ void LoweringPreparePass::buildCUDAModuleCtor() { mlir::Location loc = theModule.getLoc(); - // Extract types from the module. - auto typeSizesAttr = cast( - theModule->getAttr(CIRDialect::getTypeSizeInfoAttrName())); - auto voidTy = VoidType::get(&getContext()); auto voidPtrTy = PointerType::get(voidTy); auto voidPtrPtrTy = PointerType::get(voidPtrTy); - auto intTy = typeSizesAttr.getIntType(&getContext()); - auto charTy = typeSizesAttr.getCharType(&getContext()); + auto intTy = datalayout->getIntType(&getContext()); + auto charTy = datalayout->getCharType(&getContext()); // Read the GPU binary and create a constant array for it. llvm::ErrorOr> cudaGPUBinaryOrErr = @@ -1046,7 +1058,7 @@ void LoweringPreparePass::buildCUDAModuleCtor() { // OG gives an empty name to this global constant, // which is not allowed in CIR. - std::string fatbinStrName = addUnderscoredPrefix("_fatbin_str"); + std::string fatbinStrName = addUnderscoredPrefix(cudaPrefix, "_fatbin_str"); GlobalOp fatbinStr = builder.create( loc, fatbinStrName, fatbinType, /*isConstant=*/true, /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage); @@ -1064,59 +1076,186 @@ void LoweringPreparePass::buildCUDAModuleCtor() { &getContext(), {intTy, intTy, voidPtrTy, voidPtrTy}, /*packed=*/false, /*padded=*/false, StructType::RecordKind::Struct); - std::string fatbinWrapperName = addUnderscoredPrefix("_fatbin_wrapper"); + std::string fatbinWrapperName = + addUnderscoredPrefix(cudaPrefix, "_fatbin_wrapper"); GlobalOp fatbinWrapper = builder.create( - loc, fatbinWrapperName, fatbinWrapperType, /*isConstant=*/false, + loc, fatbinWrapperName, fatbinWrapperType, /*isConstant=*/true, /*linkage=*/cir::GlobalLinkageKind::InternalLinkage); fatbinWrapper.setPrivate(); fatbinWrapper.setSection(fatbinSectionName); auto magicInit = IntAttr::get(intTy, fatMagic); auto versionInit = IntAttr::get(intTy, 1); - // `fatbinInit` is only a placeholder. The value will be initialized at the - // beginning of module ctor. - auto fatbinInit = builder.getConstNullPtrAttr(voidPtrTy); + auto fatbinStrSymbol = + mlir::FlatSymbolRefAttr::get(fatbinStr.getSymNameAttr()); + auto fatbinInit = GlobalViewAttr::get(voidPtrTy, fatbinStrSymbol); auto unusedInit = builder.getConstNullPtrAttr(voidPtrTy); fatbinWrapper.setInitialValueAttr(cir::ConstStructAttr::get( fatbinWrapperType, ArrayAttr::get(&getContext(), {magicInit, versionInit, fatbinInit, unusedInit}))); + // GPU fat binary handle is also a global variable in OG. + std::string gpubinHandleName = + addUnderscoredPrefix(cudaPrefix, "_gpubin_handle"); + auto gpubinHandle = builder.create( + loc, gpubinHandleName, voidPtrPtrTy, + /*isConstant=*/false, /*linkage=*/GlobalLinkageKind::InternalLinkage); + gpubinHandle.setInitialValueAttr(builder.getConstNullPtrAttr(voidPtrPtrTy)); + gpubinHandle.setPrivate(); + // Declare this function: // void **__{cuda|hip}RegisterFatBinary(void *); - std::string regFuncName = addUnderscoredPrefix("RegisterFatBinary"); + std::string regFuncName = + addUnderscoredPrefix(cudaPrefix, "RegisterFatBinary"); auto regFuncType = FuncType::get({voidPtrTy}, voidPtrPtrTy); auto regFunc = buildRuntimeFunction(builder, regFuncName, loc, regFuncType); // Create the module constructor. - std::string moduleCtorName = addUnderscoredPrefix("_module_ctor"); + std::string moduleCtorName = addUnderscoredPrefix(cudaPrefix, "_module_ctor"); auto moduleCtor = buildRuntimeFunction(builder, moduleCtorName, loc, FuncType::get({}, voidTy), GlobalLinkageKind::InternalLinkage); globalCtorList.push_back(GlobalCtorAttr::get(&getContext(), moduleCtorName)); builder.setInsertionPointToStart(moduleCtor.addEntryBlock()); - auto wrapper = builder.createGetGlobal(fatbinWrapper); - // Put fatbinStr inside fatbinWrapper. - mlir::Value fatbinStrValue = builder.createGetGlobal(fatbinStr); - mlir::Value fatbinField = builder.createGetMemberOp(loc, wrapper, "", 2); - builder.createStore(loc, fatbinStrValue, fatbinField); - // Register binary with CUDA runtime. This is substantially different in // default mode vs. separate compilation. // Corresponding code: // gpuBinaryHandle = __cudaRegisterFatBinary(&fatbinWrapper); + auto wrapper = builder.createGetGlobal(fatbinWrapper); auto fatbinVoidPtr = builder.createBitcast(wrapper, voidPtrTy); - builder.createCallOp(loc, regFunc, fatbinVoidPtr); + auto gpuBinaryHandleCall = builder.createCallOp(loc, regFunc, fatbinVoidPtr); + auto gpuBinaryHandle = gpuBinaryHandleCall.getResult(); + // Store the value back to the global `__cuda_gpubin_handle`. + auto gpuBinaryHandleGlobal = builder.createGetGlobal(gpubinHandle); + builder.createStore(loc, gpuBinaryHandle, gpuBinaryHandleGlobal); + + // Generate __cuda_register_globals and call it. + std::optional regGlobal = buildCUDARegisterGlobals(); + if (regGlobal) { + builder.createCallOp(loc, *regGlobal, gpuBinaryHandle); + } - // This is currently incomplete. - // TODO(cir): create __cuda_register_globals(), and call it here. + // From CUDA 10.1 onwards, we must call this function to end registration: + // void __cudaRegisterFatBinaryEnd(void **fatbinHandle); + // This is CUDA-specific, so no need to use `addUnderscoredPrefix`. + if (clang::CudaFeatureEnabled( + astCtx->getTargetInfo().getSDKVersion(), + clang::CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) { + cir::CIRBaseBuilderTy globalBuilder(getContext()); + globalBuilder.setInsertionPointToStart(theModule.getBody()); + FuncOp endFunc = + buildRuntimeFunction(globalBuilder, "__cudaRegisterFatBinaryEnd", loc, + FuncType::get({voidPtrPtrTy}, voidTy)); + builder.createCallOp(loc, endFunc, gpuBinaryHandle); + } builder.create(loc); } +std::optional LoweringPreparePass::buildCUDARegisterGlobals() { + // There is nothing to register. + if (cudaKernelMap.empty()) + return {}; + + cir::CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointToStart(theModule.getBody()); + + auto loc = theModule.getLoc(); + auto cudaPrefix = getCUDAPrefix(astCtx); + + auto voidTy = VoidType::get(&getContext()); + auto voidPtrPtrTy = PointerType::get(PointerType::get(voidTy)); + + // Create the function: + // void __cuda_register_globals(void **fatbinHandle) + std::string regGlobalFuncName = + addUnderscoredPrefix(cudaPrefix, "_register_globals"); + auto regGlobalFuncTy = FuncType::get({voidPtrPtrTy}, voidTy); + FuncOp regGlobalFunc = + buildRuntimeFunction(builder, regGlobalFuncName, loc, regGlobalFuncTy, + /*linkage=*/GlobalLinkageKind::InternalLinkage); + builder.setInsertionPointToStart(regGlobalFunc.addEntryBlock()); + + buildCUDARegisterGlobalFunctions(builder, regGlobalFunc); + + // TODO(cir): registration for global variables. + + builder.create(loc); + return regGlobalFunc; +} + +void LoweringPreparePass::buildCUDARegisterGlobalFunctions( + cir::CIRBaseBuilderTy &builder, FuncOp regGlobalFunc) { + auto loc = theModule.getLoc(); + auto cudaPrefix = getCUDAPrefix(astCtx); + + auto voidTy = VoidType::get(&getContext()); + auto voidPtrTy = PointerType::get(voidTy); + auto voidPtrPtrTy = PointerType::get(voidPtrTy); + auto intTy = datalayout->getIntType(&getContext()); + auto charTy = datalayout->getCharType(&getContext()); + + // Extract the GPU binary handle argument. + mlir::Value fatbinHandle = *regGlobalFunc.args_begin(); + + cir::CIRBaseBuilderTy globalBuilder(getContext()); + globalBuilder.setInsertionPointToStart(theModule.getBody()); + + // Declare CUDA internal functions: + // int __cudaRegisterFunction( + // void **fatbinHandle, + // const char *hostFunc, + // char *deviceFunc, + // const char *deviceName, + // int threadLimit, + // uint3 *tid, uint3 *bid, dim3 *bDim, dim3 *gDim, + // int *wsize + // ) + // OG doesn't care about the types at all. They're treated as void*. + + FuncOp cudaRegisterFunction = buildRuntimeFunction( + globalBuilder, addUnderscoredPrefix(cudaPrefix, "RegisterFunction"), loc, + FuncType::get({voidPtrPtrTy, voidPtrTy, voidPtrTy, voidPtrTy, intTy, + voidPtrTy, voidPtrTy, voidPtrTy, voidPtrTy, voidPtrTy}, + intTy)); + + auto makeConstantString = [&](llvm::StringRef str) -> GlobalOp { + auto strType = ArrayType::get(&getContext(), charTy, 1 + str.size()); + + auto tmpString = globalBuilder.create( + loc, (".str" + str).str(), strType, /*isConstant=*/true, + /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage); + + // We must make the string zero-terminated. + tmpString.setInitialValueAttr(ConstArrayAttr::get( + strType, StringAttr::get(&getContext(), str + "\0"))); + tmpString.setPrivate(); + return tmpString; + }; + + auto cirNullPtr = builder.getNullPtr(voidPtrTy, loc); + for (auto kernelName : cudaKernelMap.keys()) { + FuncOp deviceStub = cudaKernelMap[kernelName]; + GlobalOp deviceFuncStr = makeConstantString(kernelName); + mlir::Value deviceFunc = builder.createBitcast( + builder.createGetGlobal(deviceFuncStr), voidPtrTy); + mlir::Value hostFunc = builder.createBitcast( + builder.create( + loc, PointerType::get(deviceStub.getFunctionType()), + mlir::FlatSymbolRefAttr::get(deviceStub.getSymNameAttr())), + voidPtrTy); + builder.createCallOp( + loc, cudaRegisterFunction, + {fatbinHandle, hostFunc, deviceFunc, deviceFunc, + builder.create(loc, IntAttr::get(intTy, -1)), cirNullPtr, + cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr}); + } +} + void LoweringPreparePass::lowerDynamicCastOp(DynamicCastOp op) { CIRBaseBuilderTy builder(getContext()); builder.setInsertionPointAfter(op); @@ -1378,11 +1517,10 @@ void LoweringPreparePass::runOnOp(Operation *op) { globalDtorList.push_back(globalDtor); } if (auto attr = fnOp.getExtraAttrs().getElements().get( - CIRDialect::getCUDABinaryHandleAttrName())) { - auto cudaBinaryAttr = dyn_cast(attr); - std::string kernelName = cudaBinaryAttr.getName(); - llvm::StringRef stubName = fnOp.getSymName(); - cudaKernelMap[stubName] = kernelName; + CUDAKernelNameAttr::getMnemonic())) { + auto cudaBinaryAttr = dyn_cast(attr); + std::string kernelName = cudaBinaryAttr.getKernelName(); + cudaKernelMap[kernelName] = fnOp; } if (std::optional annotations = fnOp.getAnnotations()) addGlobalAnnotations(fnOp, annotations.value()); @@ -1399,6 +1537,9 @@ void LoweringPreparePass::runOnOperation() { datalayout.emplace(theModule); } + auto typeSizeInfo = cast( + theModule->getAttr(CIRDialect::getTypeSizeInfoAttrName())); + llvm::SmallVector opsToTransform; op->walk([&](Operation *op) { diff --git a/clang/test/CIR/CodeGen/CUDA/registration.cu b/clang/test/CIR/CodeGen/CUDA/registration.cu index 39e534a2fa97..4c80958efb0d 100644 --- a/clang/test/CIR/CodeGen/CUDA/registration.cu +++ b/clang/test/CIR/CodeGen/CUDA/registration.cu @@ -13,56 +13,82 @@ // RUN: %s -o %t.ll // RUN: FileCheck --check-prefix=LLVM-HOST --input-file=%t.ll %s -// COM: OG doesn't emit anything if there is nothing to register. -// COM: Here we still emit the template for test purposes, -// COM: and the behaviour will be fixed later. - // CIR-HOST: module @"{{.*}}" attributes { // CIR-HOST: cir.cu.binary_handle = #cir.cu.binary_handle<{{.*}}.fatbin>, // CIR-HOST: cir.global_ctors = [#cir.global_ctor<"__cuda_module_ctor", {{[0-9]+}}>] // CIR-HOST: } +// CIR-HOST: cir.global "private" constant cir_private @".str_Z2fnv" = +// CIR-HOST-SAME: #cir.const_array<"_Z2fnv", trailing_zeros> + +// COM: In OG this variable has an `unnamed_addr` attribute. +// LLVM-HOST: @.str_Z2fnv = private constant [7 x i8] c"_Z2fnv\00" + +// The corresponding CIR test for these three variables are down below. +// They are here because LLVM IR puts global variables at the front of file. + +// LLVM-HOST: @__cuda_fatbin_str = private constant [14 x i8] c"sample fatbin\0A", section ".nv_fatbin" +// LLVM-HOST: @__cuda_fatbin_wrapper = internal constant { +// LLVM-HOST: i32 1180844977, i32 1, ptr @__cuda_fatbin_str, ptr null +// LLVM-HOST: } +// LLVM-HOST: @llvm.global_ctors = {{.*}}ptr @__cuda_module_ctor + +__global__ void fn() {} + +// CIR-HOST: cir.func internal private @__cuda_register_globals(%[[FatbinHandle:[a-zA-Z0-9]+]]{{.*}}) { +// CIR-HOST: %[[#NULL:]] = cir.const #cir.ptr +// CIR-HOST: %[[#T1:]] = cir.get_global @".str_Z2fnv" +// CIR-HOST: %[[#DeviceFn:]] = cir.cast(bitcast, %[[#T1]] +// CIR-HOST: %[[#T2:]] = cir.get_global @_Z17__device_stub__fnv +// CIR-HOST: %[[#HostFn:]] = cir.cast(bitcast, %[[#T2]] +// CIR-HOST: %[[#MinusOne:]] = cir.const #cir.int<-1> +// CIR-HOST: cir.call @__cudaRegisterFunction( +// CIR-HOST-SAME: %[[FatbinHandle]], +// CIR-HOST-SAME: %[[#HostFn]], +// CIR-HOST-SAME: %[[#DeviceFn]], +// CIR-HOST-SAME: %[[#DeviceFn]], +// CIR-HOST-SAME: %[[#MinusOne]], +// CIR-HOST-SAME: %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]]) +// CIR-HOST: } + +// LLVM-HOST: define internal void @__cuda_register_globals(ptr %[[#LLVMFatbin:]]) { +// LLVM-HOST: call i32 @__cudaRegisterFunction( +// LLVM-HOST-SAME: ptr %[[#LLVMFatbin]], +// LLVM-HOST-SAME: ptr @_Z17__device_stub__fnv, +// LLVM-HOST-SAME: ptr @.str_Z2fnv, +// LLVM-HOST-SAME: ptr @.str_Z2fnv, +// LLVM-HOST-SAME: i32 -1, +// LLVM-HOST-SAME: ptr null, ptr null, ptr null, ptr null, ptr null) +// LLVM-HOST: } + // The content in const array should be the same as echoed above, // with a trailing line break ('\n', 0x0A). // CIR-HOST: cir.global "private" constant cir_private @__cuda_fatbin_str = // CIR-HOST-SAME: #cir.const_array<"sample fatbin\0A"> // CIR-HOST-SAME: {{.*}}section = ".nv_fatbin" -// LLVM-HOST: @__cuda_fatbin_str = private constant [14 x i8] c"sample fatbin\0A", section ".nv_fatbin" - // The first value is CUDA file head magic number. -// CIR-HOST: cir.global "private" internal @__cuda_fatbin_wrapper +// CIR-HOST: cir.global "private" constant internal @__cuda_fatbin_wrapper // CIR-HOST: = #cir.const_struct<{ // CIR-HOST: #cir.int<1180844977> : !s32i, // CIR-HOST: #cir.int<1> : !s32i, -// CIR-HOST: #cir.ptr : !cir.ptr, +// CIR-HOST: #cir.global_view<@__cuda_fatbin_str> : !cir.ptr, // CIR-HOST: #cir.ptr : !cir.ptr // CIR-HOST: }> // CIR-HOST-SAME: {{.*}}section = ".nvFatBinSegment" -// COM: @__cuda_fatbin_wrapper is constant for OG. -// COM: However, as we don't have a way to put @__cuda_fatbin_str directly -// COM: to its third field in Clang IR, we can't mark this variable as -// COM: constant: we need to initialize it later, at the beginning -// COM: of @__cuda_module_ctor. - -// LLVM-HOST: @__cuda_fatbin_wrapper = internal global { -// LLVM-HOST: i32 1180844977, i32 1, ptr null, ptr null -// LLVM-HOST: } - -// LLVM-HOST: @llvm.global_ctors = {{.*}}ptr @__cuda_module_ctor - // CIR-HOST: cir.func private @__cudaRegisterFatBinary // CIR-HOST: cir.func {{.*}} @__cuda_module_ctor() { -// CIR-HOST: %[[#F0:]] = cir.get_global @__cuda_fatbin_wrapper -// CIR-HOST: %[[#F1:]] = cir.get_global @__cuda_fatbin_str -// CIR-HOST: %[[#F2:]] = cir.get_member %[[#F0]][2] -// CIR-HOST: %[[#F3:]] = cir.cast(bitcast, %[[#F2]] -// CIR-HOST: cir.store %[[#F1]], %[[#F3]] -// CIR-HOST: cir.call @__cudaRegisterFatBinary +// CIR-HOST: %[[#Fatbin:]] = cir.call @__cudaRegisterFatBinary +// CIR-HOST: %[[#FatbinGlobal:]] = cir.get_global @__cuda_gpubin_handle +// CIR-HOST: cir.store %[[#Fatbin]], %[[#FatbinGlobal]] +// CIR-HOST: cir.call @__cuda_register_globals +// CIR-HOTS: cir.call @__cudaRegisterFatBinaryEnd // CIR-HOST: } // LLVM-HOST: define internal void @__cuda_module_ctor() { -// LLVM-HOST: store ptr @__cuda_fatbin_str, ptr getelementptr {{.*}}, ptr @__cuda_fatbin_wrapper -// LLVM-HOST: call ptr @__cudaRegisterFatBinary(ptr @__cuda_fatbin_wrapper) +// LLVM-HOST: %[[#LLVMFatbin:]] = call ptr @__cudaRegisterFatBinary(ptr @__cuda_fatbin_wrapper) +// LLVM-HOST: store ptr %[[#LLVMFatbin]], ptr @__cuda_gpubin_handle +// LLVM-HOST: call void @__cuda_register_globals +// LLVM-HOST: call void @__cudaRegisterFatBinaryEnd // LLVM-HOST: }