From cdef6c6150fc76836e70e2adb5ec94f041f84893 Mon Sep 17 00:00:00 2001 From: AdUhTkJm <2292398666@qq.com> Date: Tue, 4 Mar 2025 13:09:31 +0000 Subject: [PATCH] [CIR][CUDA] Lowering device and shared variables --- .../clang/CIR/Dialect/IR/CIRCUDAAttrs.td | 31 +++++++++++++++++++ clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp | 20 ++++++++++++ clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h | 2 ++ clang/lib/CIR/CodeGen/CIRGenModule.cpp | 27 +++++++++------- .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 6 ++++ clang/test/CIR/CodeGen/CUDA/global-vars.cu | 10 +++++- 6 files changed, 84 insertions(+), 12 deletions(-) diff --git a/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td index e658bb49e815..34d563ed898f 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td @@ -35,6 +35,24 @@ def CUDAKernelNameAttr : CIR_Attr<"CUDAKernelName", let assemblyFormat = "`<` $kernel_name `>`"; } +def CUDAShadowNameAttr : CIR_Attr<"CUDAShadowName", + "cu.shadow_name"> { + let summary = "Device-side global variable name for this shadow."; + let description = + [{ + This attribute is attached to global variable definitions and records the + mangled name of the global variable used on the device. + + In CUDA, __device__, __constant__ and __shared__ variables, as well as + surface and texture variables, will generate a shadow symbol on host. + We must preserve the correspodence in order to generate registration + functions. + }]; + + let parameters = (ins "std::string":$device_side_name); + let assemblyFormat = "`<` $device_side_name `>`"; +} + def CUDABinaryHandleAttr : CIR_Attr<"CUDABinaryHandle", "cu.binary_handle"> { let summary = "Fat binary handle for device code."; @@ -52,4 +70,17 @@ def CUDABinaryHandleAttr : CIR_Attr<"CUDABinaryHandle", let assemblyFormat = "`<` $name `>`"; } +def CUDAExternallyInitializedAttr : CIR_Attr<"CUDAExternallyInitialized", + "cu.externally_initialized"> { + let summary = "The marked variable is externally initialized."; + let description = + [{ + CUDA __device__ and __constant__ variables, along with surface and + textures, might be initialized by host, hence "externally initialized". + Therefore they must be emitted even if they are not referenced. + + The attribute corresponds to the attribute on LLVM with the same name. + }]; +} + #endif // MLIR_CIR_DIALECT_CIR_CUDA_ATTRS diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp index c47663772aa1..fdac639ab35a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp @@ -283,3 +283,23 @@ mlir::Operation *CIRGenCUDARuntime::getKernelHandle(cir::FuncOp fn, return globalOp; } + +void CIRGenCUDARuntime::internalizeDeviceSideVar( + const VarDecl *d, cir::GlobalLinkageKind &linkage) { + if (cgm.getLangOpts().GPURelocatableDeviceCode) + llvm_unreachable("NYI"); + + // __shared__ variables are odd. Shadows do get created, but + // they are not registered with the CUDA runtime, so they + // can't really be used to access their device-side + // counterparts. It's not clear yet whether it's nvcc's bug or + // a feature, but we've got to do the same for compatibility. + if (d->hasAttr() || d->hasAttr() || + d->hasAttr()) { + linkage = cir::GlobalLinkageKind::InternalLinkage; + } + + if (d->getType()->isCUDADeviceBuiltinSurfaceType() || + d->getType()->isCUDADeviceBuiltinTextureType()) + llvm_unreachable("NYI"); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h index 700f939e3082..8cbadb849129 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h +++ b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h @@ -58,6 +58,8 @@ class CIRGenCUDARuntime { const CUDAKernelCallExpr *expr, ReturnValueSlot retValue); virtual mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl GD); + virtual void internalizeDeviceSideVar(const VarDecl *d, + cir::GlobalLinkageKind &linkage); }; } // namespace clang::CIRGen diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 313cc58a142c..2cee101065f3 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -569,13 +569,13 @@ bool CIRGenModule::shouldEmitCUDAGlobalVar(const VarDecl *global) const { // their device-side incarnations. if (global->hasAttr() || - global->hasAttr() || global->getType()->isCUDADeviceBuiltinSurfaceType() || global->getType()->isCUDADeviceBuiltinTextureType()) { llvm_unreachable("NYI"); } - return !langOpts.CUDAIsDevice || global->hasAttr(); + return !langOpts.CUDAIsDevice || global->hasAttr() || + global->hasAttr(); } void CIRGenModule::emitGlobal(GlobalDecl gd) { @@ -598,8 +598,10 @@ void CIRGenModule::emitGlobal(GlobalDecl gd) { assert(!global->hasAttr() && "NYI"); if (langOpts.CUDA || langOpts.HIP) { - // clang uses the same flag when building HIP code - if (langOpts.CUDAIsDevice) { + if (const auto *vd = dyn_cast(global)) { + if (!shouldEmitCUDAGlobalVar(vd)) + return; + } else if (langOpts.CUDAIsDevice) { // This will implicitly mark templates and their // specializations as __host__ __device__. if (langOpts.OffloadImplicitHostDeviceTemplates) @@ -621,11 +623,6 @@ void CIRGenModule::emitGlobal(GlobalDecl gd) { return; } } - - if (const auto *vd = dyn_cast(global)) { - if (!shouldEmitCUDAGlobalVar(vd)) - return; - } } if (langOpts.OpenMP) { @@ -1394,7 +1391,7 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *d, d->getType()->isCUDADeviceBuiltinTextureType()); if (getLangOpts().CUDA && (isCudaSharedVar || isCudaShadowVar || isCudaDeviceShadowVar)) - assert(0 && "not implemented"); + init = UndefAttr::get(&getMLIRContext(), convertType(d->getType())); else if (d->hasAttr()) assert(0 && "not implemented"); else if (!initExpr) { @@ -1490,11 +1487,19 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *d, cir::GlobalLinkageKind linkage = getCIRLinkageVarDefinition(d, /*IsConstant=*/false); - // TODO(cir): // CUDA B.2.1 "The __device__ qualifier declares a variable that resides on // the device. [...]" // CUDA B.2.2 "The __constant__ qualifier, optionally used together with // __device__, declares a variable that: [...] + if (langOpts.CUDA && langOpts.CUDAIsDevice) { + // __shared__ variables is not marked as externally initialized, + // because they must not be initialized. + if (linkage != cir::GlobalLinkageKind::InternalLinkage && + (d->hasAttr())) { + gv->setAttr(CUDAExternallyInitializedAttr::getMnemonic(), + CUDAExternallyInitializedAttr::get(&getMLIRContext())); + } + } // Set initializer and finalize emission CIRGenModule::setInitializer(gv, init); diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index abd1f33c448c..00a9edf8fa2c 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -2397,6 +2397,12 @@ mlir::LogicalResult CIRToLLVMGlobalOpLowering::matchAndRewrite( attributes.push_back(rewriter.getNamedAttr("visibility_", visibility)); + if (auto extInit = + op->getAttr(CUDAExternallyInitializedAttr::getMnemonic())) { + attributes.push_back(rewriter.getNamedAttr("externally_initialized", + rewriter.getUnitAttr())); + } + if (init.has_value()) { if (mlir::isa(init.value())) { // If a directly equivalent attribute is available, use it. diff --git a/clang/test/CIR/CodeGen/CUDA/global-vars.cu b/clang/test/CIR/CodeGen/CUDA/global-vars.cu index 5b1374c085eb..f6e630a7e797 100644 --- a/clang/test/CIR/CodeGen/CUDA/global-vars.cu +++ b/clang/test/CIR/CodeGen/CUDA/global-vars.cu @@ -5,7 +5,15 @@ // RUN: %s -o %t.cir // RUN: FileCheck --check-prefix=CIR-DEVICE --input-file=%t.cir %s +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir \ +// RUN: -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \ +// RUN: %s -o %t.cir +// RUN: FileCheck --check-prefix=LLVM-DEVICE --input-file=%t.cir %s __device__ int a; +// CIR-DEVICE: cir.global external addrspace(offload_global) @a = #cir.int<0> +// LLVM-DEVICE: @a = addrspace(1) externally_initialized global i32 0, align 4 -// CIR-DEVICE: cir.global external addrspace(offload_global) @a = #cir.int<0> : !s32i {alignment = 4 : i64} loc(#loc3) \ No newline at end of file +__shared__ int shared; +// CIR-DEVICE: cir.global external addrspace(offload_local) @shared = #cir.undef +// LLVM-DEVICE: @shared = addrspace(3) global i32 undef, align 4