diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index eddf3e9a47d0b..d710f7652b507 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -101,7 +101,7 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern { FunctionCallBuilder moduleLoadCallBuilder = { "mgpuModuleLoad", llvmPointerType /* void *module */, - {llvmPointerType /* void *cubin */}}; + {llvmPointerType /* void *cubin */, llvmInt64Type /* size_t size */}}; FunctionCallBuilder moduleUnloadCallBuilder = { "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}}; FunctionCallBuilder moduleGetFunctionCallBuilder = { @@ -125,7 +125,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern { llvmInt32Type, /* unsigned int sharedMemBytes */ llvmPointerType, /* void *hstream */ llvmPointerPointerType, /* void **kernelParams */ - llvmPointerPointerType /* void **extra */ + llvmPointerPointerType, /* void **extra */ + llvmInt64Type /* size_t paramsCount */ }}; FunctionCallBuilder streamCreateCallBuilder = { "mgpuStreamCreate", llvmPointerType /* void *stream */, {}}; @@ -1134,7 +1135,23 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite( loc, rewriter, nameBuffer.str(), binaryAttr.getValue(), LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers()); - auto module = moduleLoadCallBuilder.create(loc, rewriter, data); + // Pass the binary size. SPIRV requires binary size. 
+  auto gpuBlob = binaryAttr.getValue();
+  auto gpuBlobSize = rewriter.create<mlir::LLVM::ConstantOp>(
+      loc, llvmInt64Type,
+      mlir::IntegerAttr::get(llvmInt64Type,
+                             static_cast<int64_t>(gpuBlob.size())));
+
+  auto module =
+      moduleLoadCallBuilder.create(loc, rewriter, {data, gpuBlobSize});
+
+  // Pass the count of the parameters to runtime wrappers
+  auto paramsCount = rewriter.create<mlir::LLVM::ConstantOp>(
+      loc, llvmInt64Type,
+      mlir::IntegerAttr::get(
+          llvmInt64Type,
+          static_cast<int64_t>(launchOp.getNumKernelOperands())));
+
   // Get the function from the module. The name corresponds to the name of
   // the kernel function.
   auto kernelName = generateKernelNameConstant(
@@ -1158,7 +1175,7 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
       {function.getResult(), adaptor.getGridSizeX(), adaptor.getGridSizeY(),
        adaptor.getGridSizeZ(), adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
        adaptor.getBlockSizeZ(), dynamicSharedMemorySize, stream, kernelParams,
-       /*extra=*/nullpointer});
+       /*extra=*/nullpointer, paramsCount});
 
   if (launchOp.getAsyncToken()) {
     // Async launch: make dependent ops use the same stream.
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp index 1dba677ebe663..8a53d99c778a6 100644 --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -119,7 +119,8 @@ static bool cusparseLt_initiated = false; #endif // MLIR_ENABLE_CUDA_CUSPARSELT #endif // MLIR_ENABLE_CUDA_CUSPARSE -extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule +mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) { ScopedContext scopedContext; CUmodule module = nullptr; CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data)); @@ -144,7 +145,7 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem, CUstream stream, void **params, - void **extra) { + void **extra, size_t /*paramsCount*/) { ScopedContext scopedContext; int32_t maxShmem = 0; CUdevice device = getDefaultCuDevice(); diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp index bd3868a8e196f..998ff5b8b829f 100644 --- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp @@ -32,7 +32,7 @@ thread_local static int32_t defaultDevice = 0; -extern "C" hipModule_t mgpuModuleLoad(void *data) { +extern "C" hipModule_t mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) { hipModule_t module = nullptr; HIP_REPORT_IF_ERROR(hipModuleLoadData(&module, data)); return module; @@ -57,7 +57,7 @@ extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX, intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem, hipStream_t stream, void **params, - void **extra) { + void **extra, size_t /*paramsCount*/) { HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, smem, 
stream, params, extra)); diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir index 2cdc4e8dbb1ad..b4efe0714aab9 100644 --- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir @@ -34,8 +34,9 @@ module attributes {gpu.container_module} { // CHECK: [[ADDRESSOF:%.*]] = llvm.mlir.addressof @[[GLOBAL]] // CHECK: [[BINARY:%.*]] = llvm.getelementptr [[ADDRESSOF]]{{\[}}0, 0] // CHECK-SAME: -> !llvm.ptr - - // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]]) + // CHECK: [[BINARYSIZE:%.*]] = llvm.mlir.constant + // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]], [[BINARYSIZE]]) + // CHECK: [[PARAMSCOUNT:%.*]] = llvm.mlir.constant // CHECK: [[FUNC:%.*]] = llvm.call @mgpuModuleGetFunction([[MODULE]], {{.*}}) // CHECK: [[STREAM:%.*]] = llvm.call @mgpuStreamCreate @@ -56,7 +57,7 @@ module attributes {gpu.container_module} { // CHECK: llvm.call @mgpuLaunchKernel([[FUNC]], [[C8]], [[C8]], [[C8]], // CHECK-SAME: [[C8]], [[C8]], [[C8]], [[C256]], [[STREAM]], - // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]]) + // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]], [[PARAMSCOUNT]]) // CHECK: llvm.call @mgpuStreamSynchronize // CHECK: llvm.call @mgpuStreamDestroy // CHECK: llvm.call @mgpuModuleUnload