-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[MLIR] Pass count of parameters & gpu binary size to runtime wrappers #66154
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-execution-engine ChangesThis PR is a breakdown of the big PR #65539 which enables intel gpu integration. In this PR we pass count of parameters and size of gpu binary to runtime wrappers since the SyclRuntimeWrappers (which will come in subsequent PR) requires the spirv size for compilation and also the number of parameters to iterate over the params. -- Full diff: https://github.com//pull/66154.diff — 4 Files Affected:
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index eddf3e9a47d0bc8..d710f7652b507cb 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -101,7 +101,7 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> { FunctionCallBuilder moduleLoadCallBuilder = { "mgpuModuleLoad", llvmPointerType /* void *module */, - {llvmPointerType /* void *cubin */}}; + {llvmPointerType /* void *cubin */, llvmInt64Type /* size_t size */}}; FunctionCallBuilder moduleUnloadCallBuilder = { "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}}; FunctionCallBuilder moduleGetFunctionCallBuilder = { @@ -125,7 +125,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> { llvmInt32Type, /* unsigned int sharedMemBytes */ llvmPointerType, /* void *hstream */ llvmPointerPointerType, /* void **kernelParams */ - llvmPointerPointerType /* void **extra */ + llvmPointerPointerType, /* void **extra */ + llvmInt64Type /* size_t paramsCount */ }}; FunctionCallBuilder streamCreateCallBuilder = { "mgpuStreamCreate", llvmPointerType /* void *stream */, {}}; @@ -1134,7 +1135,23 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite( loc, rewriter, nameBuffer.str(), binaryAttr.getValue(), LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers()); - auto module = moduleLoadCallBuilder.create(loc, rewriter, data); + // Pass the binary size. SPIRV requires binary size. 
+ auto gpuBlob = binaryAttr.getValue(); + auto gpuBlobSize = rewriter.create<mlir::LLVM::ConstantOp>( + loc, llvmInt64Type, + mlir::IntegerAttr::get(llvmInt64Type, + static_cast<int64_t>(gpuBlob.size()))); + + auto module = + moduleLoadCallBuilder.create(loc, rewriter, {data, gpuBlobSize}); + + // Pass the count of the parameters to runtime wrappers + auto paramsCount = rewriter.create<mlir::LLVM::ConstantOp>( + loc, llvmInt64Type, + mlir::IntegerAttr::get( + llvmInt64Type, + static_cast<int64_t>(launchOp.getNumKernelOperands()))); + // Get the function from the module. The name corresponds to the name of // the kernel function. auto kernelName = generateKernelNameConstant( @@ -1158,7 +1175,7 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite( {function.getResult(), adaptor.getGridSizeX(), adaptor.getGridSizeY(), adaptor.getGridSizeZ(), adaptor.getBlockSizeX(), adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(), dynamicSharedMemorySize, stream, kernelParams, - /*extra=*/nullpointer}); + /*extra=*/nullpointer, paramsCount}); if (launchOp.getAsyncToken()) { // Async launch: make dependent ops use the same stream. 
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp index 1dba677ebe66365..8a53d99c778a63a 100644 --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -119,7 +119,8 @@ static bool cusparseLt_initiated = false; #endif // MLIR_ENABLE_CUDA_CUSPARSELT #endif // MLIR_ENABLE_CUDA_CUSPARSE -extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule +mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) { ScopedContext scopedContext; CUmodule module = nullptr; CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data)); @@ -144,7 +145,7 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem, CUstream stream, void **params, - void **extra) { + void **extra, size_t /*paramsCount*/) { ScopedContext scopedContext; int32_t maxShmem = 0; CUdevice device = getDefaultCuDevice(); diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp index bd3868a8e196f6f..998ff5b8b829f88 100644 --- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp @@ -32,7 +32,7 @@ thread_local static int32_t defaultDevice = 0; -extern "C" hipModule_t mgpuModuleLoad(void *data) { +extern "C" hipModule_t mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) { hipModule_t module = nullptr; HIP_REPORT_IF_ERROR(hipModuleLoadData(&module, data)); return module; @@ -57,7 +57,7 @@ extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX, intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem, hipStream_t stream, void **params, - void **extra) { + void **extra, size_t /*paramsCount*/) { HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, 
blockZ, smem, stream, params, extra)); diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir index 2cdc4e8dbb1ad67..b4efe0714aab9aa 100644 --- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir @@ -34,8 +34,9 @@ module attributes {gpu.container_module} { // CHECK: [[ADDRESSOF:%.*]] = llvm.mlir.addressof @[[GLOBAL]] // CHECK: [[BINARY:%.*]] = llvm.getelementptr [[ADDRESSOF]]{{\[}}0, 0] // CHECK-SAME: -> !llvm.ptr - - // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]]) + // CHECK: [[BINARYSIZE:%.*]] = llvm.mlir.constant + // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]], [[BINARYSIZE]]) + // CHECK: [[PARAMSCOUNT:%.*]] = llvm.mlir.constant // CHECK: [[FUNC:%.*]] = llvm.call @mgpuModuleGetFunction([[MODULE]], {{.*}}) // CHECK: [[STREAM:%.*]] = llvm.call @mgpuStreamCreate @@ -56,7 +57,7 @@ module attributes {gpu.container_module} { // CHECK: llvm.call @mgpuLaunchKernel([[FUNC]], [[C8]], [[C8]], [[C8]], // CHECK-SAME: [[C8]], [[C8]], [[C8]], [[C256]], [[STREAM]], - // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]]) + // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]], [[PARAMSCOUNT]]) // CHECK: llvm.call @mgpuStreamSynchronize // CHECK: llvm.call @mgpuStreamDestroy // CHECK: llvm.call @mgpuModuleUnload |
Ping for review. |
@joker-eph @grypp ping for review |
joker-eph
approved these changes
Sep 25, 2023
@grypp @joker-eph can one of you guys help me merge this? |
legrosbuffle
pushed a commit
to legrosbuffle/llvm-project
that referenced
this pull request
Sep 29, 2023
…llvm#66154) This PR is a breakdown of the big PR llvm#65539 which enables intel gpu integration. In this PR we pass count of parameters and size of gpu binary to runtime wrappers since the SyclRuntimeWrappers (which will come in subsequent PR) requires the spirv size for compilation and also the number of parameters to iterate over the params.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
This PR is a breakdown of the big PR #65539 which enables intel gpu integration. In this PR we pass count of parameters and size of gpu binary to runtime wrappers since the SyclRuntimeWrappers (which will come in subsequent PR) requires the spirv size for compilation and also the number of parameters to iterate over the params.