diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index cf6e545749ffc..8d7166f68df35 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -578,7 +578,10 @@ def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> {
     to control the set and binding if wanted.
   }];
   let constructor = "mlir::createConvertGPUToSPIRVPass()";
-  let dependentDialects = ["spirv::SPIRVDialect"];
+  let dependentDialects = [
+    "func::FuncDialect",
+    "spirv::SPIRVDialect",
+  ];
   let options = [
     Option<"use64bitIndex", "use-64bit-index",
            "bool", /*default=*/"false",
diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
index 272e3de8723ae..ae89774239b58 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Conversion/FuncToSPIRV/FuncToSPIRV.h"
 #include "mlir/Conversion/GPUToSPIRV/GPUToSPIRV.h"
 #include "mlir/Conversion/MemRefToSPIRV/MemRefToSPIRV.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
@@ -54,22 +55,47 @@ void GPUToSPIRVPass::runOnOperation() {
 
   SmallVector<Operation *, 1> gpuModules;
   OpBuilder builder(context);
+
+  auto targetEnvSupportsKernelCapability = [](gpu::GPUModuleOp moduleOp) {
+    Operation *gpuModule = moduleOp.getOperation();
+    auto targetAttr = spirv::lookupTargetEnvOrDefault(gpuModule);
+    spirv::TargetEnv targetEnv(targetAttr);
+    return targetEnv.allows(spirv::Capability::Kernel);
+  };
+
   module.walk([&](gpu::GPUModuleOp moduleOp) {
     // Clone each GPU kernel module for conversion, given that the GPU
     // launch op still needs the original GPU kernel module.
-    builder.setInsertionPoint(moduleOp.getOperation());
+    // For Vulkan Shader capabilities, we insert the newly converted SPIR-V
+    // module right after the original GPU module, as that's the expectation of
+    // the in-tree Vulkan runner.
+    // For OpenCL Kernel capabilities, we insert the newly converted SPIR-V
+    // module inside the original GPU module, as that's the expectaion of the
+    // normal GPU compilation pipeline.
+    if (targetEnvSupportsKernelCapability(moduleOp)) {
+      builder.setInsertionPoint(moduleOp.getBody(),
+                                moduleOp.getBody()->begin());
+    } else {
+      builder.setInsertionPoint(moduleOp.getOperation());
+    }
     gpuModules.push_back(builder.clone(*moduleOp.getOperation()));
   });
 
   // Run conversion for each module independently as they can have different
   // TargetEnv attributes.
   for (Operation *gpuModule : gpuModules) {
+    spirv::TargetEnvAttr targetAttr =
+        spirv::lookupTargetEnvOrDefault(gpuModule);
+
     // Map MemRef memory space to SPIR-V storage class first if requested.
     if (mapMemorySpace) {
       std::unique_ptr<ConversionTarget> target =
           spirv::getMemorySpaceToStorageClassTarget(*context);
       spirv::MemorySpaceToStorageClassMap memorySpaceMap =
-          spirv::mapMemorySpaceToVulkanStorageClass;
+          targetEnvSupportsKernelCapability(
+              dyn_cast<gpu::GPUModuleOp>(gpuModule))
+              ? spirv::mapMemorySpaceToOpenCLStorageClass
+              : spirv::mapMemorySpaceToVulkanStorageClass;
       spirv::MemorySpaceToStorageClassConverter converter(memorySpaceMap);
 
       RewritePatternSet patterns(context);
@@ -79,7 +105,6 @@ void GPUToSPIRVPass::runOnOperation() {
         return signalPassFailure();
     }
 
-    auto targetAttr = spirv::lookupTargetEnvOrDefault(gpuModule);
     std::unique_ptr<ConversionTarget> target =
         SPIRVConversionTarget::get(targetAttr);
 
@@ -108,6 +133,25 @@ void GPUToSPIRVPass::runOnOperation() {
     if (failed(applyFullConversion(gpuModule, *target, std::move(patterns))))
       return signalPassFailure();
   }
+
+  // For OpenCL, the gpu.func op in the original gpu.module op needs to be
+  // replaced with an empty func.func op with the same arguments as the gpu.func
+  // op. The func.func op needs gpu.kernel attribute set.
+  module.walk([&](gpu::GPUModuleOp moduleOp) {
+    if (targetEnvSupportsKernelCapability(moduleOp)) {
+      moduleOp.walk([&](gpu::GPUFuncOp funcOp) {
+        builder.setInsertionPoint(funcOp);
+        auto newFuncOp = builder.create<func::FuncOp>(
+            funcOp.getLoc(), funcOp.getName(), funcOp.getFunctionType());
+        auto entryBlock = newFuncOp.addEntryBlock();
+        builder.setInsertionPointToEnd(entryBlock);
+        builder.create<func::ReturnOp>(funcOp.getLoc());
+        newFuncOp->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
+                           builder.getUnitAttr());
+        funcOp.erase();
+      });
+    }
+  });
 }
 
 } // namespace
diff --git a/mlir/test/Conversion/GPUToSPIRV/module-opencl.mlir b/mlir/test/Conversion/GPUToSPIRV/module-opencl.mlir
index 0aa50cc1e2529..4b8d17cd64493 100644
--- a/mlir/test/Conversion/GPUToSPIRV/module-opencl.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/module-opencl.mlir
@@ -12,6 +12,8 @@ module attributes {
     //  CHECK-SAME:     {{%.*}}: !spirv.ptr<!spirv.array<12 x f32>, CrossWorkgroup>
     //   CHECK-NOT:     spirv.interface_var_abi
     //  CHECK-SAME:     spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 4, 1]>
+    // CHECK-LABEL:   func.func @basic_module_structure
+    //  CHECK-SAME:     attributes {gpu.kernel}
     gpu.func @basic_module_structure(%arg0 : f32, %arg1 : memref<12xf32, #spirv.storage_class<CrossWorkgroup>>) kernel
         attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 4, 1]>} {
       gpu.return
@@ -45,6 +47,8 @@ module attributes {
     //  CHECK-SAME:     {{%.*}}: !spirv.ptr<!spirv.array<12 x f32>, CrossWorkgroup>
     //   CHECK-NOT:     spirv.interface_var_abi
     //  CHECK-SAME:     spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 4, 1]>
+    // CHECK-LABEL:   func.func @basic_module_structure
+    //  CHECK-SAME:     attributes {gpu.kernel}
     gpu.func @basic_module_structure(%arg0 : f32, %arg1 : memref<12xf32, #spirv.storage_class<CrossWorkgroup>>) kernel
         attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 4, 1]>} {
       gpu.return