diff --git a/lib/Backends/OpenCL/OpenCL.cpp b/lib/Backends/OpenCL/OpenCL.cpp index bcb3f2046b..77fd1aaf71 100644 --- a/lib/Backends/OpenCL/OpenCL.cpp +++ b/lib/Backends/OpenCL/OpenCL.cpp @@ -922,8 +922,9 @@ llvm::Error OpenCLFunction::execute(ExecutionContext *context) { continue; } - if (auto *BRA = dyn_cast<BatchedReduceAddInst>(&I)) { + if (auto *BRA = dyn_cast<OCLBatchedReduceAddInst>(&I)) { auto axis = BRA->getAxis(); + auto axisSrcSliceSize = BRA->getAxisSrcSliceSize(); // Determine and store the slice sizes of each input dimension excluding // the reduce axis into batchSliceSizes. Determine also the slice size on @@ -931,25 +932,7 @@ llvm::Error OpenCLFunction::execute(ExecutionContext *context) { // to index correctly into the input buffer. If the input has one // dimension (that is also the reduce axis), store one slice of size 1 // into batchSliceSizes. - auto batchDims = BRA->getBatch()->getType()->dims(); - auto numBatchDims = batchDims.size(); - std::vector<size_t> batchSliceSizes( - numBatchDims > 1 ? numBatchDims - 1 : 1, 1); - size_t currentSliceSize = 1, axisSliceSize = 1; - for (ssize_t i = batchDims.size() - 1, j = batchSliceSizes.size() - 1; - i >= 0; --i) { - // If i is the reduce axis, currentSliceSize is the slice size at the - // reduce axis. Store it in axisSliceSize and not in batchSliceSizes. If - // not, do the opposite. - if (i == axis) { - axisSliceSize = currentSliceSize; - } else { - batchSliceSizes[j--] = currentSliceSize; - } - - // Compute the slice size for the next iteration. - currentSliceSize *= batchDims[i]; - } + auto batchDims = BRA->getSrc()->getType()->dims(); // Determine and store the slice sizes of each output dimension excluding // the reduce axis into destSliceSizes. These are used by the kernel to @@ -960,64 +943,14 @@ llvm::Error OpenCLFunction::execute(ExecutionContext *context) { if (destDims.empty()) { destDimsVec.emplace_back(1); } - auto numDestDims = destDimsVec.size(); - std::vector<size_t> destSliceSizes(numDestDims > 0 ? 
numDestDims : 1, 1); - - // Start i at destDimsVec.size() - 2 because the last slice size is always - // known to be 1. - for (ssize_t i = destDimsVec.size() - 2; i >= 0; --i) { - // The slice size of the current dimension is the slice size of the - // previous dimension multiplied by the number of elements in that - // dimension. - destSliceSizes[i] = destSliceSizes[i + 1] * destDimsVec[i + 1]; - } - - // Allocate device buffers for batchSliceSizes and destSliceSizes. - size_t batchSlicesBufSize = batchSliceSizes.size() * sizeof(size_t); - size_t destSlicesBufSize = destSliceSizes.size() * sizeof(size_t); - cl_mem batchSlicesBuf = allocDeviceBuffer(batchSlicesBufSize); - cl_mem destSlicesBuf = allocDeviceBuffer(destSlicesBufSize); - - // Copy batchSliceSizes and destSliceSizes from host to device. - cl_event writeBatchSlicesEvent{nullptr}, writeDestSlicesEvent{nullptr}; - cl_int err = clEnqueueWriteBuffer( - commands_, batchSlicesBuf, /*blocking_write=*/CL_FALSE, /*offset=*/0, - batchSlicesBufSize, batchSliceSizes.data(), - /* num_events_in_wait_list */ 0, - /* event_list */ nullptr, - /* event */ kernelProfiling_ ? &writeBatchSlicesEvent : nullptr); - CHECK_EQ(err, CL_SUCCESS) << "Unable to copy BRA data to the device"; - if (kernelProfiling_) { - kernelLaunches_.emplace_back(KernelLaunch("batchedReduceAddSliceData", - "batchedReduceAddSliceData", - writeBatchSlicesEvent)); - } - - err = clEnqueueWriteBuffer( - commands_, destSlicesBuf, /*blocking_write=*/CL_FALSE, /*offset=*/0, - destSlicesBufSize, destSliceSizes.data(), - /* num_events_in_wait_list */ 0, - /* event_list */ nullptr, - /* event */ kernelProfiling_ ? &writeDestSlicesEvent : nullptr); - CHECK_EQ(err, CL_SUCCESS) << "Unable to copy BRA data to the device"; - if (kernelProfiling_) { - kernelLaunches_.emplace_back(KernelLaunch("batchedReduceAddSliceData", - "batchedReduceAddSliceData", - writeDestSlicesEvent)); - } - - // Wait for the writes to finish. 
- clFinish(commands_); // Create kernel and set arguments. cl_kernel kernel = createKernel(kernelName); setKernelArg(kernel, 0, deviceBuffer_); auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); - setKernelArg(kernel, numArgs + 1, batchSlicesBuf); - setKernelArg(kernel, numArgs + 2, destSlicesBuf); - setKernelArg(kernel, numArgs + 3, batchDims[axis]); - setKernelArg(kernel, numArgs + 4, axisSliceSize); + setKernelArg(kernel, numArgs + 1, batchDims[axis]); + setKernelArg(kernel, numArgs + 2, axisSrcSliceSize); // Parallelize on each element in the slice. enqueueKernel(I.getName(), commands_, kernel, deviceId_, destDimsVec, @@ -1962,6 +1895,7 @@ bool OCLBackend::isOpSupported(const NodeInfo &NI) const { case Kinded::Kind::SaveNodeKind: case Kinded::Kind::ReshapeNodeKind: + case Kinded::Kind::OCLBatchedReduceAddNodeKind: case Kinded::Kind::TraceEventNodeKind: // These work regardless of the underlying type. return true; diff --git a/lib/Backends/OpenCL/Transforms.cpp b/lib/Backends/OpenCL/Transforms.cpp index 733136bb1e..c0ba22f2db 100644 --- a/lib/Backends/OpenCL/Transforms.cpp +++ b/lib/Backends/OpenCL/Transforms.cpp @@ -33,12 +33,13 @@ bool OCLBackend::transformPostLowering(Function *F, LOG_SCOPE(F->getLogContext(), "OCLBackend::transformPostLowering") - if (cctx.compMode == CompilationMode::Train) - return false; - bool changed = false; for (auto &node : F->getNodes()) { if (auto *CN = dyn_cast<ConvolutionNode>(&node)) { + if (cctx.compMode == CompilationMode::Train) { + continue; + } + // TODO: OpenCL fast convolution kernel itself has some issue with group > // 1, which will be investigated later. So far, if the group > 1, we just // call the slow convolution kernel. 
@@ -50,17 +51,101 @@ bool OCLBackend::transformPostLowering(Function *F, continue; } if (auto *PMN = dyn_cast<MaxPoolNode>(&node)) { + if (cctx.compMode == CompilationMode::Train) { + continue; + } + auto *NR = convertPoolToNCHWPool<MaxPoolNode, OCLMaxPoolNode>(PMN, F); PMN->getResult().replaceAllUsesOfWith(NR); changed = true; continue; } if (auto *PAN = dyn_cast<AvgPoolNode>(&node)) { + if (cctx.compMode == CompilationMode::Train) { + continue; + } + auto *NR = convertPoolToNCHWPool<AvgPoolNode, OCLAvgPoolNode>(PAN, F); PAN->getResult().replaceAllUsesOfWith(NR); changed = true; continue; } + // The code below replaces a regular BatchedReduceAddNode with a + // semantically identical OCLBatchedReduceAddNode that has two additional + // inputs for the slice sizes of the input and output nodes. The OpenCL + // implementation of the batchedreduceadd instruction needs this information + // and storing it in graph Constants ensures that it will be copied to the + // device with the rest of the Function's Constants. Consequently, it does + // not need to be copied separately or at runtime (which would increase + // execution latency). + if (auto *BRA = dyn_cast<BatchedReduceAddNode>(&node)) { + auto axis = BRA->getAxis(); + + // Determine and store the slice sizes of each input dimension excluding + // the reduce axis into batchSliceSizes. Determine also the slice size on + // the reduce axis and store that separately. These are used by the kernel + // to index correctly into the input buffer. If the input has one + // dimension (that is also the reduce axis), store one slice of size 1 + // into batchSliceSizes. + auto batchDims = BRA->getBatch().getType()->dims(); + auto numBatchDims = batchDims.size(); + auto batchSliceSizesLen = numBatchDims > 1 ? 
numBatchDims - 1 : 1; + auto *batchSliceSizes = F->getParent()->createConstant( + ElemKind::Int32ITy, {batchSliceSizesLen}, "batchSliceSizes"); + auto batchSliceSizesH = + batchSliceSizes->getPayloadMutable().getHandle<int32_t>(); + batchSliceSizesH.clear(1); + + size_t currentSliceSize = 1, axisSliceSize = 1; + unsigned j = batchSliceSizesLen - 1; + for (ssize_t i = batchDims.size() - 1; i >= 0; --i) { + // If i is the reduce axis, currentSliceSize is the slice size at the + // reduce axis. Store it in axisSliceSize and not in batchSliceSizes. If + // not, do the opposite. + if (i == axis) { + axisSliceSize = currentSliceSize; + } else { + batchSliceSizesH.at({j--}) = currentSliceSize; + } + // Compute the slice size for the next iteration. + currentSliceSize *= batchDims[i]; + } + + // Determine and store the slice sizes of each output dimension excluding + // the reduce axis into destSliceSizes. These are used by the kernel to + // index correctly into the output buffer. If the output has zero + // dimensions store one slice of size 1 into destSliceSizes. + auto destDims = BRA->getResult().getType()->dims(); + std::vector<size_t> destDimsVec(destDims.begin(), destDims.end()); + if (destDims.empty()) { + destDimsVec.emplace_back(1); + } + auto numDestDims = destDimsVec.size(); + auto destSliceSizesLen = numDestDims > 0 ? numDestDims : 1; + auto *destSliceSizes = F->getParent()->createConstant( + ElemKind::Int32ITy, {destSliceSizesLen}, "destSliceSizes"); + auto destSliceSizesH = + destSliceSizes->getPayloadMutable().getHandle<int32_t>(); + destSliceSizesH.clear(1); + + // Start i at destDimsVec.size() - 2 because the last slice size is always + // known to be 1. + for (ssize_t i = destDimsVec.size() - 2; i >= 0; --i) { + // The slice size of the current dimension is the slice size of the + // previous dimension multiplied by the number of elements in that + // dimension. 
+ destSliceSizesH.at({static_cast<size_t>(i)}) = + destSliceSizesH.at({static_cast<size_t>(i + 1)}) * + destDimsVec[i + 1]; + } + + auto *OCLBRA = F->addNode(new OCLBatchedReduceAddNode( + BRA->getName(), BRA->getResult().getType(), BRA->getBatch(), + destSliceSizes, batchSliceSizes, axis, axisSliceSize)); + BRA->getResult().replaceAllUsesOfWith(OCLBRA); + changed = true; + continue; + } } return changed; } diff --git a/lib/Backends/OpenCL/kernels.cl b/lib/Backends/OpenCL/kernels.cl index af93833529..8649c79ce7 100644 --- a/lib/Backends/OpenCL/kernels.cl +++ b/lib/Backends/OpenCL/kernels.cl @@ -618,11 +618,11 @@ __kernel void elementcmplteW(__global void *mem, cl_uint32_t dest, elementcmplteK(&mem[dest], &mem[LHS], &mem[RHS]); } -__kernel void batchedreduceaddK(__global float *dest, __global float *batch, - __global cl_host_size_t *batchSliceSizes, - __global cl_host_size_t *destSliceSize, - cl_uint32_t numSlices, - cl_uint32_t axisSliceSize) { +__kernel void oclbatchedreduceaddK(__global float *dest, __global float *batch, + __global cl_int32_t *destSliceSizes, + __global cl_int32_t *batchSliceSizes, + cl_uint32_t numSlices, + cl_uint32_t axisSliceSize) { size_t workDim = get_work_dim(); // This is the component of the offset into batch that depends only on the @@ -648,7 +648,7 @@ __kernel void batchedreduceaddK(__global float *dest, __global float *batch, for (size_t i = 0; i < workDim; ++i) { size_t id = get_global_id(i); batchOffset += id * batchSliceSizes[i]; - destOffset += id * destSliceSize[i]; + destOffset += id * destSliceSizes[i]; } // Perform the actual reduce. 
Add the slice number * the slice size at the @@ -660,11 +660,11 @@ __kernel void batchedreduceaddK(__global float *dest, __global float *batch, } __kernel void -batchedreduceaddW(__global void *mem, cl_uint32_t dest, cl_uint32_t batch, - __global void *batchSliceSizes, __global void *destSliceSizes, - cl_uint32_t numSlices, cl_uint32_t axisSliceSize) { - batchedreduceaddK(&mem[dest], &mem[batch], batchSliceSizes, destSliceSizes, - numSlices, axisSliceSize); +oclbatchedreduceaddW(__global void *mem, cl_uint32_t dest, cl_uint32_t batch, + cl_uint32_t destSliceSizes, cl_uint32_t batchSliceSizes, + cl_uint32_t numSlices, cl_uint32_t axisSliceSize) { + oclbatchedreduceaddK(&mem[dest], &mem[batch], &mem[destSliceSizes], + &mem[batchSliceSizes], numSlices, axisSliceSize); } __kernel void batchedaddK(__global float *dest, __global float *batch, diff --git a/tools/ClassGen/Backends/OpenCL/OpenCLSpecificInstrs.h b/tools/ClassGen/Backends/OpenCL/OpenCLSpecificInstrs.h index 12fec9d1b5..76f0ad50c2 100644 --- a/tools/ClassGen/Backends/OpenCL/OpenCLSpecificInstrs.h +++ b/tools/ClassGen/Backends/OpenCL/OpenCLSpecificInstrs.h @@ -47,6 +47,16 @@ BB.newBackendSpecificInstr("OCLMaxPool") .autoIRGen() .autoVerify(VerifyKind::SameElementType, {"Dest", "Src"}); +BB.newBackendSpecificInstr("OCLBatchedReduceAdd") + .addOperand("Dest", OperandKind::Out) + .addOperand("Src", OperandKind::In) + .addOperand("DestSliceSizes", OperandKind::In) + .addOperand("SrcSliceSizes", OperandKind::In) + .addMember(MemberType::Unsigned, "Axis") + .addMember(MemberType::Unsigned, "AxisSrcSliceSize") + .autoVerify(VerifyKind::SameElementType, {"Dest", "Src"}) + .autoIRGen(); + BB.includeBackendSpecificVerification( "glow/OpenCLSpecificInstrsVerification.h"); diff --git a/tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodes.h b/tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodes.h index de3fe0c0d9..0683117803 100644 --- a/tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodes.h +++ 
b/tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodes.h @@ -51,6 +51,17 @@ BB.newNode("OCLMaxPool") "provided " "Kernel, Stride, and Pads. The input and output are in NCHW format"); +BB.newNode("OCLBatchedReduceAdd") + .addInput("Input") + .addInput("DestSliceSizes") + .addInput("SrcSliceSizes") + .addMember(MemberType::Unsigned, "Axis") + .addMember(MemberType::Unsigned, "AxisSrcSliceSize") + .addResultFromCtorArg() + .setDocstring( + "This is an OpenCL-specific BatchedReduceAdd operation which has the " + "slice sizes of the input and output as explicit inputs."); + BB.includeBackendSpecificVerification("glow/OpenCLSpecificNodesVerification.h"); #endif // GLOW_WITH_CPU diff --git a/tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodesVerification.h b/tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodesVerification.h index 44ff7a18eb..553d9b2435 100644 --- a/tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodesVerification.h +++ b/tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodesVerification.h @@ -30,4 +30,71 @@ bool OCLConvolutionNode::verify() const { bool OCLAvgPoolNode::verify() const { return true; } bool OCLMaxPoolNode::verify() const { return true; } + +bool OCLBatchedReduceAddNode::verify() const { + Constant *destSliceSizes = + llvm::dyn_cast<Constant>(getDestSliceSizes().getNode()); + Constant *srcSliceSizes = + llvm::dyn_cast<Constant>(getSrcSliceSizes().getNode()); + + // Both the destSliceSizes and srcSliceSizes should be Constants. + if (!destSliceSizes || !srcSliceSizes) { + return false; + } + + // Check that the values of destSliceSizes and srcSliceSizes still match the + // Types of the Input and Result. For more information, see + // OCLBackend::transformPostLowering. 
+ bool ok = true; + auto srcSliceSizesH = srcSliceSizes->getPayload().getHandle<int32_t>(); + auto srcDims = getInput().getType()->dims(); + + if (!srcDims.empty()) { + unsigned_t currentSliceSize = 1; + unsigned j = srcSliceSizesH.size() - 1; + for (ssize_t i = srcDims.size() - 1; i >= 0; --i) { + if (i == getAxis()) { + ok &= expectCompareTrue("axisSrcSlizeSize is incorrect", + getAxisSrcSliceSize(), currentSliceSize, this); + } else { + ok &= + expectCompareTrue("srcSliceSize is incorrect", + static_cast<unsigned_t>(srcSliceSizesH.at({j--})), + currentSliceSize, this); + } + currentSliceSize *= srcDims[i]; + } + } else { + ok &= expectCompareTrue("axisSrcSlizeSize is incorrect", + getAxisSrcSliceSize(), static_cast<unsigned_t>(1), + this); + ok &= + expectCompareTrue("srcSliceSizes has the wrong shape", + srcSliceSizesH.size(), static_cast<size_t>(1), this); + ok &= expectCompareTrue("srcSliceSizes is incorrect", + srcSliceSizesH.at({0}), 1, this); + } + + auto destDims = getResult().getType()->dims(); + std::vector<size_t> destDimsVec(destDims.begin(), destDims.end()); + if (destDims.empty()) { + destDimsVec.emplace_back(1); + } + auto destSliceSizesH = destSliceSizes->getPayload().getHandle<int32_t>(); + + ok &= + expectCompareTrue("destSliceSizes is incorrect", + destSliceSizesH.at({destDimsVec.size() - 1}), 1, this); + + for (ssize_t i = destDimsVec.size() - 2; i >= 0; --i) { + ok &= expectCompareTrue("destSliceSizes is incorrect", + destSliceSizesH.at({static_cast<size_t>(i)}), + destSliceSizesH.at({static_cast<size_t>(i + 1)}) * + destDimsVec[i + 1], + this); + } + + return ok; +} + #endif // GLOW_WITH_OPENCL