Skip to content

[OpenCL] Optimize BatchedReduceAdd implementation #3190

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 6 additions & 72 deletions lib/Backends/OpenCL/OpenCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -922,34 +922,17 @@ llvm::Error OpenCLFunction::execute(ExecutionContext *context) {
continue;
}

if (auto *BRA = dyn_cast<BatchedReduceAddInst>(&I)) {
if (auto *BRA = dyn_cast<OCLBatchedReduceAddInst>(&I)) {
auto axis = BRA->getAxis();
auto axisSrcSliceSize = BRA->getAxisSrcSliceSize();

// Determine and store the slice sizes of each input dimension excluding
// the reduce axis into batchSliceSizes. Determine also the slice size on
// the reduce axis and store that separately. These are used by the kernel
// to index correctly into the input buffer. If the input has one
// dimension (that is also the reduce axis), store one slice of size 1
// into batchSliceSizes.
auto batchDims = BRA->getBatch()->getType()->dims();
auto numBatchDims = batchDims.size();
std::vector<size_t> batchSliceSizes(
numBatchDims > 1 ? numBatchDims - 1 : 1, 1);
size_t currentSliceSize = 1, axisSliceSize = 1;
for (ssize_t i = batchDims.size() - 1, j = batchSliceSizes.size() - 1;
i >= 0; --i) {
// If i is the reduce axis, currentSliceSize is the slice size at the
// reduce axis. Store it in axisSliceSize and not in batchSliceSizes. If
// not, do the opposite.
if (i == axis) {
axisSliceSize = currentSliceSize;
} else {
batchSliceSizes[j--] = currentSliceSize;
}

// Compute the slice size for the next iteration.
currentSliceSize *= batchDims[i];
}
auto batchDims = BRA->getSrc()->getType()->dims();

// Determine and store the slice sizes of each output dimension excluding
// the reduce axis into destSliceSizes. These are used by the kernel to
Expand All @@ -960,64 +943,14 @@ llvm::Error OpenCLFunction::execute(ExecutionContext *context) {
if (destDims.empty()) {
destDimsVec.emplace_back(1);
}
auto numDestDims = destDimsVec.size();
std::vector<size_t> destSliceSizes(numDestDims > 0 ? numDestDims : 1, 1);

// Start i at destDimsVec.size() - 2 because the last slice size is always
// known to be 1.
for (ssize_t i = destDimsVec.size() - 2; i >= 0; --i) {
// The slice size of the current dimension is the slice size of the
// previous dimension multiplied by the number of elements in that
// dimension.
destSliceSizes[i] = destSliceSizes[i + 1] * destDimsVec[i + 1];
}

// Allocate device buffers for batchSliceSizes and destSliceSizes.
size_t batchSlicesBufSize = batchSliceSizes.size() * sizeof(size_t);
size_t destSlicesBufSize = destSliceSizes.size() * sizeof(size_t);
cl_mem batchSlicesBuf = allocDeviceBuffer(batchSlicesBufSize);
cl_mem destSlicesBuf = allocDeviceBuffer(destSlicesBufSize);

// Copy batchSliceSizes and destSliceSizes from host to device.
cl_event writeBatchSlicesEvent{nullptr}, writeDestSlicesEvent{nullptr};
cl_int err = clEnqueueWriteBuffer(
commands_, batchSlicesBuf, /*blocking_write=*/CL_FALSE, /*offset=*/0,
batchSlicesBufSize, batchSliceSizes.data(),
/* num_events_in_wait_list */ 0,
/* event_list */ nullptr,
/* event */ kernelProfiling_ ? &writeBatchSlicesEvent : nullptr);
CHECK_EQ(err, CL_SUCCESS) << "Unable to copy BRA data to the device";
if (kernelProfiling_) {
kernelLaunches_.emplace_back(KernelLaunch("batchedReduceAddSliceData",
"batchedReduceAddSliceData",
writeBatchSlicesEvent));
}

err = clEnqueueWriteBuffer(
commands_, destSlicesBuf, /*blocking_write=*/CL_FALSE, /*offset=*/0,
destSlicesBufSize, destSliceSizes.data(),
/* num_events_in_wait_list */ 0,
/* event_list */ nullptr,
/* event */ kernelProfiling_ ? &writeDestSlicesEvent : nullptr);
CHECK_EQ(err, CL_SUCCESS) << "Unable to copy BRA data to the device";
if (kernelProfiling_) {
kernelLaunches_.emplace_back(KernelLaunch("batchedReduceAddSliceData",
"batchedReduceAddSliceData",
writeDestSlicesEvent));
}

// Wait for the writes to finish.
clFinish(commands_);

// Create kernel and set arguments.
cl_kernel kernel = createKernel(kernelName);
setKernelArg(kernel, 0, deviceBuffer_);
auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_);

setKernelArg(kernel, numArgs + 1, batchSlicesBuf);
setKernelArg(kernel, numArgs + 2, destSlicesBuf);
setKernelArg<cl_uint>(kernel, numArgs + 3, batchDims[axis]);
setKernelArg<cl_uint>(kernel, numArgs + 4, axisSliceSize);
setKernelArg<cl_uint>(kernel, numArgs + 1, batchDims[axis]);
setKernelArg<cl_uint>(kernel, numArgs + 2, axisSrcSliceSize);

// Parallelize on each element in the slice.
enqueueKernel(I.getName(), commands_, kernel, deviceId_, destDimsVec,
Expand Down Expand Up @@ -1962,6 +1895,7 @@ bool OCLBackend::isOpSupported(const NodeInfo &NI) const {

case Kinded::Kind::SaveNodeKind:
case Kinded::Kind::ReshapeNodeKind:
case Kinded::Kind::OCLBatchedReduceAddNodeKind:
case Kinded::Kind::TraceEventNodeKind:
// These work regardless of the underlying type.
return true;
Expand Down
91 changes: 88 additions & 3 deletions lib/Backends/OpenCL/Transforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,13 @@ bool OCLBackend::transformPostLowering(Function *F,

LOG_SCOPE(F->getLogContext(), "OCLBackend::transformPostLowering")

if (cctx.compMode == CompilationMode::Train)
return false;

bool changed = false;
for (auto &node : F->getNodes()) {
if (auto *CN = dyn_cast<ConvolutionNode>(&node)) {
if (cctx.compMode == CompilationMode::Train) {
continue;
}

// TODO: OpenCL fast convolution kernel itself has some issue with group >
// 1, which will be investigated later. So far, if the group > 1, we just
// call the slow convolution kernel.
Expand All @@ -50,17 +51,101 @@ bool OCLBackend::transformPostLowering(Function *F,
continue;
}
if (auto *PMN = dyn_cast<MaxPoolNode>(&node)) {
if (cctx.compMode == CompilationMode::Train) {
continue;
}

auto *NR = convertPoolToNCHWPool<MaxPoolNode, OCLMaxPoolNode>(PMN, F);
PMN->getResult().replaceAllUsesOfWith(NR);
changed = true;
continue;
}
if (auto *PAN = dyn_cast<AvgPoolNode>(&node)) {
if (cctx.compMode == CompilationMode::Train) {
continue;
}

auto *NR = convertPoolToNCHWPool<AvgPoolNode, OCLAvgPoolNode>(PAN, F);
PAN->getResult().replaceAllUsesOfWith(NR);
changed = true;
continue;
}
// The code below replaces a regular BatchedReduceAddNode with a
// semantically identical OCLBatchedReduceAddNode that has two additional
// inputs for the slice sizes of the input and output nodes. The OpenCL
// implementation of the batchedreduceadd instruction needs this information
// and storing it in graph Constants ensures that it will be copied to the
// device with the rest of the Function's Constants. Consequently, it does
// not need to be copied separately or at runtime (which would increase
// execution latency).
if (auto *BRA = dyn_cast<BatchedReduceAddNode>(&node)) {
auto axis = BRA->getAxis();

// Determine and store the slice sizes of each input dimension excluding
// the reduce axis into batchSliceSizes. Determine also the slice size on
// the reduce axis and store that separately. These are used by the kernel
// to index correctly into the input buffer. If the input has one
// dimension (that is also the reduce axis), store one slice of size 1
// into batchSliceSizes.
auto batchDims = BRA->getBatch().getType()->dims();
auto numBatchDims = batchDims.size();
auto batchSliceSizesLen = numBatchDims > 1 ? numBatchDims - 1 : 1;
auto *batchSliceSizes = F->getParent()->createConstant(
ElemKind::Int32ITy, {batchSliceSizesLen}, "batchSliceSizes");
auto batchSliceSizesH =
batchSliceSizes->getPayloadMutable().getHandle<int32_t>();
batchSliceSizesH.clear(1);

size_t currentSliceSize = 1, axisSliceSize = 1;
unsigned j = batchSliceSizesLen - 1;
for (ssize_t i = batchDims.size() - 1; i >= 0; --i) {
// If i is the reduce axis, currentSliceSize is the slice size at the
// reduce axis. Store it in axisSliceSize and not in batchSliceSizes. If
// not, do the opposite.
if (i == axis) {
axisSliceSize = currentSliceSize;
} else {
batchSliceSizesH.at({j--}) = currentSliceSize;
}
// Compute the slice size for the next iteration.
currentSliceSize *= batchDims[i];
}

// Determine and store the slice sizes of each output dimension excluding
// the reduce axis into destSliceSizes. These are used by the kernel to
// index correctly into the output buffer. If the output has zero
// dimensions store one slice of size 1 into destSliceSizes.
auto destDims = BRA->getResult().getType()->dims();
std::vector<size_t> destDimsVec(destDims.begin(), destDims.end());
if (destDims.empty()) {
destDimsVec.emplace_back(1);
}
auto numDestDims = destDimsVec.size();
auto destSliceSizesLen = numDestDims > 0 ? numDestDims : 1;
auto *destSliceSizes = F->getParent()->createConstant(
ElemKind::Int32ITy, {destSliceSizesLen}, "destSliceSizes");
auto destSliceSizesH =
destSliceSizes->getPayloadMutable().getHandle<int32_t>();
destSliceSizesH.clear(1);

// Start i at destDimsVec.size() - 2 because the last slice size is always
// known to be 1.
for (ssize_t i = destDimsVec.size() - 2; i >= 0; --i) {
// The slice size of the current dimension is the slice size of the
// previous dimension multiplied by the number of elements in that
// dimension.
destSliceSizesH.at({static_cast<unsigned>(i)}) =
destSliceSizesH.at({static_cast<unsigned>(i + 1)}) *
destDimsVec[i + 1];
}

auto *OCLBRA = F->addNode(new OCLBatchedReduceAddNode(
BRA->getName(), BRA->getResult().getType(), BRA->getBatch(),
destSliceSizes, batchSliceSizes, axis, axisSliceSize));
BRA->getResult().replaceAllUsesOfWith(OCLBRA);
changed = true;
continue;
}
}
return changed;
}
22 changes: 11 additions & 11 deletions lib/Backends/OpenCL/kernels.cl
Original file line number Diff line number Diff line change
Expand Up @@ -618,11 +618,11 @@ __kernel void elementcmplteW(__global void *mem, cl_uint32_t dest,
elementcmplteK(&mem[dest], &mem[LHS], &mem[RHS]);
}

__kernel void batchedreduceaddK(__global float *dest, __global float *batch,
__global cl_host_size_t *batchSliceSizes,
__global cl_host_size_t *destSliceSize,
cl_uint32_t numSlices,
cl_uint32_t axisSliceSize) {
__kernel void oclbatchedreduceaddK(__global float *dest, __global float *batch,
__global cl_int32_t *destSliceSizes,
__global cl_int32_t *batchSliceSizes,
cl_uint32_t numSlices,
cl_uint32_t axisSliceSize) {
size_t workDim = get_work_dim();

// This is the component of the offset into batch that depends only on the
Expand All @@ -648,7 +648,7 @@ __kernel void batchedreduceaddK(__global float *dest, __global float *batch,
for (size_t i = 0; i < workDim; ++i) {
size_t id = get_global_id(i);
batchOffset += id * batchSliceSizes[i];
destOffset += id * destSliceSize[i];
destOffset += id * destSliceSizes[i];
}

// Perform the actual reduce. Add the slice number * the slice size at the
Expand All @@ -660,11 +660,11 @@ __kernel void batchedreduceaddK(__global float *dest, __global float *batch,
}

__kernel void
batchedreduceaddW(__global void *mem, cl_uint32_t dest, cl_uint32_t batch,
__global void *batchSliceSizes, __global void *destSliceSizes,
cl_uint32_t numSlices, cl_uint32_t axisSliceSize) {
batchedreduceaddK(&mem[dest], &mem[batch], batchSliceSizes, destSliceSizes,
numSlices, axisSliceSize);
oclbatchedreduceaddW(__global void *mem, cl_uint32_t dest, cl_uint32_t batch,
cl_uint32_t destSliceSizes, cl_uint32_t batchSliceSizes,
cl_uint32_t numSlices, cl_uint32_t axisSliceSize) {
oclbatchedreduceaddK(&mem[dest], &mem[batch], &mem[destSliceSizes],
&mem[batchSliceSizes], numSlices, axisSliceSize);
}

__kernel void batchedaddK(__global float *dest, __global float *batch,
Expand Down
10 changes: 10 additions & 0 deletions tools/ClassGen/Backends/OpenCL/OpenCLSpecificInstrs.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,16 @@ BB.newBackendSpecificInstr("OCLMaxPool")
.autoIRGen()
.autoVerify(VerifyKind::SameElementType, {"Dest", "Src"});

// OpenCL-specific BatchedReduceAdd instruction. Besides the usual Dest/Src
// operands it carries two extra inputs holding precomputed slice sizes of the
// output (DestSliceSizes) and input (SrcSliceSizes), plus the reduce axis and
// the slice size at that axis, so the OpenCL kernel can index both buffers
// without computing strides at runtime. NOTE: operand order here must stay in
// sync with the OCLBatchedReduceAdd node definition for autoIRGen to work.
BB.newBackendSpecificInstr("OCLBatchedReduceAdd")
.addOperand("Dest", OperandKind::Out)
.addOperand("Src", OperandKind::In)
.addOperand("DestSliceSizes", OperandKind::In)
.addOperand("SrcSliceSizes", OperandKind::In)
.addMember(MemberType::Unsigned, "Axis")
.addMember(MemberType::Unsigned, "AxisSrcSliceSize")
.autoVerify(VerifyKind::SameElementType, {"Dest", "Src"})
.autoIRGen();

BB.includeBackendSpecificVerification(
"glow/OpenCLSpecificInstrsVerification.h");

Expand Down
11 changes: 11 additions & 0 deletions tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodes.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,17 @@ BB.newNode("OCLMaxPool")
"provided "
"Kernel, Stride, and Pads. The input and output are in NCHW format");

// Graph node counterpart of the OCLBatchedReduceAdd instruction. The
// DestSliceSizes/SrcSliceSizes inputs are expected to be Constants created by
// OCLBackend::transformPostLowering (see OCLBatchedReduceAddNode::verify),
// which lets them travel to the device with the Function's other Constants
// instead of being copied at execution time.
BB.newNode("OCLBatchedReduceAdd")
.addInput("Input")
.addInput("DestSliceSizes")
.addInput("SrcSliceSizes")
.addMember(MemberType::Unsigned, "Axis")
.addMember(MemberType::Unsigned, "AxisSrcSliceSize")
.addResultFromCtorArg()
.setDocstring(
"This is an OpenCL-specific BatchedReduceAdd operation which has the "
"slice sizes of the input and output as explicit inputs.")

BB.includeBackendSpecificVerification("glow/OpenCLSpecificNodesVerification.h");

#endif // GLOW_WITH_CPU
67 changes: 67 additions & 0 deletions tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodesVerification.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,71 @@ bool OCLConvolutionNode::verify() const {
// OCLAvgPool performs no node-specific verification; always succeeds.
bool OCLAvgPoolNode::verify() const { return true; }

// OCLMaxPool performs no node-specific verification; always succeeds.
bool OCLMaxPoolNode::verify() const { return true; }

bool OCLBatchedReduceAddNode::verify() const {
Constant *destSliceSizes =
llvm::dyn_cast<Constant>(getDestSliceSizes().getNode());
Constant *srcSliceSizes =
llvm::dyn_cast<Constant>(getSrcSliceSizes().getNode());

// Both the destSliceSizes and srcSliceSizes should be Constants.
if (!destSliceSizes || !srcSliceSizes) {
return false;
}

// Check that the values of destSliceSizes and srcSliceSizes still match the
// Types of the Input and Result. For more information, see
// OCLBackend::transformPostLowering.
bool ok = true;
auto srcSliceSizesH = srcSliceSizes->getPayload().getHandle<int32_t>();
auto srcDims = getInput().getType()->dims();

if (!srcDims.empty()) {
unsigned_t currentSliceSize = 1;
unsigned j = srcSliceSizesH.size() - 1;
for (ssize_t i = srcDims.size() - 1; i >= 0; --i) {
if (i == getAxis()) {
ok &= expectCompareTrue("axisSrcSlizeSize is incorrect",
getAxisSrcSliceSize(), currentSliceSize, this);
} else {
ok &=
expectCompareTrue("srcSliceSize is incorrect",
static_cast<unsigned_t>(srcSliceSizesH.at({j--})),
currentSliceSize, this);
}
currentSliceSize *= srcDims[i];
}
} else {
ok &= expectCompareTrue("axisSrcSlizeSize is incorrect",
getAxisSrcSliceSize(), static_cast<unsigned_t>(1),
this);
ok &=
expectCompareTrue("srcSliceSizes has the wrong shape",
srcSliceSizesH.size(), static_cast<size_t>(1), this);
ok &= expectCompareTrue("srcSliceSizes is incorrect",
srcSliceSizesH.at({0}), 1, this);
}

auto destDims = getResult().getType()->dims();
std::vector<int32_t> destDimsVec(destDims.begin(), destDims.end());
if (destDims.empty()) {
destDimsVec.emplace_back(1);
}
auto destSliceSizesH = destSliceSizes->getPayload().getHandle<int32_t>();

ok &=
expectCompareTrue("destSliceSizes is incorrect",
destSliceSizesH.at({destDimsVec.size() - 1}), 1, this);

for (ssize_t i = destDimsVec.size() - 2; i >= 0; --i) {
ok &= expectCompareTrue("destSliceSizes is incorrect",
destSliceSizesH.at({static_cast<unsigned>(i)}),
destSliceSizesH.at({static_cast<unsigned>(i + 1)}) *
destDimsVec[i + 1],
this);
}

return ok;
}

#endif // GLOW_WITH_OPENCL