Commit 086fd84

Meghan Lele authored and facebook-github-bot committed
Implement BatchedReduceAdd for arbitrary axes (#2958)
Summary:

**Description**

This commit extends the implementation of `BatchedReduceAdd` in the OpenCL backend so that it can handle any reduction axis, not just axis 0. This is useful when the first dimension is the batch dimension and a reduction needs to be performed within each example.

The existing implementation for axis = 0 computes each slice element in parallel and linearizes the slice for simplicity (i.e. it creates a 1D global workspace as large as the number of elements in the output). This implementation generalizes that concept by creating a global workspace with rank equal to the number of dimensions of the output and computing each output element in parallel. The slice sizes of the input and output shapes are precomputed on the host and passed in as kernel arguments so that the kernel can compute the correct offsets into the input and output buffers by multiplying its set of global IDs with those slice sizes.

**Test Plan**

This commit enables the existing non-zero-axis `BatchedReduceAdd` unit test for OpenCL and modifies it to also test `axis=2`. All unit tests pass.

Pull Request resolved: #2958

Differential Revision: D15462357

Pulled By: SplitInfinity

fbshipit-source-id: c1cb526ad12fbb000c01215d531cd5dd6a0c0929
1 parent 1abe4e5 · commit 086fd84
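As a concrete illustration of the scheme the summary describes, the following minimal C++ sketch (illustrative only, not code from this commit; all names besides the concepts they mirror are made up) reproduces the host-side slice-size precomputation and the kernel's offset arithmetic for the worked example used in kernels.cl: an input of shape {3, 4, 5} reduced over axis 1 into an output of shape {3, 5}.

#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const std::vector<size_t> batchDims = {3, 4, 5};
  const size_t axis = 1;

  // Host side: strides of every input dimension except the reduce axis
  // (batchSliceSizes), plus the stride at the reduce axis (axisSliceSize).
  std::vector<size_t> batchSliceSizes(batchDims.size() - 1, 1);
  size_t currentSliceSize = 1, axisSliceSize = 1;
  for (ptrdiff_t i = batchDims.size() - 1, j = batchSliceSizes.size() - 1;
       i >= 0; --i) {
    if (static_cast<size_t>(i) == axis) {
      axisSliceSize = currentSliceSize;
    } else {
      batchSliceSizes[j--] = currentSliceSize;
    }
    currentSliceSize *= batchDims[i];
  }
  assert(batchSliceSizes[0] == 20 && batchSliceSizes[1] == 1);
  assert(axisSliceSize == 5);

  // Host side: strides of the output shape {3, 5}.
  const std::vector<size_t> destDims = {3, 5};
  std::vector<size_t> destSliceSizes(destDims.size(), 1);
  for (size_t i = 2, e = destDims.size(); i <= e; ++i) {
    destSliceSizes[e - i] = destSliceSizes[e - i + 1] * destDims[e - i + 1];
  }
  assert(destSliceSizes[0] == 5 && destSliceSizes[1] == 1);

  // "Device" side: each iteration of the outer two loops stands in for one
  // OpenCL work-item with global IDs {i, j}; there is one per output element.
  std::vector<float> batch(3 * 4 * 5);
  for (size_t k = 0; k < batch.size(); ++k) {
    batch[k] = static_cast<float>(k);
  }
  std::vector<float> dest(3 * 5, 0.0f);
  for (size_t i = 0; i < destDims[0]; ++i) {
    for (size_t j = 0; j < destDims[1]; ++j) {
      size_t batchOffset = i * batchSliceSizes[0] + j * batchSliceSizes[1];
      size_t destOffset = i * destSliceSizes[0] + j * destSliceSizes[1];
      for (size_t n = 0; n < batchDims[axis]; ++n) {
        dest[destOffset] += batch[n * axisSliceSize + batchOffset];
      }
    }
  }

  // dest[0] sums the input elements {0, 0..3, 0} = 0 + 5 + 10 + 15.
  assert(dest[0] == 30.0f);
  std::cout << "dest[0] = " << dest[0] << "\n";
  return 0;
}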

3 files changed (+141, −22 lines)

lib/Backends/OpenCL/OpenCL.cpp

Lines changed: 79 additions & 5 deletions
@@ -913,18 +913,92 @@ llvm::Error OpenCLFunction::execute(ExecutionContext *context) {
     }

     if (auto *BRA = dyn_cast<BatchedReduceAddInst>(&I)) {
-      assert(BRA->getAxis() == 0 && "No current support for non-zero axis.");
+      auto axis = BRA->getAxis();
+
+      // Determine and store the slice sizes of each input dimension excluding
+      // the reduce axis into batchSliceSizes. These are used by the kernel to
+      // index correctly into the input buffer. If the input has one dimension
+      // (that is also the reduce axis), store one slice of size 1 into
+      // batchSliceSizes.
+      auto batchDims = BRA->getBatch()->getType()->dims();
+      auto numBatchDims = batchDims.size();
+      std::vector<size_t> batchSliceSizes(
+          numBatchDims > 1 ? numBatchDims - 1 : 1, 1);
+      size_t currentSliceSize = 1, axisSliceSize = 1;
+      for (ssize_t i = numBatchDims - 1, j = batchSliceSizes.size() - 1;
+           i >= 0; --i) {
+        if (i == axis) {
+          axisSliceSize = currentSliceSize;
+        } else {
+          batchSliceSizes[j--] = currentSliceSize;
+        }
+
+        currentSliceSize *= batchDims[i];
+      }
+
+      // Determine and store the slice sizes of each output dimension excluding
+      // the reduce axis into destSliceSizes. These are used by the kernel to
+      // index correctly into the output buffer. If the output has zero
+      // dimensions, store one slice of size 1 into destSliceSizes.
+      auto destDims = BRA->getDest()->getType()->dims();
+      std::vector<size_t> destDimsVec(destDims.begin(), destDims.end());
+      if (destDims.empty()) {
+        destDimsVec.emplace_back(1);
+      }
+      auto numDestDims = destDimsVec.size();
+      std::vector<size_t> destSliceSizes(numDestDims > 0 ? numDestDims : 1, 1);
+      for (size_t i = 2, e = destDimsVec.size(); i <= e; ++i) {
+        destSliceSizes[e - i] =
+            destSliceSizes[e - i + 1] * destDimsVec[e - i + 1];
+      }
+
+      // Allocate device buffers for batchSliceSizes and destSliceSizes.
+      size_t batchSlicesBufSize = batchSliceSizes.size() * sizeof(size_t);
+      size_t destSlicesBufSize = destSliceSizes.size() * sizeof(size_t);
+      cl_mem batchSlicesBuf = allocDeviceBuffer(batchSlicesBufSize);
+      cl_mem destSlicesBuf = allocDeviceBuffer(destSlicesBufSize);
+
+      // Copy batchSliceSizes and destSliceSizes from host to device.
+      cl_event writeBatchSlicesEvent{nullptr}, writeDestSlicesEvent{nullptr};
+      cl_int err = clEnqueueWriteBuffer(
+          commands_, batchSlicesBuf, /*blocking_write=*/CL_FALSE, /*offset=*/0,
+          batchSlicesBufSize, batchSliceSizes.data(),
+          /* num_events_in_wait_list */ 0,
+          /* event_list */ nullptr,
+          /* event */ kernelProfiling_ ? &writeBatchSlicesEvent : nullptr);
+      GLOW_ASSERT(err == CL_SUCCESS && "Unable to copy BRA data to the device");
+      if (kernelProfiling_) {
+        kernelLaunches_.emplace_back(KernelLaunch("batchedReduceAddSliceData",
+                                                  "batchedReduceAddSliceData",
+                                                  writeBatchSlicesEvent));
+      }

+      err = clEnqueueWriteBuffer(
+          commands_, destSlicesBuf, /*blocking_write=*/CL_FALSE, /*offset=*/0,
+          destSlicesBufSize, destSliceSizes.data(),
+          /* num_events_in_wait_list */ 0,
+          /* event_list */ nullptr,
+          /* event */ kernelProfiling_ ? &writeDestSlicesEvent : nullptr);
+      GLOW_ASSERT(err == CL_SUCCESS && "Unable to copy BRA data to the device");
+      if (kernelProfiling_) {
+        kernelLaunches_.emplace_back(KernelLaunch("batchedReduceAddSliceData",
+                                                  "batchedReduceAddSliceData",
+                                                  writeDestSlicesEvent));
+      }
+
+      // Wait for the writes to finish.
+      clFinish(commands_);
+
+      // Create kernel and set arguments.
       cl_kernel kernel = createKernel(kernelName);
       setKernelArg(kernel, 0, deviceBuffer_);
       auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_);

-      auto bdim = flattenCdr(BRA->getBatch()->dims());
-      setKernelArg<cl_uint>(kernel, numArgs + 1, bdim.first);
-      setKernelArg<cl_uint>(kernel, numArgs + 2, bdim.second);
+      setKernelArg(kernel, numArgs + 1, batchSlicesBuf);
+      setKernelArg(kernel, numArgs + 2, destSlicesBuf);
+      setKernelArg<cl_uint>(kernel, numArgs + 3, batchDims[axis]);
+      setKernelArg<cl_uint>(kernel, numArgs + 4, axisSliceSize);

       // Parallelize on each element in the slice.
-      enqueueKernel(I.getName(), commands_, kernel, deviceId_, {bdim.second},
+      enqueueKernel(I.getName(), commands_, kernel, deviceId_, destDimsVec,
                     kernelLaunches_);
       continue;
     }
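Two details are worth noting in the host code above. First, both clEnqueueWriteBuffer calls are non-blocking (CL_FALSE), and OpenCL may read the source host memory at any point until the copy completes, so the clFinish is what makes it safe for the stack-allocated batchSliceSizes and destSliceSizes vectors to go out of scope afterwards. Second, the kernel arguments line up one-to-one with the new batchedreduceaddW signature in kernels.cl below: argument 0 is mem, the arguments set by setKernelArgsForBuffers are the dest and batch offsets into the device buffer, and numArgs + 1 through numArgs + 4 are batchSliceSizes, destSliceSizes, numSlices (passed as batchDims[axis]), and axisSliceSize.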

lib/Backends/OpenCL/kernels.cl

Lines changed: 43 additions & 9 deletions
@@ -619,18 +619,52 @@ __kernel void elementcmplteW(__global void *mem, cl_uint32_t dest,
 }

 __kernel void batchedreduceaddK(__global float *dest, __global float *batch,
-                                cl_uint32_t numSlice, cl_uint32_t sliceSize) {
-  size_t s = get_global_id(0);
-  dest[s] = 0;
-  for (size_t n = 0; n < numSlice; n++) {
-    dest[s] += batch[n * sliceSize + s];
+                                __global cl_host_size_t *batchSliceSizes,
+                                __global cl_host_size_t *destSliceSize,
+                                cl_uint32_t numSlices,
+                                cl_uint32_t axisSliceSize) {
+  size_t workDim = get_work_dim();
+
+  // This is the component of the offset into batch that depends only on the
+  // kernel's global IDs.
+  size_t batchOffset = 0;
+
+  // This is the offset into dest. It depends only on the kernel's global IDs.
+  size_t destOffset = 0;
+
+  // Compute batchOffset and destOffset by multiplying the kernel's global IDs
+  // with the corresponding batch and dest slice sizes.
+  //
+  // For example, suppose the input shape is {3, 4, 5} and the reduce axis is 1.
+  // Then, the output shape is {3, 5}. In this case, batchSliceSizes is {4 * 5 =
+  // 20, 1} (axis 1 is missing) and destSliceSizes is {5, 1}. The global
+  // workspace this kernel was launched with has dimensions {3, 5} (one for each
+  // output element). A kernel with IDs {i, j} will add together elements
+  // {i, 0..4, j} and store the result in element {i, j}, so (i * 20 + j * 1)
+  // will be a component of every offset it uses to access batch, and (i * 5 + j
+  // * 1) will be the offset it uses to access dest. This is precisely what
+  // batchOffset and destOffset are. The loop below precomputes these offsets
+  // before the actual reduce.
+  for (size_t i = 0; i < workDim; ++i) {
+    size_t id = get_global_id(i);
+    batchOffset += id * batchSliceSizes[i];
+    destOffset += id * destSliceSize[i];
+  }
+
+  // Perform the actual reduce. Add the slice number * the slice size at the
+  // axis index to batchOffset to get the elements to add together.
+  dest[destOffset] = 0;
+  for (size_t n = 0; n < numSlices; n++) {
+    dest[destOffset] += batch[n * axisSliceSize + batchOffset];
   }
 }

-__kernel void batchedreduceaddW(__global void *mem, cl_uint32_t dest,
-                                cl_uint32_t batch, cl_uint32_t numSlice,
-                                cl_uint32_t sliceSize) {
-  batchedreduceaddK(&mem[dest], &mem[batch], numSlice, sliceSize);
+__kernel void
+batchedreduceaddW(__global void *mem, cl_uint32_t dest, cl_uint32_t batch,
+                  __global void *batchSliceSizes, __global void *destSliceSizes,
+                  cl_uint32_t numSlices, cl_uint32_t axisSliceSize) {
+  batchedreduceaddK(&mem[dest], &mem[batch], batchSliceSizes, destSliceSizes,
+                    numSlices, axisSliceSize);
 }

 __kernel void batchedaddK(__global float *dest, __global float *batch,
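To trace the kernel on the example from its own comment (input {3, 4, 5}, reduce axis 1, batchSliceSizes = {20, 1}, destSliceSizes = {5, 1}, numSlices = 4, axisSliceSize = 5): the work-item with global IDs {1, 2} computes batchOffset = 1 * 20 + 2 * 1 = 22 and destOffset = 1 * 5 + 2 * 1 = 7, then sums batch[22], batch[27], batch[32], and batch[37] (exactly the linearized input elements {1, 0..3, 2}) into dest[7], the linearized element {1, 2} of the output.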

tests/unittests/OperatorTest.cpp

Lines changed: 19 additions & 8 deletions
@@ -794,22 +794,33 @@ static void testBatchedReduceAddWithAxis(glow::PlaceholderBindings &bindings,
   bindings.allocate(batch)->getHandle<DataType>() = {0, 1, 2, 3, 4, 5,
                                                      6, 7, 8, 9, 10, 11};

-  auto OT = uniqueTypeConditionallyQuantized(mod, DTy, {2, 2});
-  auto *R = F->createBatchedReduceAdd("reduce.add", OT, batch, /* axis */ 1);
-  auto *save = F->createSave("save", R);
-  auto *result = bindings.allocate(save->getPlaceholder());
+  auto OT1 = uniqueTypeConditionallyQuantized(mod, DTy, {2, 2});
+  auto *R1 =
+      F->createBatchedReduceAdd("reduce.add.axis.1", OT1, batch, /* axis */ 1);
+  auto OT2 = uniqueTypeConditionallyQuantized(mod, DTy, {2, 3});
+  auto *R2 =
+      F->createBatchedReduceAdd("reduce.add.axis.2", OT2, batch, /* axis */ 2);
+  auto *save1 = F->createSave("save1", R1);
+  auto *save2 = F->createSave("save2", R2);
+
+  auto *result1 = bindings.allocate(save1->getPlaceholder());
+  auto *result2 = bindings.allocate(save2->getPlaceholder());

   EE.compile(CompilationMode::Infer, F);
   EE.run(bindings);

-  auto expected = createTensorConditionallyQuantized(DTy, {2, 2});
-  expected.getHandle<DataType>() = {6, 9, 24, 27};
-  EXPECT_TRUE(result->isEqual(expected));
+  auto expected1 = createTensorConditionallyQuantized(DTy, {2, 2});
+  expected1.getHandle<DataType>() = {6, 9, 24, 27};
+  EXPECT_TRUE(result1->isEqual(expected1));
+
+  auto expected2 = createTensorConditionallyQuantized(DTy, {2, 3});
+  expected2.getHandle<DataType>() = {1, 5, 9, 13, 17, 21};
+  EXPECT_TRUE(result2->isEqual(expected2));
 }

 /// Test that batchedReduceAddWithAxis is correctly supported in FloatTy.
 TEST_P(OperatorTest, batchedReduceAddWithAxis_Float) {
-  ENABLED_BACKENDS(Interpreter, CPU);
+  ENABLED_BACKENDS(Interpreter, CPU, OpenCL);
   testBatchedReduceAddWithAxis<float>(bindings_, mod_, F_, EE_,
                                       ElemKind::FloatTy);
 }
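For reference, the expected values follow directly from the input: the batch placeholder holds 0 through 11 in a {2, 3, 2} tensor, so reducing over axis 1 sums triples with stride 2 (0+2+4 = 6, 1+3+5 = 9, 6+8+10 = 24, 7+9+11 = 27) to give the existing {2, 2} result, while reducing over axis 2 sums adjacent pairs (0+1 = 1, 2+3 = 5, 4+5 = 9, 6+7 = 13, 8+9 = 17, 10+11 = 21) to give the new {2, 3} result.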
