diff --git a/lib/Backends/OpenCL/OpenCL.cpp b/lib/Backends/OpenCL/OpenCL.cpp index bcb3f2046b..77fd1aaf71 100644 --- a/lib/Backends/OpenCL/OpenCL.cpp +++ b/lib/Backends/OpenCL/OpenCL.cpp @@ -922,8 +922,9 @@ llvm::Error OpenCLFunction::execute(ExecutionContext *context) { continue; } - if (auto *BRA = dyn_cast<BatchedReduceAddInst>(&I)) { + if (auto *BRA = dyn_cast<OCLBatchedReduceAddInst>(&I)) { auto axis = BRA->getAxis(); + auto axisSrcSliceSize = BRA->getAxisSrcSliceSize(); // Determine and store the slice sizes of each input dimension excluding // the reduce axis into batchSliceSizes. Determine also the slice size on @@ -931,25 +932,7 @@ llvm::Error OpenCLFunction::execute(ExecutionContext *context) { // to index correctly into the input buffer. If the input has one // dimension (that is also the reduce axis), store one slice of size 1 // into batchSliceSizes. - auto batchDims = BRA->getBatch()->getType()->dims(); - auto numBatchDims = batchDims.size(); - std::vector<size_t> batchSliceSizes( - numBatchDims > 1 ? numBatchDims - 1 : 1, 1); - size_t currentSliceSize = 1, axisSliceSize = 1; - for (ssize_t i = batchDims.size() - 1, j = batchSliceSizes.size() - 1; - i >= 0; --i) { - // If i is the reduce axis, currentSliceSize is the slice size at the - // reduce axis. Store it in axisSliceSize and not in batchSliceSizes. If - // not, do the opposite. - if (i == axis) { - axisSliceSize = currentSliceSize; - } else { - batchSliceSizes[j--] = currentSliceSize; - } - - // Compute the slice size for the next iteration. - currentSliceSize *= batchDims[i]; - } + auto batchDims = BRA->getSrc()->getType()->dims(); // Determine and store the slice sizes of each output dimension excluding // the reduce axis into destSliceSizes. These are used by the kernel to @@ -960,64 +943,14 @@ llvm::Error OpenCLFunction::execute(ExecutionContext *context) { if (destDims.empty()) { destDimsVec.emplace_back(1); } - auto numDestDims = destDimsVec.size(); - std::vector<size_t> destSliceSizes(numDestDims > 0 ? 
numDestDims : 1, 1); - - // Start i at destDimsVec.size() - 2 because the last slice size is always - // known to be 1. - for (ssize_t i = destDimsVec.size() - 2; i >= 0; --i) { - // The slice size of the current dimension is the slice size of the - // previous dimension multiplied by the number of elements in that - // dimension. - destSliceSizes[i] = destSliceSizes[i + 1] * destDimsVec[i + 1]; - } - - // Allocate device buffers for batchSliceSizes and destSliceSizes. - size_t batchSlicesBufSize = batchSliceSizes.size() * sizeof(size_t); - size_t destSlicesBufSize = destSliceSizes.size() * sizeof(size_t); - cl_mem batchSlicesBuf = allocDeviceBuffer(batchSlicesBufSize); - cl_mem destSlicesBuf = allocDeviceBuffer(destSlicesBufSize); - - // Copy batchSliceSizes and destSliceSizes from host to device. - cl_event writeBatchSlicesEvent{nullptr}, writeDestSlicesEvent{nullptr}; - cl_int err = clEnqueueWriteBuffer( - commands_, batchSlicesBuf, /*blocking_write=*/CL_FALSE, /*offset=*/0, - batchSlicesBufSize, batchSliceSizes.data(), - /* num_events_in_wait_list */ 0, - /* event_list */ nullptr, - /* event */ kernelProfiling_ ? &writeBatchSlicesEvent : nullptr); - CHECK_EQ(err, CL_SUCCESS) << "Unable to copy BRA data to the device"; - if (kernelProfiling_) { - kernelLaunches_.emplace_back(KernelLaunch("batchedReduceAddSliceData", - "batchedReduceAddSliceData", - writeBatchSlicesEvent)); - } - - err = clEnqueueWriteBuffer( - commands_, destSlicesBuf, /*blocking_write=*/CL_FALSE, /*offset=*/0, - destSlicesBufSize, destSliceSizes.data(), - /* num_events_in_wait_list */ 0, - /* event_list */ nullptr, - /* event */ kernelProfiling_ ? &writeDestSlicesEvent : nullptr); - CHECK_EQ(err, CL_SUCCESS) << "Unable to copy BRA data to the device"; - if (kernelProfiling_) { - kernelLaunches_.emplace_back(KernelLaunch("batchedReduceAddSliceData", - "batchedReduceAddSliceData", - writeDestSlicesEvent)); - } - - // Wait for the writes to finish. 
- clFinish(commands_); // Create kernel and set arguments. cl_kernel kernel = createKernel(kernelName); setKernelArg(kernel, 0, deviceBuffer_); auto numArgs = setKernelArgsForBuffers(kernel, I, 1, runtimeBundle_); - setKernelArg(kernel, numArgs + 1, batchSlicesBuf); - setKernelArg(kernel, numArgs + 2, destSlicesBuf); - setKernelArg(kernel, numArgs + 3, batchDims[axis]); - setKernelArg(kernel, numArgs + 4, axisSliceSize); + setKernelArg(kernel, numArgs + 1, batchDims[axis]); + setKernelArg(kernel, numArgs + 2, axisSrcSliceSize); // Parallelize on each element in the slice. enqueueKernel(I.getName(), commands_, kernel, deviceId_, destDimsVec, @@ -1962,6 +1895,7 @@ bool OCLBackend::isOpSupported(const NodeInfo &NI) const { case Kinded::Kind::SaveNodeKind: case Kinded::Kind::ReshapeNodeKind: + case Kinded::Kind::OCLBatchedReduceAddNodeKind: case Kinded::Kind::TraceEventNodeKind: // These work regardless of the underlying type. return true; diff --git a/lib/Backends/OpenCL/Transforms.cpp b/lib/Backends/OpenCL/Transforms.cpp index 733136bb1e..c0ba22f2db 100644 --- a/lib/Backends/OpenCL/Transforms.cpp +++ b/lib/Backends/OpenCL/Transforms.cpp @@ -33,12 +33,13 @@ bool OCLBackend::transformPostLowering(Function *F, LOG_SCOPE(F->getLogContext(), "OCLBackend::transformPostLowering") - if (cctx.compMode == CompilationMode::Train) - return false; - bool changed = false; for (auto &node : F->getNodes()) { if (auto *CN = dyn_cast<ConvolutionNode>(&node)) { + if (cctx.compMode == CompilationMode::Train) { + continue; + } + // TODO: OpenCL fast convolution kernel itself has some issue with group > // 1, which will be investigated later. So far, if the group > 1, we just // call the slow convolution kernel. 
@@ -50,17 +51,101 @@ bool OCLBackend::transformPostLowering(Function *F, continue; } if (auto *PMN = dyn_cast<MaxPoolNode>(&node)) { + if (cctx.compMode == CompilationMode::Train) { + continue; + } + auto *NR = convertPoolToNCHWPool<MaxPoolNode, OCLMaxPoolNode>(PMN, F); PMN->getResult().replaceAllUsesOfWith(NR); changed = true; continue; } if (auto *PAN = dyn_cast<AvgPoolNode>(&node)) { + if (cctx.compMode == CompilationMode::Train) { + continue; + } + auto *NR = convertPoolToNCHWPool<AvgPoolNode, OCLAvgPoolNode>(PAN, F); PAN->getResult().replaceAllUsesOfWith(NR); changed = true; continue; } + // The code below replaces a regular BatchedReduceAddNode with a + // semantically identical OCLBatchedReduceAddNode that has two additional + // inputs for the slice sizes of the input and output nodes. The OpenCL + // implementation of the batchedreduceadd instruction needs this information + // and storing it in graph Constants ensures that it will be copied to the + // device with the rest of the Function's Constants. Consequently, it does + // not need to be copied separately or at runtime (which would increase + // execution latency). + if (auto *BRA = dyn_cast<BatchedReduceAddNode>(&node)) { + auto axis = BRA->getAxis(); + + // Determine and store the slice sizes of each input dimension excluding + // the reduce axis into batchSliceSizes. Determine also the slice size on + // the reduce axis and store that separately. These are used by the kernel + // to index correctly into the input buffer. If the input has one + // dimension (that is also the reduce axis), store one slice of size 1 + // into batchSliceSizes. + auto batchDims = BRA->getBatch().getType()->dims(); + auto numBatchDims = batchDims.size(); + auto batchSliceSizesLen = numBatchDims > 1 ? 
numBatchDims - 1 : 1; + auto *batchSliceSizes = F->getParent()->createConstant( + ElemKind::Int32ITy, {batchSliceSizesLen}, "batchSliceSizes"); + auto batchSliceSizesH = + batchSliceSizes->getPayloadMutable().getHandle<int32_t>(); + batchSliceSizesH.clear(1); + + size_t currentSliceSize = 1, axisSliceSize = 1; + unsigned j = batchSliceSizesLen - 1; + for (ssize_t i = batchDims.size() - 1; i >= 0; --i) { + // If i is the reduce axis, currentSliceSize is the slice size at the + // reduce axis. Store it in axisSliceSize and not in batchSliceSizes. If + // not, do the opposite. + if (i == axis) { + axisSliceSize = currentSliceSize; + } else { + batchSliceSizesH.at({j--}) = currentSliceSize; + } + // Compute the slice size for the next iteration. + currentSliceSize *= batchDims[i]; + } + + // Determine and store the slice sizes of each output dimension excluding + // the reduce axis into destSliceSizes. These are used by the kernel to + // index correctly into the output buffer. If the output has zero + // dimensions store one slice of size 1 into destSliceSizes. + auto destDims = BRA->getResult().getType()->dims(); + std::vector<size_t> destDimsVec(destDims.begin(), destDims.end()); + if (destDims.empty()) { + destDimsVec.emplace_back(1); + } + auto numDestDims = destDimsVec.size(); + auto destSliceSizesLen = numDestDims > 0 ? numDestDims : 1; + auto *destSliceSizes = F->getParent()->createConstant( + ElemKind::Int32ITy, {destSliceSizesLen}, "destSliceSizes"); + auto destSliceSizesH = + destSliceSizes->getPayloadMutable().getHandle<int32_t>(); + destSliceSizesH.clear(1); + + // Start i at destDimsVec.size() - 2 because the last slice size is always + // known to be 1. + for (ssize_t i = destDimsVec.size() - 2; i >= 0; --i) { + // The slice size of the current dimension is the slice size of the + // previous dimension multiplied by the number of elements in that + // dimension. 
+ destSliceSizesH.at({static_cast<size_t>(i)}) = + destSliceSizesH.at({static_cast<size_t>(i + 1)}) * + destDimsVec[i + 1]; + } + + auto *OCLBRA = F->addNode(new OCLBatchedReduceAddNode( + BRA->getName(), BRA->getResult().getType(), BRA->getBatch(), + destSliceSizes, batchSliceSizes, axis, axisSliceSize)); + BRA->getResult().replaceAllUsesOfWith(OCLBRA); + changed = true; + continue; + } } return changed; } diff --git a/lib/Backends/OpenCL/kernels.cl b/lib/Backends/OpenCL/kernels.cl index af93833529..8649c79ce7 100644 --- a/lib/Backends/OpenCL/kernels.cl +++ b/lib/Backends/OpenCL/kernels.cl @@ -618,11 +618,11 @@ __kernel void elementcmplteW(__global void *mem, cl_uint32_t dest, elementcmplteK(&mem[dest], &mem[LHS], &mem[RHS]); } -__kernel void batchedreduceaddK(__global float *dest, __global float *batch, - __global cl_host_size_t *batchSliceSizes, - __global cl_host_size_t *destSliceSize, - cl_uint32_t numSlices, - cl_uint32_t axisSliceSize) { +__kernel void oclbatchedreduceaddK(__global float *dest, __global float *batch, + __global cl_int32_t *destSliceSizes, + __global cl_int32_t *batchSliceSizes, + cl_uint32_t numSlices, + cl_uint32_t axisSliceSize) { size_t workDim = get_work_dim(); // This is the component of the offset into batch that depends only on the @@ -648,7 +648,7 @@ __kernel void batchedreduceaddK(__global float *dest, __global float *batch, for (size_t i = 0; i < workDim; ++i) { size_t id = get_global_id(i); batchOffset += id * batchSliceSizes[i]; - destOffset += id * destSliceSize[i]; + destOffset += id * destSliceSizes[i]; } // Perform the actual reduce. 
Add the slice number * the slice size at the @@ -660,11 +660,11 @@ __kernel void batchedreduceaddK(__global float *dest, __global float *batch, } __kernel void -batchedreduceaddW(__global void *mem, cl_uint32_t dest, cl_uint32_t batch, - __global void *batchSliceSizes, __global void *destSliceSizes, - cl_uint32_t numSlices, cl_uint32_t axisSliceSize) { - batchedreduceaddK(&mem[dest], &mem[batch], batchSliceSizes, destSliceSizes, - numSlices, axisSliceSize); +oclbatchedreduceaddW(__global void *mem, cl_uint32_t dest, cl_uint32_t batch, + cl_uint32_t destSliceSizes, cl_uint32_t batchSliceSizes, + cl_uint32_t numSlices, cl_uint32_t axisSliceSize) { + oclbatchedreduceaddK(&mem[dest], &mem[batch], &mem[destSliceSizes], + &mem[batchSliceSizes], numSlices, axisSliceSize); } __kernel void batchedaddK(__global float *dest, __global float *batch, diff --git a/tools/ClassGen/Backends/OpenCL/OpenCLSpecificInstrs.h b/tools/ClassGen/Backends/OpenCL/OpenCLSpecificInstrs.h index 12fec9d1b5..76f0ad50c2 100644 --- a/tools/ClassGen/Backends/OpenCL/OpenCLSpecificInstrs.h +++ b/tools/ClassGen/Backends/OpenCL/OpenCLSpecificInstrs.h @@ -47,6 +47,16 @@ BB.newBackendSpecificInstr("OCLMaxPool") .autoIRGen() .autoVerify(VerifyKind::SameElementType, {"Dest", "Src"}); +BB.newBackendSpecificInstr("OCLBatchedReduceAdd") + .addOperand("Dest", OperandKind::Out) + .addOperand("Src", OperandKind::In) + .addOperand("DestSliceSizes", OperandKind::In) + .addOperand("SrcSliceSizes", OperandKind::In) + .addMember(MemberType::Unsigned, "Axis") + .addMember(MemberType::Unsigned, "AxisSrcSliceSize") + .autoVerify(VerifyKind::SameElementType, {"Dest", "Src"}) + .autoIRGen(); + BB.includeBackendSpecificVerification( "glow/OpenCLSpecificInstrsVerification.h"); diff --git a/tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodes.h b/tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodes.h index de3fe0c0d9..0683117803 100644 --- a/tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodes.h +++ 
b/tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodes.h @@ -51,6 +51,17 @@ BB.newNode("OCLMaxPool") "provided " "Kernel, Stride, and Pads. The input and output are in NCHW format"); +BB.newNode("OCLBatchedReduceAdd") + .addInput("Input") + .addInput("DestSliceSizes") + .addInput("SrcSliceSizes") + .addMember(MemberType::Unsigned, "Axis") + .addMember(MemberType::Unsigned, "AxisSrcSliceSize") + .addResultFromCtorArg() + .setDocstring( + "This is an OpenCL-specific BatchedReduceAdd operation which has the " + "slice sizes of the input and output as explicit inputs."); + BB.includeBackendSpecificVerification("glow/OpenCLSpecificNodesVerification.h"); #endif // GLOW_WITH_CPU diff --git a/tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodesVerification.h b/tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodesVerification.h index 44ff7a18eb..553d9b2435 100644 --- a/tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodesVerification.h +++ b/tools/ClassGen/Backends/OpenCL/OpenCLSpecificNodesVerification.h @@ -30,4 +30,71 @@ bool OCLConvolutionNode::verify() const { bool OCLAvgPoolNode::verify() const { return true; } bool OCLMaxPoolNode::verify() const { return true; } + +bool OCLBatchedReduceAddNode::verify() const { + Constant *destSliceSizes = + llvm::dyn_cast<Constant>(getDestSliceSizes().getNode()); + Constant *srcSliceSizes = + llvm::dyn_cast<Constant>(getSrcSliceSizes().getNode()); + + // Both the destSliceSizes and srcSliceSizes should be Constants. + if (!destSliceSizes || !srcSliceSizes) { + return false; + } + + // Check that the values of destSliceSizes and srcSliceSizes still match the + // Types of the Input and Result. For more information, see + // OCLBackend::transformPostLowering. 
+ bool ok = true; + auto srcSliceSizesH = srcSliceSizes->getPayload().getHandle<int32_t>(); + auto srcDims = getInput().getType()->dims(); + + if (!srcDims.empty()) { + unsigned_t currentSliceSize = 1; + unsigned j = srcSliceSizesH.size() - 1; + for (ssize_t i = srcDims.size() - 1; i >= 0; --i) { + if (i == getAxis()) { + ok &= expectCompareTrue("axisSrcSlizeSize is incorrect", + getAxisSrcSliceSize(), currentSliceSize, this); + } else { + ok &= + expectCompareTrue("srcSliceSize is incorrect", + static_cast<unsigned_t>(srcSliceSizesH.at({j--})), + currentSliceSize, this); + } + currentSliceSize *= srcDims[i]; + } + } else { + ok &= expectCompareTrue("axisSrcSlizeSize is incorrect", + getAxisSrcSliceSize(), static_cast<unsigned_t>(1), + this); + ok &= + expectCompareTrue("srcSliceSizes has the wrong shape", + srcSliceSizesH.size(), static_cast<size_t>(1), this); + ok &= expectCompareTrue("srcSliceSizes is incorrect", + srcSliceSizesH.at({0}), 1, this); + } + + auto destDims = getResult().getType()->dims(); + std::vector<size_t> destDimsVec(destDims.begin(), destDims.end()); + if (destDims.empty()) { + destDimsVec.emplace_back(1); + } + auto destSliceSizesH = destSliceSizes->getPayload().getHandle<int32_t>(); + + ok &= + expectCompareTrue("destSliceSizes is incorrect", + destSliceSizesH.at({destDimsVec.size() - 1}), 1, this); + + for (ssize_t i = destDimsVec.size() - 2; i >= 0; --i) { + ok &= expectCompareTrue("destSliceSizes is incorrect", + destSliceSizesH.at({static_cast<size_t>(i)}), + destSliceSizesH.at({static_cast<size_t>(i + 1)}) * + destDimsVec[i + 1], + this); + } + + return ok; +} + #endif // GLOW_WITH_OPENCL