
Commit e058955: Add axis to the batched reduce add

1 parent cd11b90 commit e058955

14 files changed: +270 -84 lines changed

include/glow/Base/Tensor.h

Lines changed: 6 additions & 0 deletions

@@ -39,6 +39,12 @@ class Tensor;
 void genericTranspose(Tensor *src, Tensor *dest,
                       llvm::ArrayRef<unsigned> shuffle);
 
+/// Helper function that \returns a ShapeVector of those dimensions in \p
+/// currDims expanded with dimension = 1 until the maximum tensor dimension is
+/// reached. The number of elements in the input dims is the same as in the
+/// returned dims. For example, input {2,1,4} would result in {2,1,4,1,1,1}.
+ShapeVector expandDimsToMax(llvm::ArrayRef<size_t> currDims);
+
 /// A class that represents a contiguous n-dimensional array (a tensor).
 class Tensor final {
   /// A pointer to the tensor data.

include/glow/Graph/Graph.h

Lines changed: 3 additions & 2 deletions

@@ -350,10 +350,11 @@ class Function final : public Named {
                                NodeValue rhs);
 
   BatchedReduceAddNode *createBatchedReduceAdd(llvm::StringRef name,
-                                               NodeValue batch);
+                                               NodeValue batch, size_t axis);
 
   BatchedReduceAddNode *createBatchedReduceAdd(llvm::StringRef name,
-                                               TypeRef outTy, NodeValue batch);
+                                               TypeRef outTy, NodeValue batch,
+                                               size_t axis);
 
   BatchedAddNode *createBatchedAdd(llvm::StringRef name, NodeValue batch,
                                    NodeValue sample);
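
For orientation, a minimal caller-side sketch of the updated API. This is not part of the commit; the Module/Variable setup mirrors Glow's examples and the exact helper signatures may differ by version:

#include "glow/Graph/Graph.h"

// Hypothetical builder code exercising the new axis parameter.
void buildReduceExample(glow::Module &mod) {
  using namespace glow;
  Function *F = mod.createFunction("main");
  // A float batch of shape {8, 3, 4}.
  auto *batch = mod.createVariable(ElemKind::FloatTy, {8, 3, 4}, "batch");
  // Reduce over axis 1: the result has shape {8, 4}. Passing axis = 0
  // reproduces the previous, axis-less behavior (reduce over the batch dim).
  auto *R = F->createBatchedReduceAdd("reduce.add", batch, /*axis=*/1);
  F->createSave("save", R);
}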

lib/Backends/CPU/LLVMIRGen.cpp

Lines changed: 14 additions & 8 deletions

@@ -217,7 +217,8 @@ void LLVMIRGen::initCodeGen() {
 
 /// \returns the LLVM type corresponding to the type of elements stored in \p
 /// val.
-llvm::Type *LLVMIRGen::getElementType(llvm::IRBuilder<> &builder, const Value *val) {
+llvm::Type *LLVMIRGen::getElementType(llvm::IRBuilder<> &builder,
+                                      const Value *val) {
   switch (val->getElementType()) {
   case ElemKind::IndexTy:
     return builder.getIntNTy(sizeof(size_t) * 8);

@@ -1305,11 +1306,14 @@ void LLVMIRGen::generateLLVMIRForInstr(llvm::IRBuilder<> &builder,
   auto *batch = BR->getBatch();
   auto *destPtr = emitValueAddress(builder, dest);
   auto *batchPtr = emitValueAddress(builder, batch);
+  auto *axis = emitConstSizeT(builder, BR->getAxis());
 
-  auto *destSize = emitConstSizeT(builder, dest->size());
-  auto bdim = flattenCdr(batch->dims());
-  auto *numSlice = emitConstSizeT(builder, bdim.first);
-  auto *sliceSize = emitConstSizeT(builder, bdim.second);
+  ShapeVector eBatchDims = expandDimsToMax(batch->dims());
+  ShapeVector eDestDims = eBatchDims;
+  eDestDims[BR->getAxis()] = 1;
+
+  auto *batchDims = emitConstArray(builder, eBatchDims);
+  auto *destDims = emitConstArray(builder, eDestDims);
 
   auto *F = getFunction("batchedreduceadd", dest->getElementType());
 
@@ -1332,11 +1336,13 @@ void LLVMIRGen::generateLLVMIRForInstr(llvm::IRBuilder<> &builder,
     auto *batchScale = emitConstI32(builder, batchScaleParams.scale_);
 
     createCall(builder, F,
-               {destPtr, batchPtr, destSize, numSlice, sliceSize, destOffset,
-                batchOffset, batchPre, batchPost, batchScale});
+               {destPtr, batchPtr, destDims, batchDims, destOffset,
+                batchOffset, batchPre, batchPost, batchScale, axis});
   } else {
+    auto *destSize = emitConstSizeT(builder, dest->size());
+
     createCall(builder, F,
-               {destPtr, batchPtr, destSize, numSlice, sliceSize});
+               {destPtr, batchPtr, destSize, destDims, batchDims, axis});
   }
   break;
 }
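
To make the shape handling concrete, here is a standalone sketch of the dims the IRGen now computes before emitting the libjit call. It is plain C++, independent of Glow's types, and assumes max_tensor_dimensions == 6 as in this commit:

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  const size_t maxDims = 6;                  // stand-in for max_tensor_dimensions
  std::vector<size_t> batchDims = {8, 3, 4}; // original batch shape
  const size_t axis = 1;

  // expandDimsToMax: pad the shape with 1s up to the maximum rank.
  std::vector<size_t> eBatchDims = batchDims;
  while (eBatchDims.size() < maxDims)
    eBatchDims.push_back(1);

  // The dest dims match the batch dims, with the reduced axis set to 1.
  std::vector<size_t> eDestDims = eBatchDims;
  eDestDims[axis] = 1;

  assert((eBatchDims == std::vector<size_t>{8, 3, 4, 1, 1, 1}));
  assert((eDestDims == std::vector<size_t>{8, 1, 4, 1, 1, 1}));
}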

lib/Backends/CPU/libjit/libjit.cpp

Lines changed: 58 additions & 21 deletions

@@ -694,32 +694,69 @@ void libjit_batchedadd_i8(int8_t *dest, const int8_t *batch,
   }
 }
 
+/// The dimensions passed in here are pre-expanded in LLVMIRGen with 1s so that
+/// we can iterate over the shape here regardless of the tensor's rank.
 void libjit_batchedreduceadd_f(float *dest, const float *batch, size_t destSize,
-                               size_t numSlice, size_t sliceSize) {
-  for (size_t i = 0; i < destSize; i++) {
+                               const size_t *destDims, const size_t *batchDims,
+                               size_t axis) {
+  for (size_t i = 0; i < destSize; i++)
     dest[i] = 0.0;
-  }
-  for (size_t n = 0; n < numSlice; n++) {
-    size_t base = n * sliceSize;
-    for (size_t i = 0; i < sliceSize; i++) {
-      dest[i] += batch[base + i];
-    }
-  }
+
+  for (size_t x = 0; x < batchDims[0]; x++)
+    for (size_t y = 0; y < batchDims[1]; y++)
+      for (size_t z = 0; z < batchDims[2]; z++)
+        for (size_t w = 0; w < batchDims[3]; w++)
+          for (size_t q = 0; q < batchDims[4]; q++)
+            for (size_t r = 0; r < batchDims[5]; r++) {
+              size_t I[] = {x, y, z, w, q, r};
+              I[axis] = 0;
+              dest[libjit_getXYZWQR(destDims, I[0], I[1], I[2], I[3], I[4],
+                                    I[5])] +=
+                  batch[libjit_getXYZWQR(batchDims, x, y, z, w, q, r)];
+            }
 }
 
+/// As in the non-quantized version, the dimensions here are pre-expanded in
+/// LLVMIRGen. However, for quantization we must accumulate in the inner-most
+/// loop with higher precision (int32_t) and then clip the result back into the
+/// dest tensor. Thus we add max_tensor_dimensions different cases for this to
+/// ensure the axis is used as the inner-most loop.
void libjit_batchedreduceadd_i8(int8_t *dest, const int8_t *batch,
-                                size_t destSize, size_t numSlice,
-                                size_t sliceSize, int32_t destOffset,
-                                int32_t batchOffset, int32_t batchPre,
-                                int32_t batchPost, int32_t batchScale) {
-  for (size_t i = 0; i < sliceSize; i++) {
-    int32_t sum = 0;
-    for (size_t n = 0; n < numSlice; n++) {
-      sum += batch[n * sliceSize + i] - batchOffset;
-    }
-    int32_t q =
-        libjit_scale_i32i8(sum, batchPre, batchPost, batchScale, destOffset);
-    dest[i] = libjit_clip(q);
+                                const size_t *destDims, const size_t *batchDims,
+                                int32_t destOffset, int32_t batchOffset,
+                                int32_t batchPre, int32_t batchPost,
+                                int32_t batchScale, size_t axis) {
+  switch (axis) {
+#define LOOP_AXIS_CASE(_D0, _D1, _D2, _D3, _D4, _D5_AXIS)                     \
+  case _D5_AXIS:                                                              \
+    for (size_t i##_D0 = 0; i##_D0 < batchDims[_D0]; i##_D0++)                \
+      for (size_t i##_D1 = 0; i##_D1 < batchDims[_D1]; i##_D1++)              \
+        for (size_t i##_D2 = 0; i##_D2 < batchDims[_D2]; i##_D2++)            \
+          for (size_t i##_D3 = 0; i##_D3 < batchDims[_D3]; i##_D3++)          \
+            for (size_t i##_D4 = 0; i##_D4 < batchDims[_D4]; i##_D4++) {      \
+              int32_t sum = 0;                                                \
+              for (size_t i##_D5_AXIS = 0; i##_D5_AXIS < batchDims[_D5_AXIS]; \
+                   i##_D5_AXIS++) {                                           \
+                sum += batch[libjit_getXYZWQR(batchDims, i0, i1, i2, i3, i4,  \
+                                              i5)] -                          \
+                       batchOffset;                                           \
+              }                                                               \
+              size_t i##_D5_AXIS = 0;                                         \
+              int32_t res = libjit_scale_i32i8(sum, batchPre, batchPost,      \
+                                               batchScale, destOffset);       \
+              dest[libjit_getXYZWQR(destDims, i0, i1, i2, i3, i4, i5)] =      \
+                  libjit_clip(res);                                           \
+            }                                                                 \
+    return;
+
+  // Each loop order, with the inner-most dimension/index equal to the axis.
+  LOOP_AXIS_CASE(1, 2, 3, 4, 5, 0);
+  LOOP_AXIS_CASE(0, 2, 3, 4, 5, 1);
+  LOOP_AXIS_CASE(0, 1, 3, 4, 5, 2);
+  LOOP_AXIS_CASE(0, 1, 2, 4, 5, 3);
+  LOOP_AXIS_CASE(0, 1, 2, 3, 5, 4);
+  LOOP_AXIS_CASE(0, 1, 2, 3, 4, 5);
+#undef LOOP_AXIS_CASE
   }
 }
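
The int32_t accumulator is the point of the quantized variant: summing int8 values directly would overflow almost immediately, so only the final rescaled value is clipped back to int8. A simplified standalone sketch of that pattern follows; the rescale here is illustrative only, while the real code uses libjit_scale_i32i8 with the pre/post/scale parameters:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Stand-in for libjit_clip: saturate an int32 value into int8 range.
static int8_t clipToInt8(int32_t v) {
  return (int8_t)std::min<int32_t>(std::max<int32_t>(v, -128), 127);
}

int main() {
  // A 1-D reduction over 300 quantized elements with batchOffset == 0.
  int8_t batch[300];
  for (int i = 0; i < 300; i++)
    batch[i] = 100;

  // Accumulate in int32: 300 * 100 = 30000, far outside int8's range.
  int32_t sum = 0;
  for (int i = 0; i < 300; i++)
    sum += batch[i]; // minus batchOffset, omitted since it is 0 here

  // Rescale once at the end, then clip into the destination type.
  int8_t dest = clipToInt8(sum / 256); // illustrative rescale factor
  printf("sum=%d dest=%d\n", sum, dest);
}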

lib/Backends/CPU/libjit/libjit_defs.h

Lines changed: 9 additions & 0 deletions

@@ -54,6 +54,15 @@ inline void AdduFloat8(float *p, float8 v) {
   StoreuFloat8(p, LoaduFloat8(p) + v);
 }
 
+/// \returns the index of the element at x,y,z,w,q,r.
+inline size_t libjit_getXYZWQR(const size_t *dims, size_t x, size_t y, size_t z,
+                               size_t w, size_t q, size_t r) {
+  return (x * dims[1] * dims[2] * dims[3] * dims[4] * dims[5]) +
+         (y * dims[2] * dims[3] * dims[4] * dims[5]) +
+         (z * dims[3] * dims[4] * dims[5]) + (w * dims[4] * dims[5]) +
+         (q * dims[5]) + r;
+}
+
 /// \returns the index of the element at x,y,z,w,q.
 inline size_t libjit_getXYZWQ(const size_t *dims, size_t x, size_t y, size_t z,
                               size_t w, size_t q) {
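
A quick sanity check of the row-major arithmetic, using hypothetical dims rather than anything from this diff:

#include <cassert>
#include <cstddef>

// Local copy of the helper above, so the check is self-contained.
static size_t getXYZWQR(const size_t *dims, size_t x, size_t y, size_t z,
                        size_t w, size_t q, size_t r) {
  return (x * dims[1] * dims[2] * dims[3] * dims[4] * dims[5]) +
         (y * dims[2] * dims[3] * dims[4] * dims[5]) +
         (z * dims[3] * dims[4] * dims[5]) + (w * dims[4] * dims[5]) +
         (q * dims[5]) + r;
}

int main() {
  const size_t dims[6] = {2, 3, 4, 5, 6, 7};
  // 1*2520 + 2*840 + 3*210 + 4*42 + 5*7 + 6 = 5039: the last element of a
  // 5040-element tensor, as expected.
  assert(getXYZWQR(dims, 1, 2, 3, 4, 5, 6) == 5039);
}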

lib/Backends/Interpreter/InterpreterNodes.cpp

Lines changed: 81 additions & 36 deletions

@@ -1302,54 +1302,99 @@ void Interpreter::fwdBatchedAddInst(const glow::BatchedAddInst *I) {
 }
 
 void Interpreter::fwdBatchedReduceAddInst(const glow::BatchedReduceAddInst *I) {
-  if (getTensor(I->getBatch())->getType().isQuantizedType()) {
-    auto dest = getWeightHandle<int8_t>(I->getDest());
-    auto batch = getWeightHandle<int8_t>(I->getBatch());
+  static_assert(max_tensor_dimensions == 6,
+                "Loops below assume max_tensor_dimensions = 6.");
 
-    auto destTy = I->getDest()->getType();
-    auto batchTy = I->getBatch()->getType();
+  auto *batch = I->getBatch();
+  auto *dest = I->getDest();
+  const auto axis = I->getAxis();
+
+  // Initialize both expanded batch and dest dims to the expanded batch dims.
+  // This lets the max_tensor_dimensions loops below iterate over the tensor
+  // regardless of its shape.
+  ShapeVector eBatchDims = expandDimsToMax(batch->dims());
+  ShapeVector eDestDims = eBatchDims;
+
+  // Set the destination axis dimension (the one we are reducing) to 1.
+  eDestDims[axis] = 1;
+
+  if (getTensor(batch)->getType().isQuantizedType()) {
+    auto destTy = dest->getType();
+    auto batchTy = batch->getType();
 
     float destScale = destTy->getScale();
     float batchScale = batchTy->getScale();
 
     int32_t destOffset = destTy->getOffset();
     int32_t batchOffset = batchTy->getOffset();
 
-    auto bdim = flattenCdr(batch.dims());
-
-    // The following loop order is inefficient but easy to implement correctly;
-    // as this is the Interpreter, we prioritize simplicity and correctness
-    // above all else.
-    // For each element in the slice:
-    for (size_t i = 0; i < bdim.second; i++) {
-      float sum = 0.0;
-
-      // For each layer in the batch:
-      for (size_t n = 0; n < bdim.first; n++) {
-        size_t base = batch.getElementPtr({n});
-        sum += batch.raw(base + i) - batchOffset;
-      }
+    // Get unowned handles of the batch and dest with these new expanded dims.
+    auto eBatch = getTensor(batch)->getUnowned(eBatchDims);
+    auto eDest = getTensor(dest)->getUnowned(eDestDims);
+    auto eBatchH = eBatch.getHandle<int8_t>();
+    auto eDestH = eDest.getHandle<int8_t>();
+    eDestH.clear();
+
+    // For quantization, we must accumulate in the inner-most loop into a local
+    // float and then clip the result back into the dest tensor. Here are the
+    // max_tensor_dimensions cases for this, to ensure the axis is used as the
+    // inner-most loop.
+    switch (axis) {
+#define LOOP_AXIS_CASE(_D0, _D1, _D2, _D3, _D4, _D5_AXIS)                     \
+  case _D5_AXIS:                                                              \
+    for (size_t i##_D0 = 0; i##_D0 < eBatchDims[_D0]; i##_D0++)               \
+      for (size_t i##_D1 = 0; i##_D1 < eBatchDims[_D1]; i##_D1++)             \
+        for (size_t i##_D2 = 0; i##_D2 < eBatchDims[_D2]; i##_D2++)           \
+          for (size_t i##_D3 = 0; i##_D3 < eBatchDims[_D3]; i##_D3++)         \
+            for (size_t i##_D4 = 0; i##_D4 < eBatchDims[_D4]; i##_D4++) {     \
+              float sum = 0.0;                                                \
+              for (size_t i##_D5_AXIS = 0;                                    \
+                   i##_D5_AXIS < eBatchDims[_D5_AXIS]; i##_D5_AXIS++) {       \
+                sum += eBatchH.at({i0, i1, i2, i3, i4, i5}) - batchOffset;    \
+              }                                                               \
+              size_t i##_D5_AXIS = 0;                                         \
+              int32_t res =                                                   \
+                  std::round(sum * batchScale / destScale) + destOffset;      \
+              eDestH.at({i0, i1, i2, i3, i4, i5}) =                           \
+                  quantization::clip<int32_t, int8_t>(res);                   \
+            }                                                                 \
+    return;
 
-      int32_t q = std::round(sum * batchScale / destScale) + destOffset;
-      dest.raw(i) = quantization::clip<int32_t, int8_t>(q);
+    // Each loop order, with the inner-most dimension/index equal to the axis.
+    LOOP_AXIS_CASE(1, 2, 3, 4, 5, 0);
+    LOOP_AXIS_CASE(0, 2, 3, 4, 5, 1);
+    LOOP_AXIS_CASE(0, 1, 3, 4, 5, 2);
+    LOOP_AXIS_CASE(0, 1, 2, 4, 5, 3);
+    LOOP_AXIS_CASE(0, 1, 2, 3, 5, 4);
+    LOOP_AXIS_CASE(0, 1, 2, 3, 4, 5);
+#undef LOOP_AXIS_CASE
+    default:
+      llvm_unreachable("Axis should be less than max_tensor_dimensions.");
     }
-    return;
   }
 
-  auto batch = getWeightHandle(I->getBatch());
-  auto dest = getWeightHandle(I->getDest());
-
-  auto bdim = flattenCdr(batch.dims());
-
-  dest.clear();
-
-  // For each layer in the batch:
-  for (size_t n = 0; n < bdim.first; n++) {
-    size_t base = batch.getElementPtr({n});
-
-    // For each element in the slice:
-    for (size_t i = 0; i < bdim.second; i++) {
-      dest.raw(i) += batch.raw(base + i);
+  // Get unowned handles of the batch and dest with these new expanded dims.
+  auto eBatch = getTensor(batch)->getUnowned(eBatchDims);
+  auto eDest = getTensor(dest)->getUnowned(eDestDims);
+  auto eBatchH = eBatch.getHandle();
+  auto eDestH = eDest.getHandle();
+  eDestH.clear();
+
+  // We can use this loop for all shapes. Use the same indices for both the
+  // batch and dest, except for setting the axis index in the dest to 0.
+  for (size_t x = 0; x < eBatchDims[0]; x++) {
+    for (size_t y = 0; y < eBatchDims[1]; y++) {
+      for (size_t z = 0; z < eBatchDims[2]; z++) {
+        for (size_t w = 0; w < eBatchDims[3]; w++) {
+          for (size_t q = 0; q < eBatchDims[4]; q++) {
+            for (size_t r = 0; r < eBatchDims[5]; r++) {
+              size_t destIndices[] = {x, y, z, w, q, r};
+              destIndices[axis] = 0;
+              eDestH.at(destIndices) += eBatchH.at({x, y, z, w, q, r});
+            }
+          }
+        }
+      }
     }
   }
 }
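
The destIndices[axis] = 0 trick is easiest to see on a small case: every batch coordinate maps to the same dest coordinate except along the reduced axis, which pins to 0. A standalone 2-D sketch, using plain arrays and assuming nothing from Glow:

#include <cassert>
#include <cstddef>

int main() {
  // A 2x3 batch reduced over axis 0; dest keeps the expanded shape {1, 3}.
  const size_t batchDims[2] = {2, 3};
  const size_t axis = 0;
  float batch[2][3] = {{1, 2, 3}, {4, 5, 6}};
  float dest[1][3] = {}; // zero-initialized, like eDestH.clear()

  for (size_t x = 0; x < batchDims[0]; x++)
    for (size_t y = 0; y < batchDims[1]; y++) {
      // Same indices for batch and dest, except the axis index pins to 0.
      size_t destIndices[2] = {x, y};
      destIndices[axis] = 0;
      dest[destIndices[0]][destIndices[1]] += batch[x][y];
    }

  // Column sums: {1+4, 2+5, 3+6}.
  assert(dest[0][0] == 5 && dest[0][1] == 7 && dest[0][2] == 9);
}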

lib/Backends/OpenCL/OpenCL.cpp

Lines changed: 2 additions & 0 deletions

@@ -739,6 +739,8 @@ void OCLBackend::doForwardPass() {
     }
 
     if (auto *BRA = dyn_cast<BatchedReduceAddInst>(&I)) {
+      assert(BRA->getAxis() == 0 && "No current support for non-zero axis.");
+
       cl_kernel kernel = createKernel(kernelName);
       setKernelArg(kernel, 0, deviceBuffer_);

lib/Base/Tensor.cpp

Lines changed: 8 additions & 0 deletions

@@ -335,3 +335,11 @@ void glow::genericTranspose(Tensor *src, Tensor *dest,
     }
   }
 }
+
+ShapeVector glow::expandDimsToMax(llvm::ArrayRef<size_t> currDims) {
+  ShapeVector newDims(currDims.begin(), currDims.end());
+  for (size_t i = newDims.size(); i < max_tensor_dimensions; i++) {
+    newDims.push_back(1);
+  }
+  return newDims;
+}
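
A small usage sketch of the new helper (a fragment; assumes the Glow headers and max_tensor_dimensions == 6):

// {2, 1, 4} is padded with trailing 1s up to the maximum rank; the element
// count (the product of the dims) is unchanged.
ShapeVector dims = glow::expandDimsToMax({2, 1, 4});
// dims is now {2, 1, 4, 1, 1, 1}.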

lib/Graph/Graph.cpp

Lines changed: 17 additions & 7 deletions

@@ -982,18 +982,28 @@ MatMulNode *Function::createMatMul(llvm::StringRef name, NodeValue lhs,
 
 BatchedReduceAddNode *Function::createBatchedReduceAdd(llvm::StringRef name,
                                                        TypeRef outTy,
-                                                       NodeValue batch) {
-  assert(outTy->size() == flattenCdr(batch.dims()).second);
+                                                       NodeValue batch,
+                                                       size_t axis) {
+  // Calculate the expected total number of elements in the output tensor based
+  // on the number of elements in the batch divided by the axis dimension.
+  const size_t outNumElements = batch.getType()->size() / batch.dims()[axis];
+  (void)outNumElements;
+  assert(outTy->size() == outNumElements &&
+         "Incorrect number of elements in the output type.");
   auto OT = getParent()->uniqueType(*outTy);
-  return addNode(new BatchedReduceAddNode(name, OT, batch));
+  return addNode(new BatchedReduceAddNode(name, OT, batch, axis));
 }
 
 BatchedReduceAddNode *Function::createBatchedReduceAdd(llvm::StringRef name,
-                                                       NodeValue batch) {
+                                                       NodeValue batch,
+                                                       size_t axis) {
   auto BT = batch.getType();
-  auto OT =
-      getParent()->uniqueType(BT->getElementType(), BT->dims().drop_front());
-  return createBatchedReduceAdd(name, OT, batch);
+
+  ShapeVector outDims(BT->dims().begin(), BT->dims().end());
+  outDims.erase(outDims.begin() + axis);
+
+  auto OT = getParent()->uniqueType(BT->getElementType(), outDims);
+  return createBatchedReduceAdd(name, OT, batch, axis);
 }
 
 BatchedAddNode *Function::createBatchedAdd(llvm::StringRef name,
0 commit comments

Comments
 (0)