
Commit 74f88b3

nrsatish authored and beicy committed
Add data structures for compute and communication time; add function to fill in compute and memory bandwidth bound times for ops
1 parent 4d751bd commit 74f88b3

File tree: 4 files changed, 244 additions and 4 deletions

  include/glow/Partitioner/Partitioner.h
  include/glow/Runtime/RuntimeTypes.h
  lib/Partitioner/Partitioner.cpp
  tests/unittests/PartitionerTest.cpp

include/glow/Partitioner/Partitioner.h

Lines changed: 10 additions & 0 deletions
@@ -25,6 +25,7 @@ namespace glow {
 using namespace runtime;
 
 using MemUsageMapTy = std::unordered_map<Node *, size_t>;
+using ComputeTimeMapTy = std::unordered_map<Node *, float>;
 using NodesSetTy = std::set<Node *>;
 using PartitionCostMapTy = llvm::DenseMap<Function *, GraphMemInfo>;
 
@@ -97,6 +98,9 @@ class Partitioner {
   /// The map of each operator and the corresponding memory size.
   MemUsageMapTy memUsage_;
 
+  /// The map of each operator and the compute runtime.
+  ComputeTimeMapTy computeTime_;
+
   /// Get the representative function (the one with the largest input) and
   /// update the memSize.
   static Function *selectRepFunc(Module *parent, size_t &memSize);
@@ -105,6 +109,9 @@ class Partitioner {
   /// function.
   void initOpMemUsage();
 
+  /// Initialize the minimal compute time for each op in the function.
+  void initOpComputeTime();
+
   /// Combine the partitions if necessary : if all outside uses of the nodes in
   /// partition1 is in partition2, and the sum of memory consumption of
   /// partition1 and partition2 is less than availableMemory, combine partition1
@@ -140,6 +147,9 @@ class Partitioner {
 
   /// Decompose each function in a module and return a list of DAGNodes.
   DAGNodeList &Partition();
+
+  /// Getter for computeTime_.
+  ComputeTimeMapTy getComputeTime() const { return computeTime_; }
 };
 } // namespace glow
 #endif // GLOW_PARTITIONER_PARTITIONER_H
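The new accessor returns the Node-to-time map by value, so callers can inspect the per-op estimates after Partition() has run. A minimal, hypothetical consumer is sketched below; dumpComputeTime is an illustrative helper, not part of this commit or of the Glow API, and it assumes Partition() has already filled computeTime_.

#include <cstdio>
#include "glow/Partitioner/Partitioner.h"

using namespace glow;

// Print the roofline estimate recorded for every node of the
// representative function (sketch only).
void dumpComputeTime(Partitioner &partitioner) {
  for (const auto &entry : partitioner.getComputeTime()) {
    // entry.first is the Node*, entry.second the estimated time given the
    // device's ops/s and bytes/s peaks.
    printf("%s: %f\n", entry.first->getName().str().c_str(), entry.second);
  }
}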

include/glow/Runtime/RuntimeTypes.h

Lines changed: 11 additions & 0 deletions
@@ -48,6 +48,17 @@ using ResultCBTy = std::function<void(
 struct DeviceInfo {
   /// Available memory on device in bytes.
   size_t availableMemory;
+  /// Available SRAM capacity in bytes.
+  size_t sramCapacity;
+  /// Peak compute on device in ops/second. Assumes all ops are in int8.
+  /// TODO: distinguish between data types with different peak flops.
+  float peakCompute;
+  /// Peak memory bandwidth from DRAM on device in bytes/second.
+  float peakDramBw;
+  /// Peak memory bandwidth from SRAM on device in bytes/second.
+  float peakSramBw;
+  /// Peak ingress/egress PCI-E bandwidth from device in bytes/second.
+  float peakPCIeBw;
 };
 
 /// Individual Node in the DAG for a given network. This contains all the
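With the new fields, a DeviceInfo can be brace-initialized in declaration order (availableMemory, sramCapacity, peakCompute, peakDramBw, peakSramBw, peakPCIeBw). A small sketch using the same toy numbers as the new unit test below; makeToyDevice is an illustrative helper, the values are test placeholders rather than real hardware figures, and the glow::runtime namespace is assumed from the header's other runtime types.

#include "glow/Runtime/RuntimeTypes.h"

using namespace glow::runtime;

// Toy device: 3072 B of device memory, 100 B of SRAM, 10 ops/s peak compute,
// 0.1 B/s DRAM bandwidth, 1 B/s SRAM bandwidth, 0.05 B/s PCI-E bandwidth.
DeviceInfo makeToyDevice() {
  return DeviceInfo{/* availableMemory */ 3072,
                    /* sramCapacity */ 100,
                    /* peakCompute */ 10,
                    /* peakDramBw */ 0.1f,
                    /* peakSramBw */ 1,
                    /* peakPCIeBw */ 0.05f};
}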

lib/Partitioner/Partitioner.cpp

Lines changed: 122 additions & 4 deletions
@@ -81,6 +81,126 @@ void Partitioner::initOpMemUsage() {
   }
 }
 
+/// Get the minimal compute time for each op in the function.
+void Partitioner::initOpComputeTime() {
+  computeTime_.clear();
+
+  // This code assumes all ops are BW limited from SRAM; except
+  // if the input does not fit in SRAM -- then it is DRAM BW limited.
+  float peakDramBw = deviceInfo_[0].peakDramBw;
+  float peakSramBw = deviceInfo_[0].peakSramBw;
+  size_t sramCapacity = deviceInfo_[0].sramCapacity;
+  float peakCompute = deviceInfo_[0].peakCompute;
+
+  for (auto &node : F_->getNodes()) {
+    /// compute memory side bytes for inputs from DRAM, SRAM.
+    /// TODO: think about whether this is better off computed inside a Node.
+
+    int n = node.getNumInputs();
+    uint64_t sizeDram = 0;
+    uint64_t sizeSram = 0;
+    if (node.getKind() == Kinded::Kind::SaveNodeKind) {
+      computeTime_[&node] = 0.0f;
+      continue;
+    }
+
+    /// The memory bytes for embedding table lookups are data dependent,
+    /// so they need to be calculated as per the number of indices accessed.
+    if (node.getKind() == Kinded::Kind::SparseLengthsWeightedSumNodeKind) {
+      auto *SLWSN = llvm::dyn_cast<SparseLengthsWeightedSumNode>(&node);
+      /// compute how many entries of the embedding table we look up
+      auto numLookups = SLWSN->getIndices().getNode()->dims(0).front();
+      /// compute how many bytes we read per lookup
+      auto tableSize = SLWSN->getData().getNode()->getType(0)->getSizeInBytes();
+      auto numRows = SLWSN->getData().getNode()->dims(0).front();
+      auto sizePerLookup = tableSize / numRows;
+      /// compute total bytes read
+      uint64_t sizeInput = numLookups * sizePerLookup;
+
+      /// does the table fit in SRAM or DRAM
+      if (tableSize > sramCapacity) {
+        sizeDram += sizeInput;
+      } else {
+        sizeSram += sizeInput;
+      }
+
+      /// we also read the indices, weights and lengths arrays
+      sizeSram += SLWSN->getIndices().getNode()->getType(0)->getSizeInBytes();
+      sizeSram += SLWSN->getWeights().getNode()->getType(0)->getSizeInBytes();
+      sizeSram += SLWSN->getLengths().getNode()->getType(0)->getSizeInBytes();
+    } else {
+      /// for all other ops, iterate through all inputs and get size in bytes
+      for (int i = 0; i < n; i++) {
+        auto ty = node.getNthInput(i).getNode()->getType(0);
+        uint64_t sizeInput = ty->getSizeInBytes();
+        if (sizeInput > sramCapacity) {
+          sizeDram += sizeInput;
+        } else {
+          sizeSram += sizeInput;
+        }
+      }
+    }
+
+    // Repeat for outputs.
+    if (node.getNumResults() > 0) {
+      auto myty = node.getType(0);
+      uint64_t sizeOutput = myty->getSizeInBytes();
+      if (sizeOutput > sramCapacity) {
+        sizeDram += sizeOutput;
+      } else {
+        sizeSram += sizeOutput;
+      }
+    }
+
+    /// Calculate compute ops. Currently only computed for MatMul, Conv, FC.
+    /// TODO: think about whether this is better off computed inside a Node.
+    uint64_t totalOps = 0;
+    switch (node.getKind()) {
+    case Kinded::Kind::MatMulNodeKind: {
+      auto *MMN = llvm::dyn_cast<MatMulNode>(&node);
+      auto lhsDims = MMN->getLHS().dims();
+      auto rhsDims = MMN->getRHS().dims();
+      totalOps = 2 * lhsDims[0] * lhsDims[1] * rhsDims[1];
+      break;
+    }
+    case Kinded::Kind::FullyConnectedNodeKind: {
+      auto *FCN = llvm::dyn_cast<FullyConnectedNode>(&node);
+      auto inputDims = FCN->getInput().dims();
+      auto wtDims = FCN->getWeights().dims();
+      totalOps = 2 * inputDims[0] * inputDims[1] * wtDims[1];
+      break;
+    }
+    case Kinded::Kind::ConvolutionNodeKind: {
+      auto *CN = llvm::dyn_cast<ConvolutionNode>(&node);
+      auto resultDims = CN->getResult().dims();
+      // Get the product of batch, output height, output width, output channels.
+      totalOps = resultDims[0];
+      for (size_t i = 1, e = resultDims.size(); i < e; i++) {
+        totalOps *= resultDims[i];
+      }
+      // Multiply in kernel height, kernel width.
+      auto kernelDims = CN->getKernels();
+      totalOps *= kernelDims[0] * kernelDims[1];
+      // Multiply in input channels/groups.
+      auto inputChannels = CN->getInput().dims()[1];
+      auto nGroups = CN->getGroup();
+      totalOps *= (inputChannels * 1.0 / nGroups);
+      break;
+    }
+    default:
+      break;
+    }
+
+    /// Compute the roofline as the max of the flops, DRAM BW and SRAM BW terms.
+    /// See https://bit.ly/2UdJ3mz
+    /// Add epsilons to prevent seg faults on uninitialized peak values.
+    computeTime_[&node] =
+        std::max(totalOps * 1.0f / std::max(peakCompute, 1e-6f),
+                 std::max(sizeDram * 1.0f / std::max(peakDramBw, 1e-6f),
+                          sizeSram * 1.0f / std::max(peakSramBw, 1e-6f)));
+  }
+}
+
 // Combine the partitions if necessary : if all outside uses of the nodes in
 // partition1 is in partition2, and the sum of memory consumption of partition1
 // and partition2 is less than availableMemory, combine partition1 and
@@ -403,7 +523,6 @@ DAGNodeList &Partitioner::Partition() {
 
   // Find the representive function for running partitioning algrithm.
   F_ = selectRepFunc(module_, memSize_);
-
   size_t availMem = deviceInfo_[0].availableMemory;
 
   if (memSize_ < availMem) {
@@ -427,9 +546,8 @@ DAGNodeList &Partitioner::Partition() {
   // Prepare 1: Get the min memory usage for each op.
   initOpMemUsage();
 
-  // Prepare 2: TODO: get the minimal comunication cost for any 2 ops (i.e. the
-  // output data size) Will calculate it on the fly. -- Will double check which
-  // way is better.
+  // Prepare 2: Get the roofline memory bandwidth estimate for each op.
+  initOpComputeTime();
 
   // Partition
   // Use BFS to do the initial partitioning. Starting from the final node, BFS
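To make the roofline concrete: the estimate for a node is max(totalOps / peakCompute, sizeDram / peakDramBw, sizeSram / peakSramBw). The sketch below recomputes it standalone for the matmul of the unit test's initial FullyConnected layer ({1, 32} x {32, 16} floats), with the same device numbers the test uses; rooflineTime and main are illustrative only, not Glow code.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Standalone re-implementation of the roofline estimate above, for
// illustration only (rooflineTime is not a Glow API).
static float rooflineTime(uint64_t totalOps, uint64_t sizeDram,
                          uint64_t sizeSram, float peakCompute,
                          float peakDramBw, float peakSramBw) {
  return std::max(totalOps * 1.0f / std::max(peakCompute, 1e-6f),
                  std::max(sizeDram * 1.0f / std::max(peakDramBw, 1e-6f),
                           sizeSram * 1.0f / std::max(peakSramBw, 1e-6f)));
}

int main() {
  // MatMul of the test's initial FC: LHS {1, 32} x RHS {32, 16}, floats.
  uint64_t totalOps = 2 * 1 * 32 * 16; // 1024 ops
  // Inputs: 1x32 floats (128 B) and 32x16 floats (2048 B); both exceed the
  // 100 B SRAM capacity, so they count against DRAM. The 1x16 output (64 B)
  // fits in SRAM.
  uint64_t sizeDram = 128 + 2048; // 2176 B
  uint64_t sizeSram = 64;         // 64 B
  // Device numbers from PartitionerTest.cpp: peakCompute = 10,
  // peakDramBw = 0.1, peakSramBw = 1.
  printf("%f\n", rooflineTime(totalOps, sizeDram, sizeSram, 10, 0.1f, 1));
  // Prints 21760.000000: the DRAM term 2176 / 0.1 dominates, matching the
  // expected "fc_dot" compute time in the test below.
  return 0;
}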

tests/unittests/PartitionerTest.cpp

Lines changed: 101 additions & 0 deletions
@@ -208,3 +208,104 @@ TEST_F(PartitionerTest, Basic2) {
     EXPECT_TRUE(ref.isEqual(test));
   }
 }
+
+/// This one tests the roofline computed with compute, memory and communication
+/// costs
+TEST_F(PartitionerTest, Basic1Roofline) {
+  auto *input =
+      mod_.createPlaceholder(ElemKind::FloatTy, {1, 32}, "input", false);
+  auto *w1 = mod_.createConstant(ElemKind::FloatTy, {32, 16}, "w1");
+  auto *b1 = mod_.createConstant(ElemKind::FloatTy, {16}, "b1");
+  ctx_.allocate(input);
+  w1->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  b1->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+
+  // Initial FC.
+  Node *I = F_->createFullyConnected("initial_fc", input, w1, b1);
+  I = F_->createSigmoid("initial_sigmoid", I);
+
+  // Left branch.
+  auto *w2 = mod_.createConstant(ElemKind::FloatTy, {16, 16}, "w2");
+  auto *b2 = mod_.createConstant(ElemKind::FloatTy, {16}, "b2");
+  w2->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  b2->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  Node *L = F_->createFullyConnected("left_fc1", I, w2, b2);
+  L = F_->createSigmoid("left_sigmoid1", L);
+  auto *w3 = mod_.createConstant(ElemKind::FloatTy, {16, 8}, "w3");
+  auto *b3 = mod_.createConstant(ElemKind::FloatTy, {8}, "b3");
+  w3->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  b3->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  L = F_->createFullyConnected("left_fc2", L, w3, b3);
+  L = F_->createSigmoid("left_sigmoid2", L);
+
+  // Right branch.
+  auto *w4 = mod_.createConstant(ElemKind::FloatTy, {16, 16}, "w4");
+  auto *b4 = mod_.createConstant(ElemKind::FloatTy, {16}, "b4");
+  w4->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  b4->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  Node *R = F_->createFullyConnected("right_fc1", I, w4, b4);
+  R = F_->createSigmoid("right_sigmoid1", R);
+  auto *w5 = mod_.createConstant(ElemKind::FloatTy, {16, 8}, "w5");
+  auto *b5 = mod_.createConstant(ElemKind::FloatTy, {8}, "b5");
+  w5->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  b5->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  R = F_->createFullyConnected("right_fc2", R, w5, b5);
+  R = F_->createSigmoid("right_sigmoid2", R);
+
+  // Join branches.
+  auto *mul = F_->createMul("mul", L, R);
+  auto *save = F_->createSave("ret", mul);
+  auto &res = *ctx_.allocate(save->getPlaceholder());
+
+  // Infer using the un-partitioned graph.
+  Tensor in(ElemKind::FloatTy, {1, 32});
+  ExecutionEngine EE;
+
+  EE.compile(CompilationMode::Infer, F_);
+  updateInputPlaceholders(ctx_, {input}, {&in});
+  EE.run(ctx_);
+  Tensor ref = res.clone();
+
+  std::unordered_map<Node *, std::string> nodeNamesMap;
+  for (auto &node : F_->getNodes()) {
+    nodeNamesMap[&node] = node.getName();
+  }
+
+  std::vector<DeviceInfo> devices = {{3072, 100, 10, 0.1, 1, 0.05},
+                                     {3072, 100, 10, 0.1, 1, 0.05},
+                                     {3072, 100, 10, 0.1, 1, 0.05}};
+  Partitioner myPartitioner(&mod_, devices);
+
+  DAGNodeList myList = std::move(myPartitioner.Partition());
+
+  // Check compute costs.
+  std::unordered_map<std::string, float> expectedComputeTime{
+      {"initial_sigmoid", 128},
+      {"left_sigmoid2", 64},
+      {"fc_add_bias3", 192},
+      {"right_sigmoid1", 128},
+      {"mul", 96},
+      {"fc_add_bias2", 96},
+      {"ret", 0},
+      {"fc_dot", 21760},
+      {"left_sigmoid1", 128},
+      {"fc_add_bias", 192},
+      {"fc_dot1", 10240},
+      {"right_sigmoid2", 64},
+      {"fc_add_bias1", 192},
+      {"fc_dot2", 5120},
+      {"fc_dot3", 10240},
+      {"fc_dot4", 5120},
+      {"fc_add_bias4", 96},
+  };
+  ASSERT_EQ(myPartitioner.getComputeTime().size(), expectedComputeTime.size());
+  for (auto &el : myPartitioner.getComputeTime()) {
+    Node *n = el.first;
+    float expected = expectedComputeTime[nodeNamesMap[n].c_str()];
+    float res = el.second;
+    ASSERT_EQ(expected, res);
+  }
+
+  ASSERT_EQ(mod_.getFunctions().size(), 3);
+  ASSERT_EQ(myList.roots.size(), 1);
+}
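For reference, each expected value above follows directly from the roofline formula and the toy device parameters (sramCapacity = 100 B, peakCompute = 10 ops/s, peakDramBw = 0.1 B/s, peakSramBw = 1 B/s), assuming each FullyConnected node is lowered into an fc_dot matmul plus an fc_add_bias node, as the expected names suggest. For example, "initial_sigmoid" reads and writes one 1x16 float tensor each (64 B in, 64 B out, both fitting in SRAM), giving 128 / 1 = 128; "fc_dot" for the initial FC reads a 1x32 input (128 B) and a 32x16 weight (2048 B), both larger than SRAM, so the DRAM term 2176 / 0.1 = 21760 dominates its 1024-op compute term; and the SaveNode "ret" is assigned a cost of 0.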
