diff --git a/include/glow/Partitioner/Partitioner.h b/include/glow/Partitioner/Partitioner.h
index f16d19698d..c33fb41238 100644
--- a/include/glow/Partitioner/Partitioner.h
+++ b/include/glow/Partitioner/Partitioner.h
@@ -25,6 +25,7 @@ namespace glow {
 using namespace runtime;
 using MemUsageMapTy = std::unordered_map<Node *, unsigned>;
+using ComputeTimeMapTy = std::unordered_map<Node *, float>;
 using NodesSetTy = std::set<Node *>;
 using PartitionCostMapTy = llvm::DenseMap<Function *, GraphMemInfo>;
@@ -97,6 +98,9 @@ class Partitioner {
   /// The map of each operator and the corresponding memory size.
   MemUsageMapTy memUsage_;
 
+  /// The map of each operator and the compute runtime.
+  ComputeTimeMapTy computeTime_;
+
   /// Get the representative function (the one with the largest input) and
   /// update the memSize.
   static Function *selectRepFunc(Module *parent, size_t &memSize);
@@ -105,6 +109,9 @@ class Partitioner {
   /// function.
   void initOpMemUsage();
 
+  /// Initialize the minimal compute time for each op in the function.
+  void initOpComputeTime();
+
   /// Combine the partitions if necessary : if all outside uses of the nodes in
   /// partition1 is in partition2, and the sum of memory consumption of
   /// partition1 and partition2 is less than availableMemory, combine partition1
@@ -140,6 +147,9 @@ class Partitioner {
   /// Decompose each function in a module and return a list of DAGNodes.
   DAGNodeList &Partition();
+
+  /// Get function for computeTime_
+  ComputeTimeMapTy getComputeTime() const { return computeTime_; }
 };
 } // namespace glow
 #endif // GLOW_PARTITIONER_PARTITIONER_H
NOTE(reconstruction): in the two hunks below, the original patch text between
"std::function" and "getNodes()" was lost when angle-bracketed spans were
stripped from this document. The DeviceInfo field additions and the prologue of
Partitioner::initOpComputeTime() are reconstructed from the surrounding code and
the test initializers ({availableMemory, sramCapacity, peakCompute, peakDramBw,
peakSramBw, peakPCIeBw} = {3072, 100, 10, 0.1, 1, 0.05}) -- verify against the
upstream commit before applying.
diff --git a/include/glow/Runtime/RuntimeTypes.h b/include/glow/Runtime/RuntimeTypes.h
index 380477c48b..b4766007a3 100644
--- a/include/glow/Runtime/RuntimeTypes.h
+++ b/include/glow/Runtime/RuntimeTypes.h
@@ -48,6 +48,17 @@ using ResultCBTy = std::function<void(RunIdentifierTy, ResultCode,
 
 /// Struct containing information for each device.
 struct DeviceInfo {
   /// Available memory on device in bytes.
   size_t availableMemory;
+  /// Available SRAM capacity in bytes.
+  uint64_t sramCapacity;
+  /// Peak compute on device in ops/second. Assumes all ops are in int8.
+  /// TODO: distinguish between data types with different peak flops.
+  float peakCompute;
+  /// Peak memory bandwidth from DRAM on device in bytes/second.
+  float peakDramBw;
+  /// Peak memory bandwidth from SRAM on device in bytes/second.
+  float peakSramBw;
+  /// Peak ingress/egress PCI-E bandwidth from host to device in bytes/second.
+  float peakPCIeBw;
 };
diff --git a/lib/Partitioner/Partitioner.cpp b/lib/Partitioner/Partitioner.cpp
--- a/lib/Partitioner/Partitioner.cpp
+++ b/lib/Partitioner/Partitioner.cpp
@@ -127,6 +127,126 @@ void Partitioner::initOpMemUsage() {
   }
 }
 
+/// Get the minimal compute time for each op in the function.
+void Partitioner::initOpComputeTime() {
+  // Use the first device's roofline parameters for the initial estimate.
+  uint64_t sramCapacity = deviceInfo_[0].sramCapacity;
+  float peakCompute = deviceInfo_[0].peakCompute;
+  float peakDramBw = deviceInfo_[0].peakDramBw;
+  float peakSramBw = deviceInfo_[0].peakSramBw;
+
+  for (auto &node : F_->getNodes()) {
+    /// compute memory side bytes for inputs from DRAM, SRAM.
+    /// TODO: think about whether this is better off computed inside a Node.
+
+    int n = node.getNumInputs();
+    uint64_t sizeDram = 0;
+    uint64_t sizeSram = 0;
+    if (node.getKind() == Kinded::Kind::SaveNodeKind) {
+      computeTime_[&node] = 0.0f;
+      continue;
+    }
+
+    /// The memory bytes for embedding table lookups is data dependent,
+    /// so it needs to be calculated as per the number of indices accessed.
+    if (node.getKind() == Kinded::Kind::SparseLengthsWeightedSumNodeKind) {
+      auto *SLWSN = llvm::dyn_cast<SparseLengthsWeightedSumNode>(&node);
+      /// compute how many entries of the embedding table we look up
+      auto numLookups = SLWSN->getIndices().getNode()->dims(0).front();
+      /// compute how many bytes we read per lookup
+      auto tableSize = SLWSN->getData().getNode()->getType(0)->getSizeInBytes();
+      auto numRows = SLWSN->getData().getNode()->dims(0).front();
+      auto sizePerLookup = tableSize / numRows;
+      /// compute total bytes read
+      uint64_t sizeInput = numLookups * sizePerLookup;
+
+      /// does the table fit in SRAM or DRAM
+      if (tableSize > sramCapacity) {
+        sizeDram += sizeInput;
+      } else {
+        sizeSram += sizeInput;
+      }
+
+      /// we also read the indices, weights and lengths arrays
+      sizeSram += SLWSN->getIndices().getNode()->getType(0)->getSizeInBytes();
+      sizeSram += SLWSN->getWeights().getNode()->getType(0)->getSizeInBytes();
+      sizeSram += SLWSN->getLengths().getNode()->getType(0)->getSizeInBytes();
+    } else {
+      /// for all other ops, iterate through all inputs and get size in bytes
+      for (int i = 0; i < n; i++) {
+        auto ty = node.getNthInput(i).getNode()->getType(0);
+        uint64_t sizeInput = ty->getSizeInBytes();
+        if (sizeInput > sramCapacity) {
+          sizeDram += sizeInput;
+        } else {
+          sizeSram += sizeInput;
+        }
+      }
+    }
+
+    // Repeat for outputs
+    if (node.getNumResults() > 0) {
+      auto myty = node.getType(0);
+      uint64_t sizeOutput = myty->getSizeInBytes();
+      if (sizeOutput > sramCapacity) {
+        sizeDram += sizeOutput;
+      } else {
+        sizeSram += sizeOutput;
+      }
+    }
+
+    /// Calculate compute ops.
+    /// Currently only computed for Matmul, Conv, FC.
+    /// TODO: think about whether this is better off computed inside a Node.
+    uint64_t totalOps = 0;
+    switch (node.getKind()) {
+    case Kinded::Kind::MatMulNodeKind: {
+      auto *MMN = llvm::dyn_cast<MatMulNode>(&node);
+      auto lhsDims = MMN->getLHS().dims();
+      auto rhsDims = MMN->getRHS().dims();
+      totalOps = 2 * lhsDims[0] * lhsDims[1] * rhsDims[1];
+      break;
+    }
+    case Kinded::Kind::FullyConnectedNodeKind: {
+      auto *FCN = llvm::dyn_cast<FullyConnectedNode>(&node);
+      auto inputDims = FCN->getInput().dims();
+      auto wtDims = FCN->getWeights().dims();
+      totalOps = 2 * inputDims[0] * inputDims[1] * wtDims[1];
+      break;
+    }
+    case Kinded::Kind::ConvolutionNodeKind: {
+      auto *CN = llvm::dyn_cast<ConvolutionNode>(&node);
+      auto resultDims = CN->getResult().dims();
+      // Get the product of batch, output height, output dims, output channels
+      totalOps = resultDims[0];
+      for (size_t i = 1, e = resultDims.size(); i < e; i++) {
+        totalOps *= resultDims[i];
+      }
+      // Multiply in kernel height, kernel width
+      auto kernelDims = CN->getKernels();
+      totalOps *= kernelDims[0] * kernelDims[1];
+      // Multiply in input channels/groups
+      auto inputChannels = CN->getInput().dims()[1];
+      auto nGroups = CN->getGroup();
+      totalOps *= (inputChannels * 1.0 / nGroups);
+      break;
+    }
+    default:
+      break;
+    }
+
+    /// Compute roofline as max of flops, DRAM, SRAM BW
+    /// See https://bit.ly/2UdJ3mz
+    /// Add epsilons to prevent seg faults on uninitialized peak values
+    computeTime_[&node] =
+        std::max(totalOps * 1.0f / std::max(peakCompute, 1e-6f),
+                 std::max(sizeDram * 1.0f / std::max(peakDramBw, 1e-6f),
+                          sizeSram * 1.0f / std::max(peakSramBw, 1e-6f)));
+  }
+}
+
 // Combine the partitions if necessary : if all outside uses of the nodes in
 // partition1 is in partition2, and the sum of memory consumption of partition1
 // and partition2 is less than availableMemory, combine partition1 and
@@ -403,7 +523,6 @@ DAGNodeList &Partitioner::Partition() {
   // Find the representive function for running partitioning algrithm.
   F_ = selectRepFunc(module_, memSize_);
-
   size_t availMem = deviceInfo_[0].availableMemory;
 
   if (memSize_ < availMem) {
@@ -427,9 +546,8 @@ DAGNodeList &Partitioner::Partition() {
   // Prepare 1: Get the min memory usage for each op.
   initOpMemUsage();
 
-  // Prepare 2: TODO: get the minimal comunication cost for any 2 ops (i.e. the
-  // output data size) Will calculate it on the fly. -- Will double check which
-  // way is better.
+  // Prepare 2: Get the roofline memory bandwidth estimate for each op.
+  initOpComputeTime();
 
   // Partition
   // Use BFS to do the initial partitioning. Starting from the final node, BFS
diff --git a/tests/unittests/PartitionerTest.cpp b/tests/unittests/PartitionerTest.cpp
index 1dc8b983c8..b04500e848 100644
--- a/tests/unittests/PartitionerTest.cpp
+++ b/tests/unittests/PartitionerTest.cpp
@@ -208,3 +208,104 @@ TEST_F(PartitionerTest, Basic2) {
     EXPECT_TRUE(ref.isEqual(test));
   }
 }
+
+/// This one tests the roofline computed with compute, memory and communication
+/// costs
+TEST_F(PartitionerTest, Basic1Roofline) {
+  auto *input =
+      mod_.createPlaceholder(ElemKind::FloatTy, {1, 32}, "input", false);
+  auto *w1 = mod_.createConstant(ElemKind::FloatTy, {32, 16}, "w1");
+  auto *b1 = mod_.createConstant(ElemKind::FloatTy, {16}, "b1");
+  ctx_.allocate(input);
+  w1->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  b1->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+
+  // Initial FC.
+  Node *I = F_->createFullyConnected("initial_fc", input, w1, b1);
+  I = F_->createSigmoid("initial_sigmoid", I);
+
+  // Left branch.
+  auto *w2 = mod_.createConstant(ElemKind::FloatTy, {16, 16}, "w2");
+  auto *b2 = mod_.createConstant(ElemKind::FloatTy, {16}, "b2");
+  w2->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  b2->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  Node *L = F_->createFullyConnected("left_fc1", I, w2, b2);
+  L = F_->createSigmoid("left_sigmoid1", L);
+  auto *w3 = mod_.createConstant(ElemKind::FloatTy, {16, 8}, "w3");
+  auto *b3 = mod_.createConstant(ElemKind::FloatTy, {8}, "b3");
+  w3->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  b3->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  L = F_->createFullyConnected("left_fc2", L, w3, b3);
+  L = F_->createSigmoid("left_sigmoid2", L);
+
+  // Right branch.
+  auto *w4 = mod_.createConstant(ElemKind::FloatTy, {16, 16}, "w4");
+  auto *b4 = mod_.createConstant(ElemKind::FloatTy, {16}, "b4");
+  w4->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  b4->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  Node *R = F_->createFullyConnected("right_fc1", I, w4, b4);
+  R = F_->createSigmoid("right_sigmoid1", R);
+  auto *w5 = mod_.createConstant(ElemKind::FloatTy, {16, 8}, "w5");
+  auto *b5 = mod_.createConstant(ElemKind::FloatTy, {8}, "b5");
+  w5->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  b5->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
+  R = F_->createFullyConnected("right_fc2", R, w5, b5);
+  R = F_->createSigmoid("right_sigmoid2", R);
+
+  // Join branches.
+  auto *mul = F_->createMul("mul", L, R);
+  auto *save = F_->createSave("ret", mul);
+  auto &res = *ctx_.allocate(save->getPlaceholder());
+
+  // Infer using the un-partitioned graph.
+  Tensor in(ElemKind::FloatTy, {1, 32});
+  ExecutionEngine EE;
+
+  EE.compile(CompilationMode::Infer, F_);
+  updateInputPlaceholders(ctx_, {input}, {&in});
+  EE.run(ctx_);
+  Tensor ref = res.clone();
+
+  std::unordered_map<Node *, std::string> nodeNamesMap;
+  for (auto &node : F_->getNodes()) {
+    nodeNamesMap[&node] = node.getName();
+  }
+
+  std::vector<DeviceInfo> devices = {{3072, 100, 10, 0.1, 1, 0.05},
+                                     {3072, 100, 10, 0.1, 1, 0.05},
+                                     {3072, 100, 10, 0.1, 1, 0.05}};
+  Partitioner myPartitioner(&mod_, devices);
+
+  DAGNodeList myList = std::move(myPartitioner.Partition());
+
+  // check compute costs
+  std::unordered_map<std::string, float> expectedComputeTime{
+      {"initial_sigmoid", 128},
+      {"left_sigmoid2", 64},
+      {"fc_add_bias3", 192},
+      {"right_sigmoid1", 128},
+      {"mul", 96},
+      {"fc_add_bias2", 96},
+      {"ret", 0},
+      {"fc_dot", 21760},
+      {"left_sigmoid1", 128},
+      {"fc_add_bias", 192},
+      {"fc_dot1", 10240},
+      {"right_sigmoid2", 64},
+      {"fc_add_bias1", 192},
+      {"fc_dot2", 5120},
+      {"fc_dot3", 10240},
+      {"fc_dot4", 5120},
+      {"fc_add_bias4", 96},
+  };
+  ASSERT_EQ(myPartitioner.getComputeTime().size(), expectedComputeTime.size());
+  for (auto &el : myPartitioner.getComputeTime()) {
+    Node *n = el.first;
+    float expected = expectedComputeTime[nodeNamesMap[n].c_str()];
+    float res = el.second;
+    ASSERT_EQ(expected, res);
+  }
+
+  ASSERT_EQ(mod_.getFunctions().size(), 3);
+  ASSERT_EQ(myList.roots.size(), 1);
+}