diff --git a/include/glow/Partitioner/Partitioner.h b/include/glow/Partitioner/Partitioner.h
index 6f4fe9ce9a..83b80f5cd5 100644
--- a/include/glow/Partitioner/Partitioner.h
+++ b/include/glow/Partitioner/Partitioner.h
@@ -16,7 +16,7 @@
 #ifndef GLOW_PARTITIONER_PARTITIONER_H
 #define GLOW_PARTITIONER_PARTITIONER_H
 
-#include "glow/Partitioner/PartitionerUtils.h"
+#include "glow/Partitioner/PartitionerTypes.h"
 #include "glow/Support/Error.h"
 
 namespace glow {
@@ -26,9 +26,6 @@ using namespace runtime;
 
 /// Given a module, partitions each of the its functions into multiple ones
 /// based on memory constraints and minimizes the communication cost.
 class Partitioner {
-  using MemUsageMap = std::unordered_map<Node *, uint64_t>;
-  using ComputeTimeMap = std::unordered_map<Node *, float>;
-
   /// The module that needs to be decomposed.
   Module *module_;
@@ -59,12 +56,6 @@ class Partitioner {
   /// Total memory (bytes) requested by one module.
   uint64_t memSize_;
 
-  /// The map of each operator and the corresponding memory size.
-  MemUsageMap memUsage_;
-
-  /// The map of each operator and the compute runtime.
-  ComputeTimeMap computeTime_;
-
   /// Flag to set if the Partitioner should attempt to saturate the host, and
   /// use all available devices.
   bool saturateHost_;
@@ -81,12 +72,6 @@ class Partitioner {
   /// update the memSize.
   static Function *selectRepFunc(Module *parent, uint64_t &memSize);
 
-  /// Get the minimal memory requirement for each op in the function \p F
-  void initOpMemUsage(Function *F);
-
-  /// Inititalize the minimal compute time for each op in the function \p F.
-  void initOpComputeTime(Function *F);
-
   /// After getting the initial partitions, adjust the partitions to minimize
   /// communication and computation cost.
   void partitionsAdjust(NodeToFunctionMap &partitions,
@@ -180,20 +165,6 @@ class Partitioner {
   /// a function family and they have the same partition, we only dump the one
   /// function's partition.
   void dumpDAG(llvm::StringRef dotFilename) const;
-
-  /// Get function for computeTime_
-  float getComputeTime(Node *N) const {
-    auto it = computeTime_.find(N);
-    assert(it != computeTime_.end());
-    return it == computeTime_.end() ? 0.0 : it->second;
-  }
-
-  /// Get function for memUsage_
-  uint64_t getMemUsage(Node *N) const {
-    auto it = memUsage_.find(N);
-    assert(it != memUsage_.end());
-    return it == memUsage_.end() ? 0 : it->second;
-  }
 };
 } // namespace glow
 #endif // GLOW_PARTITIONER_PARTITIONER_H
diff --git a/include/glow/Partitioner/PartitionerTypes.h b/include/glow/Partitioner/PartitionerTypes.h
index 5006b573df..de517e80c4 100644
--- a/include/glow/Partitioner/PartitionerTypes.h
+++ b/include/glow/Partitioner/PartitionerTypes.h
@@ -65,6 +65,17 @@ struct BackendInfo {
   size_t num = 0;
   /// The memory constraints for this backend.
   uint64_t memSize;
+  /// The sramCapacity, peakCompute, peakDramBw, peakSramBw, and peakPCIeBw
+  /// fields below come from DeviceInfo_. Available SRAM capacity in bytes.
+  uint64_t sramCapacity;
+  /// Peak compute on device in ops/second. Assumes all ops are in int8.
+  float peakCompute;
+  /// Peak memory bandwidth from DRAM on device in bytes/second.
+  float peakDramBw;
+  /// Peak memory bandwidth from SRAM on device in bytes/second.
+  float peakSramBw;
+  /// Peak ingress/egress PCI-E bandwidth from device in bytes/second.
+  float peakPCIeBw;
   /// Backend pointer.
   Backend *backend = nullptr;
   /// The non-supported nodes kind.
diff --git a/include/glow/Partitioner/PartitionerUtils.h b/include/glow/Partitioner/PartitionerUtils.h
index 3a466333df..7bc265075f 100644
--- a/include/glow/Partitioner/PartitionerUtils.h
+++ b/include/glow/Partitioner/PartitionerUtils.h
@@ -36,9 +36,15 @@ std::vector<Node *> getOutUsersWithOnePredecessor(const NodesSet &nodes);
 /// in the set \p nodes.
 uint64_t getOutMemPerNode(const NodesSet &nodes, const Node *node);
 
-/// Given a node, \return the NodeSet of inputs of this node.
+/// Given a node, \returns the NodeSet of inputs of this node.
 NodesSet getInputs(const Node *node);
 
+/// \returns the estimated compute time of \p node based on \p backendInfo.
+float getNodeComputeTime(const Node *node, const BackendInfo &backendInfo);
+
+/// Given a node, \returns the memory usage of its Storage inputs.
+uint64_t getNodeMemUsage(const Node *node);
+
 /// Given nodes set \p currNodes and its memory usage info \p info, \returns the
 /// new memory usage if \p newNode is added into \p currNodes.
 GraphMemInfo updateGraphMemInfoByAddingNode(const NodesSet &currNodes,
diff --git a/lib/Partitioner/Partitioner.cpp b/lib/Partitioner/Partitioner.cpp
index 50a157f8af..23d6af2b7f 100644
--- a/lib/Partitioner/Partitioner.cpp
+++ b/lib/Partitioner/Partitioner.cpp
@@ -17,6 +17,7 @@
 #include "glow/Partitioner/Partitioner.h"
 #include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"
 #include "glow/Partitioner/PartitionerOptimizer.h"
+#include "glow/Partitioner/PartitionerUtils.h"
 #include "glow/Partitioner/PartitionerValidation.h"
 #include "glow/Support/Support.h"
 
@@ -174,227 +175,6 @@ Function *Partitioner::selectRepFunc(Module *parent, uint64_t &memSize) {
   return ret;
 }
 
-/// Get the minimal memory requirement (constant) for each op in the function.
-void Partitioner::initOpMemUsage(Function *F) {
-  memUsage_.clear();
-  for (auto &node : F->getNodes()) {
-    int n = node.getNumInputs();
-    uint64_t size = 0;
-    if (node.getKind() == Kinded::Kind::SaveNodeKind) {
-      memUsage_[&node] = size;
-      continue;
-    }
-    for (int i = 0; i < n; i++) {
-      Storage *in = llvm::dyn_cast<Storage>(node.getNthInput(i).getNode());
-      if (in) {
-        auto ty = in->getType();
-        size += ty->getSizeInBytes();
-      }
-    }
-    memUsage_[&node] = size;
-  }
-}
-
-/// Get the minimal compute time for each op in the function.
-void Partitioner::initOpComputeTime(Function *F) {
-  computeTime_.clear();
-
-  // This code assumes all ops are BW limited from SRAM; except
-  // if the input does not fit in SRAM -- then it is DRAM BW limited
-  float peakDramBw = deviceInfo_[0].peakDramBw;
-  float peakSramBw = deviceInfo_[0].peakSramBw;
-  uint64_t sramCapacity = deviceInfo_[0].sramCapacity;
-  float peakCompute = deviceInfo_[0].peakCompute;
-
-  for (auto &node : F->getNodes()) {
-    // compute memory side bytes for inputs from DRAM, SRAM.
-    // TODO: think about whether this is better off computed inside a Node.
-
-    int n = node.getNumInputs();
-    uint64_t sizeDram = 0;
-    uint64_t sizeSram = 0;
-    if (node.getKind() == Kinded::Kind::SaveNodeKind) {
-      computeTime_[&node] = 0.0f;
-      continue;
-    }
-
-    // The memory bytes for embedding table lookups is data dependent,
-    // so it needs to be calculated as per the number of indices accessed.
-    if (node.getKind() == Kinded::Kind::SparseLengthsWeightedSumNodeKind) {
-      auto *SLWSN = llvm::dyn_cast<SparseLengthsWeightedSumNode>(&node);
-      // compute how many entries of the embedding table we look up
-      auto numLookups = SLWSN->getIndices().dims().front();
-      // compute how many bytes we read per lookup
-      auto tableSize = SLWSN->getData().getType()->getSizeInBytes();
-      auto numRows = SLWSN->getData().dims().front();
-      auto sizePerLookup = tableSize / numRows;
-      // compute total bytes read
-      uint64_t sizeInput = numLookups * sizePerLookup;
-
-      // tables are usually large and fit in DRAM
-      sizeDram += sizeInput;
-      // we also read the indices, weights and lengths arrays
-      sizeSram += SLWSN->getIndices().getType()->getSizeInBytes();
-      sizeSram += SLWSN->getWeights().getType()->getSizeInBytes();
-      sizeSram += SLWSN->getLengths().getType()->getSizeInBytes();
-    } else if (node.getKind() == Kinded::Kind::SparseLengthsSumNodeKind) {
-      auto *SLSN = llvm::dyn_cast<SparseLengthsSumNode>(&node);
-      // compute how many entries of the embedding table we look up
-      auto numLookups = SLSN->getIndices().dims().front();
-      // compute how many bytes we read per lookup
-      auto tableSize = SLSN->getData().getType()->getSizeInBytes();
-      auto numRows = SLSN->getData().dims().front();
-      auto sizePerLookup = tableSize / numRows;
-      // compute total bytes read
-      uint64_t sizeInput = numLookups * sizePerLookup;
-
-      // tables are usually large and fit in DRAM
-      sizeDram += sizeInput;
-      // we also read the indices and lengths arrays
-      sizeSram += SLSN->getIndices().getType()->getSizeInBytes();
-      sizeSram += SLSN->getLengths().getType()->getSizeInBytes();
-    } else if (node.getKind() ==
-               Kinded::Kind::
-                   FusedRowwiseQuantizedSparseLengthsWeightedSumNodeKind) {
-      auto *FRQSLWSN =
-          llvm::dyn_cast<FusedRowwiseQuantizedSparseLengthsWeightedSumNode>(
-              &node);
-      // compute how many entries of the embedding table we look up
-      auto numLookups = FRQSLWSN->getIndices().dims().front();
-      // compute how many bytes we read per lookup
-      auto tableSize = FRQSLWSN->getData().getType()->getSizeInBytes();
-      auto numRows = FRQSLWSN->getData().dims().front();
-      auto sizePerLookup = tableSize / numRows;
-      // compute total bytes read
-      uint64_t sizeInput = numLookups * sizePerLookup;
-
-      // tables are usually large and fit in DRAM
-      sizeDram += sizeInput;
-
-      // we also read the indices, weights and lengths arrays
-      sizeSram += FRQSLWSN->getIndices().getType()->getSizeInBytes();
-      sizeSram += FRQSLWSN->getWeights().getType()->getSizeInBytes();
-      sizeSram += FRQSLWSN->getLengths().getType()->getSizeInBytes();
-    } else if (node.getKind() ==
-               Kinded::Kind::FusedRowwiseQuantizedSparseLengthsSumNodeKind) {
-      auto *FRQSLSN =
-          llvm::dyn_cast<FusedRowwiseQuantizedSparseLengthsSumNode>(&node);
-      // compute how many entries of the embedding table we look up
-      auto numLookups = FRQSLSN->getIndices().dims().front();
-      // compute how many bytes we read per lookup
-      auto tableSize = FRQSLSN->getData().getType()->getSizeInBytes();
-      auto numRows = FRQSLSN->getData().dims().front();
-      auto sizePerLookup = tableSize / numRows;
-      // compute total bytes read
-      uint64_t sizeInput = numLookups * sizePerLookup;
-
-      // tables are usually large and fit in DRAM
-      sizeDram += sizeInput;
-
-      // we also read the indices and lengths arrays
-      sizeSram += FRQSLSN->getIndices().getType()->getSizeInBytes();
-      sizeSram += FRQSLSN->getLengths().getType()->getSizeInBytes();
-    } else {
-      // for all other ops, iterate through all inputs and get size in bytes
-      for (int i = 0; i < n; i++) {
-        auto ty = node.getNthInput(i).getType();
-        uint64_t sizeInput = ty->getSizeInBytes();
-        if (sizeInput > sramCapacity) {
-          sizeDram += sizeInput;
-        } else {
-          sizeSram += sizeInput;
-        }
-      }
-    }
-
-    // Repeat for outputs
-    for (size_t i = 0, e = node.getNumResults(); i < e; i++) {
-      auto myty = node.getType(i);
-      uint64_t sizeOutput = myty->getSizeInBytes();
-      if (sizeOutput > sramCapacity) {
-        sizeDram += sizeOutput;
-      } else {
-        sizeSram += sizeOutput;
-      }
-    }
-
-    // Calculate compute ops. Currently only computed for Matmul, Conv, FC
-    // TODO: think about whether this is better off computed inside a Node.
-    uint64_t totalOps = 0;
-    switch (node.getKind()) {
-    case Kinded::Kind::MatMulNodeKind: {
-      auto *MMN = llvm::dyn_cast<MatMulNode>(&node);
-      auto lhsDims = MMN->getLHS().dims();
-      auto rhsDims = MMN->getRHS().dims();
-      totalOps = 2 * lhsDims[0] * lhsDims[1] * rhsDims[1];
-      break;
-    }
-    case Kinded::Kind::FullyConnectedNodeKind: {
-      auto *FCN = llvm::dyn_cast<FullyConnectedNode>(&node);
-      auto inputDims = FCN->getInput().dims();
-      auto wtDims = FCN->getWeights().dims();
-      totalOps = 2 * inputDims[0] * inputDims[1] * wtDims[0];
-      break;
-    }
-#ifdef GLOW_WITH_HABANA
-    case Kinded::Kind::HabanaFullyConnectedNodeKind: {
-      auto *FCN = llvm::dyn_cast<HabanaFullyConnectedNode>(&node);
-      auto inputDims = FCN->getInput().dims();
-      auto wtDims = FCN->getWeights().dims();
-      totalOps = 2 * inputDims[0] * inputDims[1] * wtDims[0];
-      break;
-    }
-#endif
-    case Kinded::Kind::ConvolutionNodeKind: {
-      auto *CN = llvm::dyn_cast<ConvolutionNode>(&node);
-      auto resultDims = CN->getResult().dims();
-      // Get the product of batch, output height, output dims, output channels
-      totalOps = resultDims[0];
-      for (size_t i = 1, e = resultDims.size(); i < e; i++) {
-        totalOps *= resultDims[i];
-      }
-      // Multiply in kernel height, kernel width
-      auto kernelDims = CN->getKernels();
-      totalOps *= kernelDims[0] * kernelDims[1];
-      // Multiply in input channels/groups
-      auto inputChannels = CN->getInput().dims()[1];
-      auto nGroups = CN->getGroup();
-      totalOps *= (inputChannels * 1.0 / nGroups);
-      break;
-    }
-#ifdef GLOW_WITH_HABANA
-    case Kinded::Kind::HabanaConvolutionNodeKind: {
-      auto *CN = llvm::dyn_cast<HabanaConvolutionNode>(&node);
-      auto resultDims = CN->getResult().dims();
-      // Get the product of batch, output height, output dims, output channels
-      totalOps = resultDims[0];
-      for (size_t i = 1, e = resultDims.size(); i < e; i++) {
-        totalOps *= resultDims[i];
-      }
-      // Multiply in kernel height, kernel width
-      auto kernelDims = CN->getKernels();
-      totalOps *= kernelDims[0] * kernelDims[1];
-      // Multiply in input channels/groups
-      auto inputChannels = CN->getInput().dims()[1];
-      auto nGroups = CN->getGroup();
-      totalOps *= (inputChannels * 1.0 / nGroups);
-      break;
-    }
-#endif
-    default:
-      break;
-    }
-
-    // Compute compute roofline as max of flops, DRAM, SRAM BW
-    // See https://bit.ly/2UdJ3mz
-    // Add epsilons to prevent seg faults on uninitialized peak values.
-    computeTime_[&node] =
-        std::max(totalOps * 1.0f / std::max(peakCompute, 1e-6f),
-                 std::max(sizeDram * 1.0f / std::max(peakDramBw, 1e-6f),
-                          sizeSram * 1.0f / std::max(peakSramBw, 1e-6f)));
-  }
-}
-
 void Partitioner::partitionsAdjust(NodeToFunctionMap &partitions,
                                    uint64_t availableMemory) {
   // For each partition, create a node set.
@@ -743,6 +523,10 @@ void Partitioner::getBackendMap(
     // is the same.
    // TODO : will improve the algorithm for different memory size.
    backendInfo.memSize = deviceInfo_[i].availableMemory;
+    backendInfo.peakDramBw = deviceInfo_[i].peakDramBw;
+    backendInfo.peakSramBw = deviceInfo_[i].peakSramBw;
+    backendInfo.sramCapacity = deviceInfo_[i].sramCapacity;
+    backendInfo.peakCompute = deviceInfo_[i].peakCompute;
    backendInfo.nonSupportedNodesKinds =
        generateNodeKindsSet(deviceInfo_[i].nonSupportedNodes);
    backendInfo.supportedNodesKinds =
@@ -827,7 +611,8 @@ llvm::Error Partitioner::loadBalancedPartitioning(Function *F,
   // Compute total roofline time
   float totalRooflineTime = 0;
   for (auto &n : F->getNodes()) {
-    totalRooflineTime += getComputeTime(&n);
+    totalRooflineTime +=
+        getNodeComputeTime(&n, backendMap_[deviceInfo_[0].backendName]);
   }
 
   float timePerPartition = totalRooflineTime / numDevices;
@@ -869,8 +654,9 @@ llvm::Error Partitioner::loadBalancedPartitioning(Function *F,
      }
    }
 
-    auto curOpTime = getComputeTime(N);
-    auto curOpMemory = getMemUsage(N);
+    auto curOpTime =
+        getNodeComputeTime(N, backendMap_[deviceInfo_[0].backendName]);
+    auto curOpMemory = getNodeMemUsage(N);
 
    // Find a partition to put this node into
    int curPartition = maxLogicalDeviceId;
@@ -1001,12 +787,7 @@ llvm::Error Partitioner::Partition(CompilationContext &cctx) {
      RETURN_IF_ERR(::glow::optimizeFunction(func, *backend, cctx));
    }
 
-    // Step 2.2 : get the min memory usage and the roofline memory bandwidth
-    // estimate for each op.
-    initOpMemUsage(func);
-    initOpComputeTime(func);
-
-    // Step 2.3 : apply graph partitioning algrithm to find out the partition.
+    // Step 2.2 : apply graph partitioning algorithm to find out the partition.
    NodeToFunctionMap partitionMap =
        selectPartitions(func, availMem, i->second);
    mapping.insert(partitionMap);
diff --git a/lib/Partitioner/PartitionerUtils.cpp b/lib/Partitioner/PartitionerUtils.cpp
index cf002a87af..277944488f 100644
--- a/lib/Partitioner/PartitionerUtils.cpp
+++ b/lib/Partitioner/PartitionerUtils.cpp
@@ -171,6 +171,212 @@ NodesSet getInputs(const Node *node) {
   return result;
 }
 
+uint64_t getNodeMemUsage(const Node *node) {
+  if (node->getKind() == Kinded::Kind::SaveNodeKind) {
+    return 0;
+  }
+  uint64_t size = 0;
+  for (size_t i = 0, e = node->getNumInputs(); i < e; i++) {
+    Storage *in = llvm::dyn_cast<Storage>(node->getNthInput(i).getNode());
+    if (in) {
+      auto ty = in->getType();
+      size += ty->getSizeInBytes();
+    }
+  }
+  return size;
+}
+
+float getNodeComputeTime(const Node *node, const BackendInfo &backendInfo) {
+  // This code assumes all ops are BW limited from SRAM; except
+  // if the input does not fit in SRAM -- then it is DRAM BW limited
+  float peakDramBw = backendInfo.peakDramBw;
+  float peakSramBw = backendInfo.peakSramBw;
+  uint64_t sramCapacity = backendInfo.sramCapacity;
+  float peakCompute = backendInfo.peakCompute;
+
+  // compute memory side bytes for inputs from DRAM, SRAM.
+  // TODO: think about whether this is better off computed inside a Node.
+
+  int n = node->getNumInputs();
+  uint64_t sizeDram = 0;
+  uint64_t sizeSram = 0;
+  if (node->getKind() == Kinded::Kind::SaveNodeKind) {
+    return 0.0f;
+  }
+  // The memory bytes for embedding table lookups is data dependent,
+  // so it needs to be calculated as per the number of indices accessed.
+  if (node->getKind() == Kinded::Kind::SparseLengthsWeightedSumNodeKind) {
+    auto *SLWSN = llvm::dyn_cast<SparseLengthsWeightedSumNode>(node);
+    // compute how many entries of the embedding table we look up
+    auto numLookups = SLWSN->getIndices().dims().front();
+    // compute how many bytes we read per lookup
+    auto tableSize = SLWSN->getData().getType()->getSizeInBytes();
+    auto numRows = SLWSN->getData().dims().front();
+    auto sizePerLookup = tableSize / numRows;
+    // compute total bytes read
+    uint64_t sizeInput = numLookups * sizePerLookup;
+
+    // tables are usually large and fit in DRAM
+    sizeDram += sizeInput;
+    // we also read the indices, weights and lengths arrays
+    sizeSram += SLWSN->getIndices().getType()->getSizeInBytes();
+    sizeSram += SLWSN->getWeights().getType()->getSizeInBytes();
+    sizeSram += SLWSN->getLengths().getType()->getSizeInBytes();
+  } else if (node->getKind() == Kinded::Kind::SparseLengthsSumNodeKind) {
+    auto *SLSN = llvm::dyn_cast<SparseLengthsSumNode>(node);
+    // compute how many entries of the embedding table we look up
+    auto numLookups = SLSN->getIndices().dims().front();
+    // compute how many bytes we read per lookup
+    auto tableSize = SLSN->getData().getType()->getSizeInBytes();
+    auto numRows = SLSN->getData().dims().front();
+    auto sizePerLookup = tableSize / numRows;
+    // compute total bytes read
+    uint64_t sizeInput = numLookups * sizePerLookup;
+
+    // tables are usually large and fit in DRAM
+    sizeDram += sizeInput;
+    // we also read the indices and lengths arrays
+    sizeSram += SLSN->getIndices().getType()->getSizeInBytes();
+    sizeSram += SLSN->getLengths().getType()->getSizeInBytes();
+  } else if (node->getKind() ==
+             Kinded::Kind::
+                 FusedRowwiseQuantizedSparseLengthsWeightedSumNodeKind) {
+    auto *FRQSLWSN =
+        llvm::dyn_cast<FusedRowwiseQuantizedSparseLengthsWeightedSumNode>(node);
+    // compute how many entries of the embedding table we look up
+    auto numLookups = FRQSLWSN->getIndices().dims().front();
+    // compute how many bytes we read per lookup
+    auto tableSize = FRQSLWSN->getData().getType()->getSizeInBytes();
+    auto numRows = FRQSLWSN->getData().dims().front();
+    auto sizePerLookup = tableSize / numRows;
+    // compute total bytes read
+    uint64_t sizeInput = numLookups * sizePerLookup;
+
+    // tables are usually large and fit in DRAM
+    sizeDram += sizeInput;
+
+    // we also read the indices, weights and lengths arrays
+    sizeSram += FRQSLWSN->getIndices().getType()->getSizeInBytes();
+    sizeSram += FRQSLWSN->getWeights().getType()->getSizeInBytes();
+    sizeSram += FRQSLWSN->getLengths().getType()->getSizeInBytes();
+  } else if (node->getKind() ==
+             Kinded::Kind::FusedRowwiseQuantizedSparseLengthsSumNodeKind) {
+    auto *FRQSLSN =
+        llvm::dyn_cast<FusedRowwiseQuantizedSparseLengthsSumNode>(node);
+    // compute how many entries of the embedding table we look up
+    auto numLookups = FRQSLSN->getIndices().dims().front();
+    // compute how many bytes we read per lookup
+    auto tableSize = FRQSLSN->getData().getType()->getSizeInBytes();
+    auto numRows = FRQSLSN->getData().dims().front();
+    auto sizePerLookup = tableSize / numRows;
+    // compute total bytes read
+    uint64_t sizeInput = numLookups * sizePerLookup;
+
+    // tables are usually large and fit in DRAM
+    sizeDram += sizeInput;
+
+    // we also read the indices and lengths arrays
+    sizeSram += FRQSLSN->getIndices().getType()->getSizeInBytes();
+    sizeSram += FRQSLSN->getLengths().getType()->getSizeInBytes();
+  } else {
+    // for all other ops, iterate through all inputs and get size in bytes
+    for (int i = 0; i < n; i++) {
+      auto ty = node->getNthInput(i).getType();
+      uint64_t sizeInput = ty->getSizeInBytes();
+      if (sizeInput > sramCapacity) {
+        sizeDram += sizeInput;
+      } else {
+        sizeSram += sizeInput;
+      }
+    }
+  }
+
+  // Repeat for outputs
+  for (size_t i = 0, e = node->getNumResults(); i < e; i++) {
+    auto myty = node->getType(i);
+    uint64_t sizeOutput = myty->getSizeInBytes();
+    if (sizeOutput > sramCapacity) {
+      sizeDram += sizeOutput;
+    } else {
+      sizeSram += sizeOutput;
+    }
+  }
+
+  // Calculate compute ops. Currently only computed for Matmul, Conv, FC
+  // TODO: think about whether this is better off computed inside a Node.
+  uint64_t totalOps = 0;
+  switch (node->getKind()) {
+  case Kinded::Kind::MatMulNodeKind: {
+    auto *MMN = llvm::dyn_cast<MatMulNode>(node);
+    auto lhsDims = MMN->getLHS().dims();
+    auto rhsDims = MMN->getRHS().dims();
+    totalOps = 2 * lhsDims[0] * lhsDims[1] * rhsDims[1];
+    break;
+  }
+  case Kinded::Kind::FullyConnectedNodeKind: {
+    auto *FCN = llvm::dyn_cast<FullyConnectedNode>(node);
+    auto inputDims = FCN->getInput().dims();
+    auto wtDims = FCN->getWeights().dims();
+    totalOps = 2 * inputDims[0] * inputDims[1] * wtDims[0];
+    break;
+  }
+#ifdef GLOW_WITH_HABANA
+  case Kinded::Kind::HabanaFullyConnectedNodeKind: {
+    auto *FCN = llvm::dyn_cast<HabanaFullyConnectedNode>(node);
+    auto inputDims = FCN->getInput().dims();
+    auto wtDims = FCN->getWeights().dims();
+    totalOps = 2 * inputDims[0] * inputDims[1] * wtDims[0];
+    break;
+  }
+#endif
+  case Kinded::Kind::ConvolutionNodeKind: {
+    auto *CN = llvm::dyn_cast<ConvolutionNode>(node);
+    auto resultDims = CN->getResult().dims();
+    // Get the product of batch, output height, output width, output channels
+    totalOps = resultDims[0];
+    for (size_t i = 1, e = resultDims.size(); i < e; i++) {
+      totalOps *= resultDims[i];
+    }
+    // Multiply in kernel height, kernel width
+    auto kernelDims = CN->getKernels();
+    totalOps *= kernelDims[0] * kernelDims[1];
+    // Multiply in input channels/groups
+    auto inputChannels = CN->getInput().dims()[1];
+    auto nGroups = CN->getGroup();
+    totalOps *= (inputChannels * 1.0 / nGroups);
+    break;
+  }
+#ifdef GLOW_WITH_HABANA
+  case Kinded::Kind::HabanaConvolutionNodeKind: {
+    auto *CN = llvm::dyn_cast<HabanaConvolutionNode>(node);
+    auto resultDims = CN->getResult().dims();
+    // Get the product of batch, output height, output width, output channels
+    totalOps = resultDims[0];
+    for (size_t i = 1, e = resultDims.size(); i < e; i++) {
+      totalOps *= resultDims[i];
+    }
+    // Multiply in kernel height, kernel width
+    auto kernelDims = CN->getKernels();
+    totalOps *= kernelDims[0] * kernelDims[1];
+    // Multiply in input channels/groups
+    auto inputChannels = CN->getInput().dims()[1];
+    auto nGroups = CN->getGroup();
+    totalOps *= (inputChannels * 1.0 / nGroups);
+    break;
+  }
+#endif
+  default:
+    break;
+  }
+
+  // Compute compute roofline as max of flops, DRAM, SRAM BW
+  // See https://bit.ly/2UdJ3mz
+  // Add epsilons to prevent seg faults on uninitialized peak values.
+  return std::max(totalOps * 1.0f / std::max(peakCompute, 1e-6f),
+                  std::max(sizeDram * 1.0f / std::max(peakDramBw, 1e-6f),
+                           sizeSram * 1.0f / std::max(peakSramBw, 1e-6f)));
+}
+
 /// Given nodes set \p currNodes and its memory usage info \p info, \returns the
 /// new memory usage if \p newNode is added into \p currNodes.
 GraphMemInfo updateGraphMemInfoByAddingNode(const NodesSet &currNodes,
diff --git a/tests/unittests/PartitionerTest.cpp b/tests/unittests/PartitionerTest.cpp
index eacd0de727..9430f12d85 100644
--- a/tests/unittests/PartitionerTest.cpp
+++ b/tests/unittests/PartitionerTest.cpp
@@ -17,6 +17,7 @@
 #include "glow/ExecutionEngine/ExecutionEngine2.h"
 #include "glow/Graph/Graph.h"
 #include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"
+#include "glow/Partitioner/PartitionerUtils.h"
 
 #include "gtest/gtest.h"
 
@@ -428,17 +429,6 @@ TEST_F(PartitionerTest, Basic1Roofline) {
     nodeNamesMap[&node] = node.getName();
   }
 
-  std::vector<DeviceInfo> devices = {
-      {3072, "Interpreter", "", "", 100, 10, 0.1, 1, 0.05},
-      {3072, "Interpreter", "", "", 100, 10, 0.1, 1, 0.05},
-      {3072, "Interpreter", "", "", 100, 10, 0.1, 1, 0.05}};
-  Partitioner myPartitioner(&EEP.getModule(), devices);
-
-  auto err = myPartitioner.Partition(cctx);
-  EXPECT_FALSE(errToBool(std::move(err)));
-
-  DAGListTy dagList = std::move(myPartitioner.getPartitionResult());
-
   // check compute costs
   std::unordered_map<std::string, float> expectedComputeTime{
       {"initial_sigmoid", 128},
@@ -460,38 +450,17 @@ TEST_F(PartitionerTest, Basic1Roofline) {
       {"fc_add_bias4", 96},
   };
 
+  BackendInfo backendInfo;
+  backendInfo.sramCapacity = 100;
+  backendInfo.peakCompute = 10;
+  backendInfo.peakDramBw = 0.1;
+  backendInfo.peakSramBw = 1;
+  backendInfo.peakPCIeBw = 0.05;
   for (auto const &p : nodeNamesMap) {
     auto *N = p.first;
-    EXPECT_EQ(myPartitioner.getComputeTime(N), expectedComputeTime[p.second]);
+    EXPECT_EQ(getNodeComputeTime(N, backendInfo),
+              expectedComputeTime[p.second]);
   }
-
-  // check memUsage
-  std::unordered_map<std::string, uint64_t> expectedMemUsage{
-      {"initial_sigmoid", 0},
-      {"left_sigmoid2", 0},
-      {"fc_add_bias3", 64},
-      {"right_sigmoid1", 0},
-      {"mul", 0},
-      {"fc_add_bias2", 32},
-      {"ret", 0},
-      {"fc_dot", 2176},
-      {"left_sigmoid1", 0},
-      {"fc_add_bias", 64},
-      {"fc_dot1", 1024},
-      {"right_sigmoid2", 0},
-      {"fc_add_bias1", 64},
-      {"fc_dot2", 512},
-      {"fc_dot3", 1024},
-      {"fc_dot4", 512},
-      {"fc_add_bias4", 32},
-  };
-  for (auto const &p : nodeNamesMap) {
-    auto *N = p.first;
-    EXPECT_EQ(myPartitioner.getMemUsage(N), expectedMemUsage[p.second]);
-  }
-
-  EXPECT_EQ(EEP.getModule().getFunctions().size(), 3);
-  EXPECT_EQ(dagList.size(), 1);
 }
 
 TEST_F(PartitionerTest, SelectRepFunc) {