diff --git a/include/glow/Partitioner/Partitioner.h b/include/glow/Partitioner/Partitioner.h index 0e83c1bf42..6ccdca00ec 100644 --- a/include/glow/Partitioner/Partitioner.h +++ b/include/glow/Partitioner/Partitioner.h @@ -17,6 +17,7 @@ #define GLOW_PARTITIONER_PARTITIONER_H #include "glow/Graph/Graph.h" +#include "glow/Partitioner/PartitionerUtils.h" #include "glow/Runtime/RuntimeTypes.h" #include "llvm/ADT/DenseMap.h" @@ -29,19 +30,27 @@ namespace glow { using namespace runtime; -using MemUsageMap = std::unordered_map; +using MemUsageMapTy = std::unordered_map; +using NodesSetTy = std::set; +using PartitionCostMapTy = llvm::DenseMap; + +/// Helper structure for building a partition. Records 1) a mapping of nodes in +/// the original function to destination partitions, along with a list of the +/// newly-created functions; 2) a mapping of newly-created functions along with +/// their node sets. +using NodeToFunctionMapTy = llvm::DenseMap; +using FunctionToNodesMapTy = llvm::DenseMap; -/// Helper structure for building a partition. Records a mapping of nodes in the -/// original function to destination partitions, along with a list of the -/// newly-created functions. class NodeToFunctionMap { - using Map = llvm::DenseMap; /// Newly-created partitions. FunctionList functions_; /// Map of nodes in the original function to their target partition. - Map nodeToFunction_; + NodeToFunctionMapTy nodeToFunction_; + + /// Map of sub-functions to their memory consumption. + PartitionCostMapTy partitionCost_; public: /// Create a new partition \p F. @@ -54,10 +63,22 @@ class NodeToFunctionMap { const FunctionList &getPartitions() const { return functions_; } /// Map API. 
- Map::iterator find(Node *N) { return nodeToFunction_.find(N); } - Map::iterator begin() { return nodeToFunction_.begin(); } - Map::iterator end() { return nodeToFunction_.end(); } + NodeToFunctionMapTy::iterator find(Node *N) { + return nodeToFunction_.find(N); + } + NodeToFunctionMapTy::iterator begin() { return nodeToFunction_.begin(); } + NodeToFunctionMapTy::iterator end() { return nodeToFunction_.end(); } + Function *operator[](Node *n) { return nodeToFunction_[n]; } + void deletePartition(Function *func) { functions_.remove(func); } + + /// Set the memory consumption \p cost for a partition \p func. + void setGraphMemInfo(Function *func, GraphMemInfo cost) { + partitionCost_[func] = cost; + } + + /// Get the memory consumption for a partition \p func. + GraphMemInfo getGraphMemInfo(Function *func) { return partitionCost_[func]; } }; /// The struct contains all the created DAGNodes. This DAGNodeList owns all the @@ -92,7 +113,7 @@ class Partitioner { size_t memSize_; /// The map of each operator and the corresponding memory size. - MemUsageMap memUsage_; + MemUsageMapTy memUsage_; /// Get the representative function (the one with the largest input) and /// update the memSize. @@ -102,6 +123,19 @@ class Partitioner { /// function. void initOpMemUsage(); + /// Combine the partitions if necessary : if all outside uses of the nodes in + /// partition1 are in partition2, and the sum of memory consumption of + /// partition1 and partition2 is less than availableMemory, combine partition1 + /// and partition2. + void partitionsCombine(NodeToFunctionMap &partitions, + FunctionToNodesMapTy &nodesSet, + uint64_t availableMemory); + + /// After getting the initial partitions, adjust the partitions to minimize + /// communication and computation cost. + void partitionsAdjust(NodeToFunctionMap &partitions, + uint64_t availableMemory); + /// Assign nodes to partitions and return the mapping. 
NodeToFunctionMap selectPartitions(Function *F, unsigned availableMemory); diff --git a/include/glow/Partitioner/PartitionerUtils.h b/include/glow/Partitioner/PartitionerUtils.h new file mode 100644 index 0000000000..ef6af4a0f3 --- /dev/null +++ b/include/glow/Partitioner/PartitionerUtils.h @@ -0,0 +1,48 @@ +/** + * Copyright (c) 2017-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef GLOW_PARTITIONER_PARTITIONUTILS_H +#define GLOW_PARTITIONER_PARTITIONUTILS_H + +#include "glow/Graph/Graph.h" + +namespace glow { + +/// The memory usage of a subgraph (i.e. a list of nodes of a function). +struct GraphMemInfo { + // The memory usage of all input nodes (whose predecessors are not included in + // this subgraph) of this subgraph. + uint64_t inMemSize; + // The memory usage of all output nodes (whose successors are not included in + // this subgraph) of this subgraph. + uint64_t outMemSize; + // The memory usage of all constants used in this subgraph. + uint64_t constMemSize; + + GraphMemInfo() : inMemSize(0), outMemSize(0), constMemSize(0){}; +}; + +/// Given \p nodes, return the nodes outside this set that use any node in it. +std::vector getOutUsers(const std::set &nodes); + +/// Given \p nodes, return the nodes outside this set that use only nodes in +/// this set or constant. +std::vector +getOutUsersWithOnePredecessor(const std::set &nodes); + +/// Return the memory usage of a given nodes set. 
+GraphMemInfo getGraphMemInfo(const std::set &nodes); +} // namespace glow +#endif // GLOW_PARTITIONER_PARTITIONUTILS_H diff --git a/lib/Partitioner/CMakeLists.txt b/lib/Partitioner/CMakeLists.txt index ffd6ea532b..2f410ff73c 100644 --- a/lib/Partitioner/CMakeLists.txt +++ b/lib/Partitioner/CMakeLists.txt @@ -1,5 +1,6 @@ add_library(Partitioner - Partitioner.cpp) + PartitionerUtils.cpp + Partitioner.cpp) target_link_libraries(Partitioner PRIVATE diff --git a/lib/Partitioner/Partitioner.cpp b/lib/Partitioner/Partitioner.cpp index 1e32e5c9cb..3505c51cb3 100644 --- a/lib/Partitioner/Partitioner.cpp +++ b/lib/Partitioner/Partitioner.cpp @@ -139,6 +139,148 @@ static BFSLevel getBFSLevel(Function *F) { return bfs; } +// Combine the partitions if necessary : if all outside uses of the nodes in +// partition1 are in partition2, and the sum of memory consumption of partition1 +// and partition2 is less than availableMemory, combine partition1 and +// partition2. +void Partitioner::partitionsCombine(NodeToFunctionMap &partitions, + FunctionToNodesMapTy &nodesSet, + uint64_t availableMemory) { + + for (FunctionToNodesMapTy::iterator it = nodesSet.begin(); + it != nodesSet.end(); ++it) { + std::vector outUsers = getOutUsers((*it).second); + if (outUsers.empty()) { + continue; + } + + bool flag = true; + for (int i = 1, e = outUsers.size(); i < e; i++) { + if (partitions[outUsers[i]] != partitions[outUsers[i - 1]]) { + flag = false; + break; + } + } + if (flag) { + // This partition only has one successor. + Function *cur = (*it).first; + Function *suc = partitions[outUsers[0]]; + NodesSetTy tmp = nodesSet.lookup(suc); + GraphMemInfo cost1 = partitions.getGraphMemInfo(cur); + GraphMemInfo cost2 = partitions.getGraphMemInfo(suc); + if (cost1.constMemSize + cost1.inMemSize + cost2.constMemSize + + cost2.inMemSize - cost1.outMemSize < + availableMemory) { + // We can combine the two partitions to fit one device. NOTE(review): cur's GraphMemInfo is not refreshed after this merge — confirm. 
+ for (NodesSetTy::iterator it2 = tmp.begin(); it2 != tmp.end(); ++it2) { + partitions.add(*it2, cur); + } + (*it).second.insert(tmp.begin(), tmp.end()); + partitions.deletePartition(suc); + nodesSet.erase(suc); + module_->eraseFunction(suc); + } + } + } +} + +void Partitioner::partitionsAdjust(NodeToFunctionMap &partitions, + uint64_t availableMemory) { + // For each partition, create a node set. + FunctionToNodesMapTy nodesSet; + for (NodeToFunctionMapTy::iterator it = partitions.begin(); + it != partitions.end(); ++it) { + nodesSet[(*it).second].insert((*it).first); + } + + // Initialize memory cost for each partition. Now we use the output size to + // represent the communication cost. + for (FunctionToNodesMapTy::iterator it = nodesSet.begin(); + it != nodesSet.end(); ++it) { + GraphMemInfo cost = getGraphMemInfo((*it).second); + partitions.setGraphMemInfo((*it).first, cost); + } + + // Move/Exchange nodes between any two connected partitions, until no gain is + // obtained. + // Step1 Move: Assume Partition1 -> Partition2, try to move nodes from + // Partition2 to Partition1 if those nodes only use the nodes in + // Partition1(recursively) and the move won't make Partition1's memory exceed + // the memory constraint, and the communication cost is minimized. + bool gain = true; + while (gain) { + // gain is initialized as false, it will be set to be true if there is at + // least one node can be moved from one set to another set. + gain = false; + for (FunctionToNodesMapTy::iterator it = nodesSet.begin(); + it != nodesSet.end(); ++it) { + NodesSetTy nSet = (*it).second; + std::vector outUsers = getOutUsersWithOnePredecessor(nSet); + if (outUsers.empty()) { + continue; + } + Function *cur = (*it).first; + uint64_t memSize = partitions.getGraphMemInfo(cur).constMemSize + + partitions.getGraphMemInfo(cur).inMemSize; + uint64_t communicationCost = partitions.getGraphMemInfo(cur).outMemSize; + // Check if a node can be moved to current node set (i.e nSet). 
+ for (int i = 0, e = outUsers.size(); i < e; i++) { + // Rule 1: this move won't break memory constraint. + if (memUsage_[outUsers[i]] + memSize > availableMemory) { + continue; + } + // Rule 2: this move won't cause constant duplication. + bool cont = false; + for (int j = 0, e1 = outUsers[i]->getNumInputs(); j < e1; j++) { + auto in = outUsers[i]->getNthInput(j); + if (isa(in.getNode()) && !in.hasOneUse()) { + cont = true; + break; + } + } + if (cont) { + continue; + } + // Rule 3: this move won't increase communication cost. Even if this + // move won't change communication cost, according to rule 1 and rule 2, + // the memory consumption of the partition where this node (i.e + // outUsers[i]) belongs can be reduced. Therefore, it may trigger later + // node movement or partitionsCombine. NOTE(review): nSet keeps the trial insert below even when the move is rejected. + nSet.insert(outUsers[i]); + GraphMemInfo cost = getGraphMemInfo(nSet); + if (cost.outMemSize <= communicationCost) { + // Move this node to current node set. NOTE(review): outUsers is not deduplicated; on a repeated entry suc may equal cur — confirm. + nSet.insert(outUsers[i]); + nodesSet[cur].insert(outUsers[i]); + Function *suc = partitions[outUsers[i]]; + nodesSet[suc].erase(outUsers[i]); + // Update the partitions. + partitions.add(outUsers[i], cur); + partitions.setGraphMemInfo(cur, cost); + if (nodesSet[suc].empty()) { + // It is possible that after moving a node from Partition2 to + // Partition1, Partition2 becomes empty. Remove the empty partition. NOTE(review): the else branch overwrites cost with suc's info, which communicationCost then reads. + partitions.deletePartition(suc); + module_->eraseFunction(suc); + } else { + cost = getGraphMemInfo(nodesSet[suc]); + partitions.setGraphMemInfo(suc, cost); + } + gain = true; + communicationCost = cost.outMemSize; + memSize += memUsage_[outUsers[i]]; + } + } + } + } + + // TODO... :Step 2: exchange two nodes from two partitions to minimize + // communication cost. + + // Combine the current partitions if necessary. + partitionsCombine(partitions, nodesSet, availableMemory); +} + /// Assign nodes to partitions and return the mapping. 
NodeToFunctionMap Partitioner::selectPartitions(Function *F, unsigned availableMemory) { @@ -149,7 +291,7 @@ NodeToFunctionMap Partitioner::selectPartitions(Function *F, // (cut[1], cut[0] - 1], ..., (-1, cut[n] - 1]. std::vector cut; - // Step 1 : get the initial cut based on BFS levels and avaiableMemory. + // Step 1 : get the initial cut based on BFS levels and availableMemory. // TODO .. need to remove the duplicated memory usage. unsigned mem = 0; for (int i = level - 1; i >= 0; i--) { @@ -199,9 +341,9 @@ NodeToFunctionMap Partitioner::selectPartitions(Function *F, } } } - // Step 3 : adjust the partition based on performance (Advanced Graph - // Paritioning algrithm will be applied here). - // --- TODO + + // Step 3 : adjust the partition based on performance. + partitionsAdjust(mapping, availableMemory); return mapping; } diff --git a/lib/Partitioner/PartitionerUtils.cpp b/lib/Partitioner/PartitionerUtils.cpp new file mode 100644 index 0000000000..e1122d8de6 --- /dev/null +++ b/lib/Partitioner/PartitionerUtils.cpp @@ -0,0 +1,130 @@ + +/** + * Copyright (c) 2017-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "glow/Partitioner/PartitionerUtils.h" + +#include + +namespace glow { + +/// Given \p nodes, return a list of nodes who are not in this set but use any +/// node in this set. 
+std::vector getOutUsers(const std::set &nodes) { + std::vector ret; + for (std::set::iterator it = nodes.begin(); it != nodes.end(); ++it) { + Node *cur = *it; + for (auto &U : cur->getUsers()) { + if (nodes.count(U.getUser())) { + continue; + } + ret.push_back(U.getUser()); + } + } + return ret; +} + +/// Given \p nodes, return a list of nodes who are not in this set but use only +/// the nodes in this set or constant. The list may contain duplicates. +std::vector +getOutUsersWithOnePredecessor(const std::set &nodes) { + std::vector ret; + for (std::set::iterator it = nodes.begin(); it != nodes.end(); ++it) { + Node *cur = *it; + for (auto &U : cur->getUsers()) { + Node *user = U.getUser(); + if (nodes.count(user)) { + continue; + } + bool flag = true; + for (int i = 0, e = user->getNumInputs(); i < e; i++) { + Node *in = user->getNthInput(i).getNode(); + if (llvm::isa(in) || nodes.count(in)) { + continue; + } + flag = false; + break; + } + if (flag) { + ret.push_back(user); + } + } + } + return ret; +} + +GraphMemInfo getGraphMemInfo(const std::set &nodes) { + GraphMemInfo ret; + std::set nSet; + for (std::set::iterator it = nodes.begin(); it != nodes.end(); ++it) { + Node *cur = *it; + // For a Save node, the only required memory is for output. NOTE(review): the dyn_cast result is dereferenced unchecked. + if (auto *SN = llvm::dyn_cast(cur)) { + Storage *out = llvm::dyn_cast(SN->getOutput().getNode()); + ret.outMemSize += out->getType()->getSizeInBytes(); + continue; + } + // Check the inputs of each node in this subgraph and decide if it + // contributes to the memory usage: + for (int i = 0, e = cur->getNumInputs(); i < e; i++) { + Node *node = cur->getNthInput(i).getNode(); + if (nodes.count(node) || nSet.count(node)) { + // This input belongs to this subgraph or it has been considered + // already, nothing to do. + continue; + } + nSet.insert(node); + Storage *in = llvm::dyn_cast(node); + if (in) { + uint64_t size = in->getType()->getSizeInBytes(); + if (node->getKind() == Kinded::Kind::ConstantKind) { + // Constant. 
+ ret.constMemSize += size; + } else { + // PlaceHolder for Input. + ret.inMemSize += size; + } + } else { + // In this case, this input is neither a storage type node nor belongs + // to this subgraph. Therefore, when creating partitions, we need to add + // a PlaceHolder for the data from outside. + for (auto &U : node->getUsers()) { + if (U.getUser() == cur) { + ret.inMemSize += node->getType(0)->getSizeInBytes(); + break; + } + } + } + } + // Check the outputs of each node in this subgraph and decide if it + // contributes to the memory usage. Although at this stage, the output may + // not be a storage node, after real partitioning, a Save node will be added + // to hold the output: + for (int i = 0, e = cur->getNumResults(); i < e; i++) { + for (auto &U : cur->getNthResult(i).getNode()->getUsers()) { + Node *node = U.getUser(); + if (nodes.count(node) || nSet.count(node)) { + // The output belongs to this subgraph, nothing to do. + continue; + } + nSet.insert(node); + ret.outMemSize += cur->getType(i)->getSizeInBytes(); + } + } + } + return ret; +} +} // namespace glow diff --git a/tests/unittests/PartitionerTest.cpp b/tests/unittests/PartitionerTest.cpp index 0ab51752c1..1dc8b983c8 100644 --- a/tests/unittests/PartitionerTest.cpp +++ b/tests/unittests/PartitionerTest.cpp @@ -196,7 +196,7 @@ TEST_F(PartitionerTest, Basic2) { Partitioner myPartitioner(&mod_, devices); DAGNodeList myList = std::move(myPartitioner.Partition()); - ASSERT_EQ(mod_.getFunctions().size(), 3); + ASSERT_EQ(mod_.getFunctions().size(), 2); ASSERT_EQ(myList.roots.size(), 1); // Run the paritioned graph and compare the results.