diff --git a/include/glow/Partitioner/Partitioner.h b/include/glow/Partitioner/Partitioner.h index 83b80f5cd5..48ac4340b8 100644 --- a/include/glow/Partitioner/Partitioner.h +++ b/include/glow/Partitioner/Partitioner.h @@ -16,7 +16,7 @@ #ifndef GLOW_PARTITIONER_PARTITIONER_H #define GLOW_PARTITIONER_PARTITIONER_H -#include "glow/Partitioner/PartitionerTypes.h" +#include "glow/Partitioner/PartitionerBase.h" #include "glow/Support/Error.h" namespace glow { @@ -25,7 +25,7 @@ using namespace runtime; /// Given a module, partitions each of the its functions into multiple ones /// based on memory constraints and minimizes the communication cost. -class Partitioner { +class Partitioner final : public PartitionerBase { /// The module that needs to be decomposed. Module *module_; @@ -36,7 +36,10 @@ class Partitioner { /// The cost model related to device. std::vector deviceInfo_; - /// The backend pointers. + /// The backends created in Partitioner. Used for function optimization. + std::vector> backendHolder; + + /// The raw backend pointers. std::vector backends_; /// The map between backend name and BackendInfo. @@ -50,9 +53,6 @@ class Partitioner { /// needed after partitions. DeviceIDTy logicalDeviceID_; - /// The result of module partitioning. - DAGListTy partitions_; - /// Total memory (bytes) requested by one module. uint64_t memSize_; @@ -82,11 +82,19 @@ class Partitioner { NodeToFunctionMap selectPartitions(Function *F, uint64_t availableMemory, llvm::StringRef backendName); - /// Duplicates all networks in the module order to saturate the Host. - void saturateHost(unsigned logicalDeviceCount); - - FunctionToBackendNameMap - backendBasedPartition(Function *F, std::vector &backends, + /// Duplicates \p partitions in the module order to saturate the Host. \p + /// logicalDeviceCount is the number of logical devices used by the current + /// partitions. For example: If a network is partitioned into two parts (\p + /// logicalDeviceCount) and there are six devices this would duplicate the + /// network three times. + void saturateHost(unsigned logicalDeviceCount, const DAGListTy &partitions); + + /// Partition a function \p F based on backends \p backends. \returns the + /// final partition result (or an error) and a map between partitions and + /// backend names. \p cctx is used for function optimization. + llvm::Expected + backendBasedPartition(FunctionToBackendNameMap &funcToBackend, Function *F, + std::vector &backends, CompilationContext &cctx); /// Performs a load balancing optimization pass to optimize for load @@ -96,26 +104,20 @@ llvm::StringRef backendName, NodeToFunctionMap &mapping); - /// Given the node-function mapping, do the actual partitioning. If \p saveDAG - /// is true, the DAG will be saved into partitions_, which is the final - /// partition result. - void doPartitioning(llvm::StringRef funcName, std::vector, - NodeToFunctionMap &mapping, bool saveDAG); - /// If there is no need to do any partition, just generate the DAGNode based /// on current functions in this module for backend \p backendName found in \p - /// backendMap. \p cctx is used during optimization of the Function. \returns - /// whether there was an error encountered. - llvm::Error + /// backendMap. \p cctx is used for function optimization. \returns the + /// partition result or an error. + llvm::Expected createDAGWithoutPartition(llvm::StringRef backendName, std::map &backendMap, CompilationContext &cctx); - /// Get the map between the backend name and the concrete backend info (e.g.
- /// backend pointer, mem, number) used in this partiton. If there are backends - /// need to be created, we use \p backendsHolder to hold them for memory - /// purpose. - void getBackendMap(std::map &backendMap, + /// Create the map between the backend name and the concrete backend info + /// (e.g. backend pointer, mem, number) used in this partition. If backends + /// need to be created, we use \p backendsHolder to hold them for memory + /// management purposes. + void genBackendMap(std::map &backendMap, std::vector> &backendsHolder, std::vector &backends); @@ -141,30 +143,30 @@ class Partitioner { const std::vector &backends, bool saturateHost = false, bool optimized = false); - /// Based on partitionConfig_ passed into Partitioner, do the user-defined + /// Based on \p partitionConfig passed into Partitioner, do the user-defined /// partition. - llvm::Error PartitionFromConfig(); - - /// Decompose each function in a module. Now we support partitioning a module - /// among different type of devices. \p cctx is used during optimization of - /// the Function. \returns whether there was an error encountered. - llvm::Error Partition(CompilationContext &cctx); + llvm::Expected + partitionFromConfig(const PartitionConfig &partitionConfig); /// This partition approach is used in Glow Quantization Profiling flow. The /// backendBasedPartition is applied first in case there are heterogeneous /// backends. Then each sub-function will be compiled and run in CPU backend - /// for profiling. - llvm::Error QuantizationProfilingPartition(CompilationContext &cctx, - Function *F, - std::vector backends); - - /// Get the final partitions. - DAGListTy &getPartitionResult() { return partitions_; } - - /// Dump the partition result to a dot file. Since now all functions belong to - /// a function family and they have the same partition, we only dump the one - /// function's partition. - void dumpDAG(llvm::StringRef dotFilename) const; + /// for profiling. \p cctx is used for function optimization. \returns the + /// partition result or an error. + llvm::Expected + quantizationProfilingPartition(CompilationContext &cctx); + + /// This partition approach first does the partition based on backend types, + /// and then based on cost models (memory usage and performance). \p cctx is + /// used for function optimization. \returns the partition result or an error. + llvm::Expected heterogeneousPartition(CompilationContext &cctx); + + /// Decompose each function in a module. Given the parameters, this function + /// will choose different partition approaches supported in this class: + /// heterogeneous partition, user-defined partition or quantization profiling. + /// \p cctx is used for function optimization. \returns the partition result + /// or an error. + llvm::Expected partition(CompilationContext &cctx) override; }; } // namespace glow #endif // GLOW_PARTITIONER_PARTITIONER_H diff --git a/include/glow/Partitioner/PartitionerBase.h b/include/glow/Partitioner/PartitionerBase.h new file mode 100644 index 0000000000..3766550a86 --- /dev/null +++ b/include/glow/Partitioner/PartitionerBase.h @@ -0,0 +1,49 @@ +/** + * Copyright (c) 2017-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef GLOW_PARTITIONER_PARTITIONERBASE_H +#define GLOW_PARTITIONER_PARTITIONERBASE_H + +#include "glow/Partitioner/PartitionerTypes.h" +#include "glow/Support/Error.h" + +namespace glow { + +using namespace runtime; +/// Given a module, partitions each of its functions into multiple ones +/// based on memory constraints and minimizes the communication cost. +class PartitionerBase { +public: + virtual ~PartitionerBase() = default; + + /// Decompose each function in a module. \p cctx is used in function + /// optimization. \returns the partition result. + virtual llvm::Expected partition(CompilationContext &cctx) = 0; + + /// Dump the partition result \p partitions to a dot file with name \p + /// dotFilename. Since all functions belong to one function family and share + /// the same partition, we only dump one function's partition. + void dumpDAG(llvm::StringRef dotFilename, const DAGListTy &partitions) const; + +protected: + /// Given the node-function mapping \p mapping, do the actual partitioning. If + /// \p saveDAG is true, the DAG will be generated. \returns the final + /// partitions or an empty partition list (if \p saveDAG is false). + DAGListTy doPartitioning(llvm::StringRef funcName, std::vector, + Module *module, NodeToFunctionMap &mapping, + bool saveDAG); +}; +} // namespace glow +#endif // GLOW_PARTITIONER_PARTITIONERBASE_H diff --git a/lib/Partitioner/CMakeLists.txt b/lib/Partitioner/CMakeLists.txt index d3e304a3a0..e37ec05361 100644 --- a/lib/Partitioner/CMakeLists.txt +++ b/lib/Partitioner/CMakeLists.txt @@ -1,4 +1,5 @@ add_library(Partitioner + PartitionerBase.cpp PartitionerUtils.cpp PartitionerOptimizer.cpp PartitionerValidation.cpp diff --git a/lib/Partitioner/Partitioner.cpp b/lib/Partitioner/Partitioner.cpp index ac3c104ef0..55069e02bb 100644 --- a/lib/Partitioner/Partitioner.cpp +++ b/lib/Partitioner/Partitioner.cpp @@ -21,10 +21,6 @@ #include "glow/Partitioner/PartitionerValidation.h" #include "glow/Support/Support.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" - #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" @@ -64,61 +60,6 @@ bool sortMinMemory(const std::pair &a, return a.second < b.second; } -void Partitioner::dumpDAG(llvm::StringRef dotFilename) const { - if (partitions_.size() == 0) - return; - auto *root = partitions_[0].root.get(); - LOG(INFO) << "Writing dotty graph for DAG after graph partitioning: " - << dotFilename.str(); - std::ofstream myfile; - myfile.open(dotFilename); - myfile << "digraph DAG {\n\trankdir=TB;\n"; - // Dump DAGNodes - std::vector nodes; - llvm::SmallSet used; - nodes.push_back(root); - int cur = 0; - int num = 1; - while (cur < num) { - auto *node = nodes[cur]; - for (size_t i = 0; i < node->children.size(); i++) { - auto child = node->children[i]; - DescriptionBuilder db(child->name.c_str()); - const std::string &backendName = child->backendName; - db.addParam("BackendName", backendName); - myfile << "\"" << escapeDottyString(child->name) << "\"" - << " [ label = \"" << escapeDottyString(db) << "\""; -
myfile << "\tshape = \"record\"\n"; - myfile << "\tstyle=\"filled,rounded\"\n"; - auto colorIdx = llvm::hash_value(backendName); - myfile << "\tfillcolor=" << getDotFileNodeColor(colorIdx) << "\n"; - myfile << "penwidth = 2];\n"; - if (used.count(child) == 0) { - nodes.push_back(child); - used.insert(child); - num++; - } - } - cur++; - } - - // Dump edges. - for (size_t i = 0; i < nodes.size(); i++) { - auto *root = nodes[i]; - for (size_t j = 0; j < root->children.size(); j++) { - auto child = root->children[j]; - myfile << "\"" << escapeDottyString(root->name) << "\"" - << " -> " - << "\"" << escapeDottyString(child->name) << "\"" - << ";"; - } - } - myfile << "}"; - - myfile.close(); - return; -} - Partitioner::Partitioner(Module *parent, const std::vector &devices, const std::vector &backends, bool saturateHost, bool optimized) @@ -234,16 +175,14 @@ NodeToFunctionMap Partitioner::selectPartitions(Function *F, return mapping; } -/// Duplicate the network to saturate the number of devices. For example: If a -/// network is partitioned into two parts (\p logicalDeviceCount) and there are -/// six devices this would duplicate the network three times. -void Partitioner::saturateHost(unsigned logicalDeviceCount) { +void Partitioner::saturateHost(unsigned logicalDeviceCount, + const DAGListTy &partitions) { unsigned duplications = deviceInfo_.size() / logicalDeviceCount; if (duplications < 2) { return; } // Add additional logical devices to each node. - for (auto &network : partitions_) { + for (auto &network : partitions) { for (auto &node : network.nodes) { // Build list of new logical devices to add to node. std::vector newDevices; @@ -263,143 +202,9 @@ void Partitioner::saturateHost(unsigned logicalDeviceCount) { } } -/// Current only partition the representative function. -void Partitioner::doPartitioning(llvm::StringRef funcName, - std::vector funcs, - NodeToFunctionMap &mapping, bool saveDAG) { - // Add a dummy node to make sure that a DAG has a single entrance. - DAGNodePtr DAGRoot = llvm::make_unique(); - DAGNodePtrVec nodes; - DAGRoot->logicalDevices = {0}; - DAGRoot->name = funcName; - DAGRoot->module = module_; - DAGRoot->deviceIDs = {0}; - DAGNode *root = DAGRoot.get(); - - llvm::DenseMap currToNew; - - // Clone nodes into target partition. - for (size_t i = 0, e = funcs.size(); i < e; i++) { - for (auto &N : funcs[i]->getNodes()) { - auto *clone = N.clone(); - currToNew[&N] = clone; - mapping[&N]->addNode(clone); - } - } - - // For any dependency that crosses a partition, add a placeholder and save - // node. Record the dependence in the function graph. - std::unordered_map placeholders; - llvm::DenseMap funcDAG; - for (auto *subF : mapping.getPartitions()) { - if (funcDAG.find(subF) == funcDAG.end()) { - std::unique_ptr subDAG = llvm::make_unique(); - subDAG->name = subF->getName(); - subDAG->logicalDevices = mapping.getLogicalDeviceIDList(subF); - subDAG->backendName = mapping.getPartitionBackendName(subF); - funcDAG[subF] = subDAG.get(); - nodes.push_back(std::move(subDAG)); - } - - // Link subF to its parents. - std::set parents; - for (auto &N : subF->getNodes()) { - for (int inp = 0, e = N.getNumInputs(); inp < e; inp++) { - auto input = N.getNthInput(inp); - // No need to check Constant since it won't be the result of another - // function. - if (isa(input.getNode())) { - continue; - } - - Function *inputF = nullptr; - // It is possible that one input is the output of anther function. 
- if (Placeholder *ph = llvm::dyn_cast(input.getNode())) { - for (auto &user : ph->getUsers()) { - if (auto *save = llvm::dyn_cast(user.getUser())) { - placeholders[input] = save->getPlaceholder(); - inputF = mapping[user.getUser()]; - break; - } - } - if (!inputF) { - continue; - } - } - - if (!inputF) { - inputF = mapping[input.getNode()]; - } - if (subF == inputF) - continue; - - // Check if a DAGNode for subF's parent is created or not. If not, - // create one. - if (funcDAG.find(inputF) == funcDAG.end()) { - std::unique_ptr subDAG = llvm::make_unique(); - subDAG->name = inputF->getName(); - subDAG->logicalDevices = mapping.getLogicalDeviceIDList(inputF); - subDAG->backendName = mapping.getPartitionBackendName(inputF); - funcDAG[inputF] = subDAG.get(); - nodes.push_back(std::move(subDAG)); - } - - // subF is a child of inputF, inputF is a parent of subF. - if (parents.find(inputF) == parents.end()) { - funcDAG[inputF]->children.push_back(funcDAG[subF]); - funcDAG[subF]->parents.push_back(funcDAG[inputF]); - parents.insert(inputF); - } - // If we've already created a placeholder for this dependence, use it. - auto it = placeholders.find(input); - if (it != placeholders.end()) { - N.setNthInput(inp, it->second); - continue; - } - - // Create a new placeholder to represent this dependence. - auto *save = inputF->createSave("tmp", input); - auto *tmp = save->getPlaceholder(); - placeholders[input] = tmp; - N.setNthInput(inp, tmp); - } - } - } - - if (saveDAG) { - DAG dag; - dag.root = std::move(DAGRoot); - dag.nodes = std::move(nodes); - partitions_.push_back(std::move(dag)); - } - - // Update links between nodes in the cloned functions. Add placeholders (and - // save nodes) where a link crosses a partition boundary. - for (auto *subF : mapping.getPartitions()) { - for (auto &N : subF->getNodes()) { - for (int inp = 0, e = N.getNumInputs(); inp < e; inp++) { - auto input = N.getNthInput(inp); - if (isa(input.getNode())) - continue; - // Link this node to the clone of its input. - auto *clone = currToNew[input.getNode()]; - N.setNthInput(inp, NodeValue(clone, input.getResNo())); - } - } - } - - // For all DAGNode without parents, link them to the root DAG. - for (auto *subF : mapping.getPartitions()) { - if (funcDAG[subF]->parents.size() == 0) { - funcDAG[subF]->parents.push_back(root); - root->children.push_back(funcDAG[subF]); - } - } -} - -FunctionToBackendNameMap Partitioner::backendBasedPartition( - Function *F, std::vector &backends, CompilationContext &cctx) { - FunctionToBackendNameMap ret; +llvm::Expected Partitioner::backendBasedPartition( + FunctionToBackendNameMap &funcToBackend, Function *F, + std::vector &backends, CompilationContext &cctx) { NodeToFunctionMap mapping; llvm::DenseMap nodeToBackendName; @@ -437,8 +242,8 @@ FunctionToBackendNameMap Partitioner::backendBasedPartition( break; } } - assert(nodeToBackendName.find(&N) != nodeToBackendName.end() && - "Node is not supported by any of the provided backends"); + RETURN_ERR_IF_NOT(nodeToBackendName.find(&N) != nodeToBackendName.end(), + "Node is not supported by any of the provided backends"); } BFSLevel bfs = getBFSLevel(F); @@ -452,10 +257,10 @@ FunctionToBackendNameMap Partitioner::backendBasedPartition( // When profiling, all the partition backend is assigned to // profilingBackend. 
mapping.createPartition(newF, profilingBackend); - ret[newF] = profilingBackend; + funcToBackend[newF] = profilingBackend; } else { mapping.createPartition(newF, backendName); - ret[newF] = backendName; + funcToBackend[newF] = backendName; } for (int i = level - 1; i >= 0; i--) { for (size_t j = 0, e = bfs[i].size(); j < e; j++) { @@ -469,10 +274,10 @@ FunctionToBackendNameMap Partitioner::backendBasedPartition( // When profiling, all the partition backend is assigned to be // profilingBackend. mapping.createPartition(newF, profilingBackend); - ret[newF] = profilingBackend; + funcToBackend[newF] = profilingBackend; } else { mapping.createPartition(newF, backendName); - ret[newF] = backendName; + funcToBackend[newF] = backendName; } } mapping.add(N, newF); @@ -493,28 +298,26 @@ FunctionToBackendNameMap Partitioner::backendBasedPartition( mapping.appendLogicalDeviceID(func, logicalDeviceID++); } } - doPartitioning(F->getName(), funcs, mapping, genDAG); - - return ret; + return doPartitioning(F->getName(), funcs, module_, mapping, genDAG); } -void Partitioner::getBackendMap( +void Partitioner::genBackendMap( std::map &backendMap, std::vector> &backendsHolder, std::vector &backends) { // If the backends are created already, we use them directly. bool hasBackends = backends_.size() != 0; if (hasBackends) { - assert(backends_.size() == deviceInfo_.size() && - "number of backends and devices is not match."); + DCHECK(backends_.size() == deviceInfo_.size()) + << "number of backends and devices is not match."; } int n = 0; for (size_t i = 0, e = deviceInfo_.size(); i < e; i++) { std::string backendName = deviceInfo_[i].backendName; if (hasBackends) { - assert(backends_[i]->getBackendName() == backendName && - "Backend Type mismatch."); + DCHECK(backends_[i]->getBackendName() == backendName) + << "Backend Type mismatch."; } if (backendMap.find(backendName) == backendMap.end()) { BackendInfo backendInfo; @@ -545,9 +348,10 @@ void Partitioner::getBackendMap( } } -llvm::Error Partitioner::createDAGWithoutPartition( +llvm::Expected Partitioner::createDAGWithoutPartition( llvm::StringRef backendName, std::map &backendMap, CompilationContext &cctx) { + DAGListTy partitions; for (auto F : module_->getFunctions()) { if (!optimized_) { auto backend = backendMap[backendName].backend; @@ -565,13 +369,13 @@ llvm::Error Partitioner::createDAGWithoutPartition( DAG0->children.push_back(DAG1.get()); DAGNodePtrVec nodes; nodes.push_back(std::move(DAG1)); - partitions_.push_back({std::move(DAG0), std::move(nodes)}); + partitions.push_back({std::move(DAG0), std::move(nodes)}); } if (saturateHost_) { // Saturate the Host. 
- saturateHost(1); + saturateHost(1, partitions); } - return llvm::Error::success(); + return std::move(partitions); } llvm::Error Partitioner::loadBalancedPartitioning(Function *F, @@ -659,7 +463,7 @@ llvm::Error Partitioner::loadBalancedPartitioning(Function *F, auto curOpMemory = getNodeMemUsage(N); // Find a partition to put this node into - int curPartition = maxLogicalDeviceId; + DeviceIDTy curPartition = maxLogicalDeviceId; const float allowedLoadImbalanceFraction = 0.5f; for (; curPartition < numDevices; curPartition++) { // Put the op in current partition if @@ -695,7 +499,7 @@ llvm::Error Partitioner::loadBalancedPartitioning(Function *F, "Load balance partition error"); } } - for (int i = 0; i < numDevices; i++) { + for (size_t i = 0; i < numDevices; i++) { VLOG(1) << "Partition #" << i << " has estimated runtime " << deviceTime[i]; } @@ -703,17 +507,29 @@ llvm::Error Partitioner::loadBalancedPartitioning(Function *F, return llvm::Error::success(); } -llvm::Error Partitioner::QuantizationProfilingPartition( - CompilationContext &cctx, Function *F, std::vector backends) { +llvm::Expected +Partitioner::quantizationProfilingPartition(CompilationContext &cctx) { + // For quantization profiling flow, currently we assume there is only 1 + // function in a module. + DCHECK(module_->getFunctions().size() == 1) + << "Invalid number of functions in a module. For quantization profiling " + "flow, the module can only contain 1 function"; + // Quantization profiling flow is run under CPU backend, so we don't really // need the concrete partition. The backendBasedPartition is necessary since // we need the mapping between quantized tensor and original tensor. + DAGListTy partitions; + std::vector backends; + genBackendMap(backendMap_, backendHolder, backends); + F_ = selectRepFunc(module_, memSize_); + FunctionToBackendNameMap funcToBackend; - funcToBackend = backendBasedPartition(F_, backends, cctx); + ASSIGN_VALUE_OR_RETURN_ERR( + partitions, backendBasedPartition(funcToBackend, F_, backends, cctx)); module_->eraseFunction(F_); std::unique_ptr backend(createBackend(profilingBackend)); for (Function *subF : module_->getFunctions()) { - assert(subF->verify() && "Conversion led to invalid function"); + DCHECK(subF->verify()) << "Conversion led to invalid function"; if (!optimized_) { RETURN_IF_ERR(::glow::optimizeFunction(subF, *backend, cctx)); } @@ -723,30 +539,20 @@ llvm::Error Partitioner::QuantizationProfilingPartition( << "Profiling a model to be partitioned cross different backends. Each " "sub-network will be optimized and run on cpu backend.\n"; } - return llvm::Error::success(); + return std::move(partitions); } -llvm::Error Partitioner::Partition(CompilationContext &cctx) { +llvm::Expected +Partitioner::heterogeneousPartition(CompilationContext &cctx) { + DAGListTy partitions; // Prepare the mapping between BackendName and BackendInfo. std::vector backends; - std::vector> backendHolder; - getBackendMap(backendMap_, backendHolder, backends); - - if (partitionConfig_.enabled()) { - // Jump into user-defined partition, and skip the following auto partition. - return PartitionFromConfig(); - } + genBackendMap(backendMap_, backendHolder, backends); // Step 0: Find the representative function for running partitioning // algorithm. F_ = selectRepFunc(module_, memSize_); - if (cctx.precisionConfig.quantMode == QuantizationMode::Profile) { - // Jump into profiling flow, and leave without generating partitions for the - // backends with same type.. 
- return QuantizationProfilingPartition(cctx, F_, backends); - } - // Step 1 : do the partition based on backends type. FunctionToBackendNameMap funcToBackend; std::string origName(F_->getName().data()); @@ -767,8 +573,15 @@ llvm::Error Partitioner::Partition(CompilationContext &cctx) { } return createDAGWithoutPartition(backendName, backendMap_, cctx); } + DCHECK(module_->getFunctions().size() == 1) + << "Invalid number of functions in a module. Now in heterogeneous " + "partition flow, the module can only contain 1 function"; } else { - funcToBackend = backendBasedPartition(F_, backends, cctx); + DCHECK(module_->getFunctions().size() == 1) + << "Invalid number of functions in a module. Now in heterogeneous " + "partition flow, the module can only contain 1 function"; + ASSIGN_VALUE_OR_RETURN_ERR( + partitions, backendBasedPartition(funcToBackend, F_, backends, cctx)); module_->eraseFunction(F_); } @@ -781,7 +594,7 @@ llvm::Error Partitioner::Partition(CompilationContext &cctx) { auto *backend = backendMap_[i->second].backend; auto availMem = backendMap_[i->second].memSize; funcs.push_back(func); - assert(func->verify() && "Conversion led to invalid function"); + DCHECK(func->verify()) << "Conversion led to invalid function"; // Step 2.1 : optimize a function if it has not been optimized yet. if (!optimized_) { RETURN_IF_ERR(::glow::optimizeFunction(func, *backend, cctx)); } @@ -825,10 +638,11 @@ llvm::Error Partitioner::Partition(CompilationContext &cctx) { } // Step 5 : do the real partitioning for the function list. - doPartitioning(origName, funcs, mapping, true); + partitions = + doPartitioning(origName, funcs, module_, mapping, /* saveDAG */ true); // DAG validation. - RETURN_IF_ERR(dagValidation(partitions_[0])); + RETURN_IF_ERR(dagValidation(partitions[0])); // Step 6 : Post-partition optimization - Adjust the logicalDevice for each // DAGNode. @@ -837,7 +651,7 @@ llvm::Error Partitioner::Partition(CompilationContext &cctx) { // Attempt to saturate the host when there is only one type of backend. // Passing in the count of logical devices. Since logicalId starts at 0 we // add one. - saturateHost(logicalDeviceID_); + saturateHost(logicalDeviceID_, partitions); } // Step 7 : clean up and verify the generated new functions. @@ -849,7 +663,7 @@ llvm::Error Partitioner::Partition(CompilationContext &cctx) { if (logPartition) { LOG(INFO) << "The number of partitions is : " << funcList.size() << ", and the DAG is dumped into DAG.dot file.\n"; - dumpDAG("DAG.dot"); + dumpDAG("DAG.dot", partitions); } for (Function *subF : funcList) { @@ -859,49 +673,53 @@ llvm::Error Partitioner::Partition(CompilationContext &cctx) { "__" + subF->getName().str() + "__" + mapping.getPartitionBackendName(subF) + ".dot"); } - assert(subF->verify() && "Conversion led to invalid function"); + DCHECK(subF->verify()) << "Conversion led to invalid function"; } if (logPartition) { logPartitionInfo(mapping); } - return llvm::Error::success(); + + return std::move(partitions); } -llvm::Error Partitioner::PartitionFromConfig() { - Function *F = module_->getFunction(partitionConfig_.funcName); +llvm::Expected +Partitioner::partitionFromConfig(const PartitionConfig &partitionConfig) { + DAGListTy partitions; + // Prepare the mapping between BackendName and BackendInfo.
+ std::vector backends; + genBackendMap(backendMap_, backendHolder, backends); + Function *F = module_->getFunction(partitionConfig.funcName); RETURN_ERR_IF_NOT(F, strFormat("Can't find function %s in current module.", F->getName().str().data())); - DCHECK(partitionConfig_.numOfPartitions == - partitionConfig_.backendNames.size() && - partitionConfig_.numOfPartitions == - partitionConfig_.partitionNames.size()) + DCHECK( + partitionConfig.numOfPartitions == partitionConfig.backendNames.size() && + partitionConfig.numOfPartitions == partitionConfig.partitionNames.size()) << "Invalid user-defined partition config."; NodeToFunctionMap partitionMap; std::vector funcList; std::unordered_set unused; - std::vector nodesSets(partitionConfig_.numOfPartitions); + std::vector nodesSets(partitionConfig.numOfPartitions); // Create partitions based on the given number and names. - for (size_t i = 0; i < partitionConfig_.numOfPartitions; i++) { - Function *newF = - module_->createFunction(partitionConfig_.partitionNames[i]); + for (size_t i = 0; i < partitionConfig.numOfPartitions; i++) { + Function *newF = module_->createFunction(partitionConfig.partitionNames[i]); funcList.push_back(newF); - partitionMap.createPartition(newF, partitionConfig_.backendNames[i]); + partitionMap.createPartition(newF, partitionConfig.backendNames[i]); unused.insert(i); } // Map the nodes the the partitions. std::vector unMapped; for (auto &node : F->getNodes()) { - auto iter = partitionConfig_.nodeToPartition.find(node.getName()); - if (iter == partitionConfig_.nodeToPartition.end()) { + auto iter = partitionConfig.nodeToPartition.find(node.getName()); + if (iter == partitionConfig.nodeToPartition.end()) { // If a node in F is not in the node to partition mapping, put it into // unMaped list. unMapped.push_back(&node); } else { size_t partitionID = iter->second; - DCHECK(partitionID < partitionConfig_.numOfPartitions) + DCHECK(partitionID < partitionConfig.numOfPartitions) << "Invalid partition id :" << partitionID; partitionMap.add(&node, funcList[partitionID]); unused.erase(partitionID); @@ -921,7 +739,7 @@ llvm::Error Partitioner::PartitionFromConfig() { } // Validate memory usage. - for (size_t i = 0; i < partitionConfig_.numOfPartitions; i++) { + for (size_t i = 0; i < partitionConfig.numOfPartitions; i++) { GraphMemInfo cost = getGraphMemInfo(nodesSets[i]); partitionMap.setGraphMemInfo(funcList[i], cost); } @@ -932,18 +750,19 @@ llvm::Error Partitioner::PartitionFromConfig() { RETURN_IF_ERR(logicalDevicesValidation(partitionMap, backendMap_)); // Do partition. - doPartitioning(F->getName(), {F}, partitionMap, true); + partitions = doPartitioning(F->getName(), {F}, module_, partitionMap, + /* saveDAG */ true); module_->eraseFunction(F); // DAG validation. - RETURN_IF_ERR(dagValidation(partitions_[0])); + RETURN_IF_ERR(dagValidation(partitions[0])); // Do optimization based on backendName. 
- for (size_t i = 0; i < partitionConfig_.numOfPartitions; i++) { + for (size_t i = 0; i < partitionConfig.numOfPartitions; i++) { auto func = funcList[i]; - assert(func->verify() && "Conversion led to invalid function"); + DCHECK(func->verify()) << "Conversion led to invalid function"; std::unique_ptr backend( - createBackend(partitionConfig_.backendNames[i])); + createBackend(partitionConfig.backendNames[i])); if (!optimized_) { CompilationContext cctx; RETURN_IF_ERR(::glow::optimizeFunction(func, *backend, cctx)); @@ -952,5 +771,20 @@ llvm::Error Partitioner::PartitionFromConfig() { if (logPartition) { logPartitionInfo(partitionMap); } - return llvm::Error::success(); + return std::move(partitions); +} + +llvm::Expected Partitioner::partition(CompilationContext &cctx) { + if (partitionConfig_.enabled()) { + // Call user-defined partition flow. + return partitionFromConfig(partitionConfig_); + } + + if (cctx.precisionConfig.quantMode == QuantizationMode::Profile) { + // Call quantization profiling partition flow. + return quantizationProfilingPartition(cctx); + } + + // Call heterogeneous partition flow. + return heterogeneousPartition(cctx); } diff --git a/lib/Partitioner/PartitionerBase.cpp b/lib/Partitioner/PartitionerBase.cpp new file mode 100644 index 0000000000..d4232b2fd4 --- /dev/null +++ b/lib/Partitioner/PartitionerBase.cpp @@ -0,0 +1,220 @@ +/** + * Copyright (c) 2017-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "glow/Partitioner/PartitionerBase.h" +#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace glow; +using llvm::isa; + +// Current only partition the representative function. +DAGListTy PartitionerBase::doPartitioning(llvm::StringRef funcName, + std::vector funcs, + Module *module, + NodeToFunctionMap &mapping, + bool saveDAG) { + DAGListTy partitions; + // Add a dummy node to make sure that a DAG has a single entrance. + DAGNodePtr DAGRoot = llvm::make_unique(); + DAGNodePtrVec nodes; + DAGRoot->logicalDevices = {0}; + DAGRoot->name = funcName; + DAGRoot->module = module; + DAGRoot->deviceIDs = {0}; + DAGNode *root = DAGRoot.get(); + + llvm::DenseMap currToNew; + + // Clone nodes into target partition. + for (size_t i = 0, e = funcs.size(); i < e; i++) { + for (auto &N : funcs[i]->getNodes()) { + auto *clone = N.clone(); + currToNew[&N] = clone; + mapping[&N]->addNode(clone); + } + } + + // For any dependency that crosses a partition, add a placeholder and save + // node. Record the dependence in the function graph. 
+ std::unordered_map placeholders; + llvm::DenseMap funcDAG; + for (auto *subF : mapping.getPartitions()) { + if (funcDAG.find(subF) == funcDAG.end()) { + std::unique_ptr subDAG = llvm::make_unique(); + subDAG->name = subF->getName(); + subDAG->logicalDevices = mapping.getLogicalDeviceIDList(subF); + subDAG->backendName = mapping.getPartitionBackendName(subF); + funcDAG[subF] = subDAG.get(); + nodes.push_back(std::move(subDAG)); + } + + // Link subF to its parents. + std::set parents; + for (auto &N : subF->getNodes()) { + for (int inp = 0, e = N.getNumInputs(); inp < e; inp++) { + auto input = N.getNthInput(inp); + // No need to check Constant since it won't be the result of another + // function. + if (isa(input.getNode())) { + continue; + } + + Function *inputF = nullptr; + // It is possible that one input is the output of another function. + if (Placeholder *ph = llvm::dyn_cast(input.getNode())) { + for (auto &user : ph->getUsers()) { + if (auto *save = llvm::dyn_cast(user.getUser())) { + placeholders[input] = save->getPlaceholder(); + inputF = mapping[user.getUser()]; + break; + } + } + if (!inputF) { + continue; + } + } + + if (!inputF) { + inputF = mapping[input.getNode()]; + } + if (subF == inputF) { + continue; + } + // Check if a DAGNode for subF's parent is created or not. If not, + // create one. + if (funcDAG.find(inputF) == funcDAG.end()) { + std::unique_ptr subDAG = llvm::make_unique(); + subDAG->name = inputF->getName(); + subDAG->logicalDevices = mapping.getLogicalDeviceIDList(inputF); + subDAG->backendName = mapping.getPartitionBackendName(inputF); + funcDAG[inputF] = subDAG.get(); + nodes.push_back(std::move(subDAG)); + } + + // subF is a child of inputF, inputF is a parent of subF. + if (parents.find(inputF) == parents.end()) { + funcDAG[inputF]->children.push_back(funcDAG[subF]); + funcDAG[subF]->parents.push_back(funcDAG[inputF]); + parents.insert(inputF); + } + // If we've already created a placeholder for this dependence, use it. + auto it = placeholders.find(input); + if (it != placeholders.end()) { + N.setNthInput(inp, it->second); + continue; + } + + // Create a new placeholder to represent this dependence. + auto *save = inputF->createSave("tmp", input); + auto *tmp = save->getPlaceholder(); + placeholders[input] = tmp; + N.setNthInput(inp, tmp); + } + } + } + + if (saveDAG) { + DAG dag; + dag.root = std::move(DAGRoot); + dag.nodes = std::move(nodes); + partitions.push_back(std::move(dag)); + } + + // Update links between nodes in the cloned functions. Add placeholders (and + // save nodes) where a link crosses a partition boundary. + for (auto *subF : mapping.getPartitions()) { + for (auto &N : subF->getNodes()) { + for (int inp = 0, e = N.getNumInputs(); inp < e; inp++) { + auto input = N.getNthInput(inp); + if (isa(input.getNode())) { + continue; + } + // Link this node to the clone of its input. + auto *clone = currToNew[input.getNode()]; + N.setNthInput(inp, NodeValue(clone, input.getResNo())); + } + } + } + + // For all DAGNodes without parents, link them to the root DAG.
+ for (auto *subF : mapping.getPartitions()) { + if (funcDAG[subF]->parents.size() == 0) { + funcDAG[subF]->parents.push_back(root); + root->children.push_back(funcDAG[subF]); + } + } + return partitions; +} + +void PartitionerBase::dumpDAG(llvm::StringRef dotFilename, + const DAGListTy &partitions) const { + if (partitions.size() == 0) { + return; + } + auto *root = partitions[0].root.get(); + LOG(INFO) << "Writing dotty graph for DAG after graph partitioning: " + << dotFilename.str(); + std::ofstream myfile; + myfile.open(dotFilename); + myfile << "digraph DAG {\n\trankdir=TB;\n"; + // Dump DAGNodes + std::vector nodes; + llvm::SmallSet used; + nodes.push_back(root); + int cur = 0; + int num = 1; + while (cur < num) { + auto *node = nodes[cur]; + for (size_t i = 0; i < node->children.size(); i++) { + auto child = node->children[i]; + DescriptionBuilder db(child->name.c_str()); + const std::string &backendName = child->backendName; + db.addParam("BackendName", backendName); + myfile << "\"" << escapeDottyString(child->name) << "\"" + << " [ label = \"" << escapeDottyString(db) << "\""; + myfile << "\tshape = \"record\"\n"; + myfile << "\tstyle=\"filled,rounded\"\n"; + auto colorIdx = llvm::hash_value(backendName); + myfile << "\tfillcolor=" << getDotFileNodeColor(colorIdx) << "\n"; + myfile << "penwidth = 2];\n"; + if (used.count(child) == 0) { + nodes.push_back(child); + used.insert(child); + num++; + } + } + cur++; + } + + // Dump edges. + for (size_t i = 0; i < nodes.size(); i++) { + auto *node = nodes[i]; + for (size_t j = 0; j < node->children.size(); j++) { + auto child = node->children[j]; + myfile << "\"" << escapeDottyString(node->name) << "\"" + << " -> " + << "\"" << escapeDottyString(child->name) << "\"" + << ";"; + } + } + myfile << "}"; + + myfile.close(); + return; +} diff --git a/lib/Runtime/HostManager/HostManager.cpp b/lib/Runtime/HostManager/HostManager.cpp index 359c4bb59c..fed0e0bca6 100644 --- a/lib/Runtime/HostManager/HostManager.cpp +++ b/lib/Runtime/HostManager/HostManager.cpp @@ -131,9 +131,9 @@ llvm::Error HostManager::addNetwork(std::unique_ptr module, for (Function *F : module->getFunctions()) { RETURN_IF_ERR(optimizeFunctionBeforeLowering(F, cctx)); } - auto partitioner = Partitioner(module.get(), deviceInfo, saturateHost); - RETURN_IF_ERR(partitioner.Partition(cctx)); - auto nodeList = std::move(partitioner.getPartitionResult()); + Partitioner partitioner(module.get(), deviceInfo, saturateHost); + DAGListTy nodeList; + ASSIGN_VALUE_OR_RETURN_ERR(nodeList, partitioner.partition(cctx)); if (cctx.precisionConfig.quantMode == QuantizationMode::Profile) { // Since for profiling the provisioner will be reset, we only allow one diff --git a/tests/unittests/PartitionerTest.cpp b/tests/unittests/PartitionerTest.cpp index 43559ed839..9731bb3486 100644 --- a/tests/unittests/PartitionerTest.cpp +++ b/tests/unittests/PartitionerTest.cpp @@ -163,18 +163,17 @@ TEST_F(PartitionerTest, Basic1) { {3072, "Interpreter"}, {3072, "Interpreter"}, {3072, "Interpreter"}}; Partitioner myPartitioner(&EEP.getModule(), devices, false, true); CompilationContext cctx; - auto err = myPartitioner.Partition(cctx); - EXPECT_FALSE(errToBool(std::move(err))); - DAGListTy dagList = std::move(myPartitioner.getPartitionResult()); + auto dagList = myPartitioner.partition(cctx); + EXPECT_TRUE((bool)dagList); EXPECT_EQ(EEP.getModule().getFunctions().size(), 3); - EXPECT_EQ(dagList.size(), 1); + EXPECT_EQ(dagList->size(), 1); EXPECT_TRUE(checkSaveNode(EEP.getModule())); // Run the paritioned graph and 
compare the results. bindings_.clear(); bindings_.allocate(EEP.getModule().getPlaceholders()); EEP.compile(cctx); - for (auto it = dagList.begin(); it != dagList.end(); ++it) { + for (auto it = dagList->begin(); it != dagList->end(); ++it) { executeDAG((*it).root.get(), EEP.getModule(), bindings_, {bindings_.getPlaceholderByName("input")}, {&in}, &EEP); Tensor test = bindings_.get(bindings_.getPlaceholderByName("ret"))->clone(); @@ -251,14 +250,12 @@ TEST_F(PartitionerTest, Basic2) { {2048, "Interpreter"}}; Partitioner myPartitioner(&EEP.getModule(), devices, /* saturateHost */ true); CompilationContext cctx; - auto err = myPartitioner.Partition(cctx); - EXPECT_FALSE(errToBool(std::move(err))); - DAGListTy dagList = std::move(myPartitioner.getPartitionResult()); + auto dagList = myPartitioner.partition(cctx); EXPECT_EQ(EEP.getModule().getFunctions().size(), 2); - EXPECT_EQ(dagList.size(), 1); + EXPECT_EQ(dagList->size(), 1); ASSERT_TRUE(checkSaveNode(EEP.getModule())); - for (auto &dag : dagList) { + for (auto &dag : dagList.get()) { for (auto &node : dag.nodes) { // Since saturateHost is set true, in this case, there should be 2 copys // of the partitions. @@ -270,7 +267,7 @@ TEST_F(PartitionerTest, Basic2) { bindings_.clear(); bindings_.allocate(EEP.getModule().getPlaceholders()); EEP.compile(cctx); - for (auto it = dagList.begin(); it != dagList.end(); ++it) { + for (auto it = dagList->begin(); it != dagList->end(); ++it) { updateInputPlaceholders(bindings_, {bindings_.getPlaceholderByName("input"), bindings_.getPlaceholderByName("input1")}, @@ -346,74 +343,61 @@ TEST_F(PartitionerTest, Error1) { std::vector devices = {{2048, "Interpreter"}}; Partitioner myPartitioner(&EEP.getModule(), devices); CompilationContext cctx; - auto err = myPartitioner.Partition(cctx); - EXPECT_TRUE(errToBool(std::move(err))); + auto dagList = myPartitioner.partition(cctx); + EXPECT_FALSE((bool)dagList); } /// This one tests the roofline computed with compute, memory and /// communication costs TEST_F(PartitionerTest, Basic1Roofline) { - ExecutionEngine EER, EEP; + ExecutionEngine EEP; constexpr float range = 2.0; - std::vector engines{&EER, &EEP}; - for (auto EE : engines) { - auto mod = &EE->getModule(); - F_ = mod->createFunction("main"); - auto *input = - mod->createPlaceholder(ElemKind::FloatTy, {1, 32}, "input", false); - auto *w1 = mod->createConstant(ElemKind::FloatTy, {32, 16}, "w1"); - auto *b1 = mod->createConstant(ElemKind::FloatTy, {16}, "b1"); - bindings_.allocate(input); - w1->getHandle<>().randomize(-range, range, mod->getPRNG()); - b1->getHandle<>().randomize(-range, range, mod->getPRNG()); - - // Initial FC. - Node *I = F_->createFullyConnected("initial_fc", input, w1, b1); - I = F_->createSigmoid("initial_sigmoid", I); - // Left branch. 
- auto *w2 = mod->createConstant(ElemKind::FloatTy, {16, 16}, "w2"); - auto *b2 = mod->createConstant(ElemKind::FloatTy, {16}, "b2"); - w2->getHandle<>().randomize(-range, range, mod->getPRNG()); - b2->getHandle<>().randomize(-range, range, mod->getPRNG()); - Node *L = F_->createFullyConnected("left_fc1", I, w2, b2); - L = F_->createSigmoid("left_sigmoid1", L); - auto *w3 = mod->createConstant(ElemKind::FloatTy, {16, 8}, "w3"); - auto *b3 = mod->createConstant(ElemKind::FloatTy, {8}, "b3"); - w3->getHandle<>().randomize(-range, range, mod->getPRNG()); - b3->getHandle<>().randomize(-range, range, mod->getPRNG()); - L = F_->createFullyConnected("left_fc2", L, w3, b3); - L = F_->createSigmoid("left_sigmoid2", L); + auto mod = &EEP.getModule(); + F_ = mod->createFunction("main"); + auto *input = + mod->createPlaceholder(ElemKind::FloatTy, {1, 32}, "input", false); + auto *w1 = mod->createConstant(ElemKind::FloatTy, {32, 16}, "w1"); + auto *b1 = mod->createConstant(ElemKind::FloatTy, {16}, "b1"); + bindings_.allocate(input); + w1->getHandle<>().randomize(-range, range, mod->getPRNG()); + b1->getHandle<>().randomize(-range, range, mod->getPRNG()); - // Right branch. - auto *w4 = mod->createConstant(ElemKind::FloatTy, {16, 16}, "w4"); - auto *b4 = mod->createConstant(ElemKind::FloatTy, {16}, "b4"); - w4->getHandle<>().randomize(-range, range, mod->getPRNG()); - b4->getHandle<>().randomize(-range, range, mod->getPRNG()); - Node *R = F_->createFullyConnected("right_fc1", I, w4, b4); - R = F_->createSigmoid("right_sigmoid1", R); - auto *w5 = mod->createConstant(ElemKind::FloatTy, {16, 8}, "w5"); - auto *b5 = mod->createConstant(ElemKind::FloatTy, {8}, "b5"); - w5->getHandle<>().randomize(-range, range, mod->getPRNG()); - b5->getHandle<>().randomize(-range, range, mod->getPRNG()); - R = F_->createFullyConnected("right_fc2", R, w5, b5); - R = F_->createSigmoid("right_sigmoid2", R); + // Initial FC. + Node *I = F_->createFullyConnected("initial_fc", input, w1, b1); + I = F_->createSigmoid("initial_sigmoid", I); - // Join branches. - auto *mul = F_->createMul("mul", L, R); - F_->createSave("ret", mul); - } + // Left branch. + auto *w2 = mod->createConstant(ElemKind::FloatTy, {16, 16}, "w2"); + auto *b2 = mod->createConstant(ElemKind::FloatTy, {16}, "b2"); + w2->getHandle<>().randomize(-range, range, mod->getPRNG()); + b2->getHandle<>().randomize(-range, range, mod->getPRNG()); + Node *L = F_->createFullyConnected("left_fc1", I, w2, b2); + L = F_->createSigmoid("left_sigmoid1", L); + auto *w3 = mod->createConstant(ElemKind::FloatTy, {16, 8}, "w3"); + auto *b3 = mod->createConstant(ElemKind::FloatTy, {8}, "b3"); + w3->getHandle<>().randomize(-range, range, mod->getPRNG()); + b3->getHandle<>().randomize(-range, range, mod->getPRNG()); + L = F_->createFullyConnected("left_fc2", L, w3, b3); + L = F_->createSigmoid("left_sigmoid2", L); - // Infer using the un-partitioned graph. - Tensor in(ElemKind::FloatTy, {1, 32}); - in.getHandle<>().randomize(-range, range, EER.getModule().getPRNG()); + // Right branch. 
+ auto *w4 = mod->createConstant(ElemKind::FloatTy, {16, 16}, "w4"); + auto *b4 = mod->createConstant(ElemKind::FloatTy, {16}, "b4"); + w4->getHandle<>().randomize(-range, range, mod->getPRNG()); + b4->getHandle<>().randomize(-range, range, mod->getPRNG()); + Node *R = F_->createFullyConnected("right_fc1", I, w4, b4); + R = F_->createSigmoid("right_sigmoid1", R); + auto *w5 = mod->createConstant(ElemKind::FloatTy, {16, 8}, "w5"); + auto *b5 = mod->createConstant(ElemKind::FloatTy, {8}, "b5"); + w5->getHandle<>().randomize(-range, range, mod->getPRNG()); + b5->getHandle<>().randomize(-range, range, mod->getPRNG()); + R = F_->createFullyConnected("right_fc2", R, w5, b5); + R = F_->createSigmoid("right_sigmoid2", R); - EER.compile(CompilationMode::Infer); - bindings_.clear(); - bindings_.allocate(EER.getModule().getPlaceholders()); - updateInputPlaceholders(bindings_, {bindings_.getPlaceholderByName("input")}, - {&in}); - EER.run(bindings_); + // Join branches. + auto *mul = F_->createMul("mul", L, R); + F_->createSave("ret", mul); // Since the partitioner will look at all nodesin the function post // optimization and lowering, we need to do so here for the same list of @@ -477,8 +461,8 @@ TEST_F(PartitionerTest, SelectRepFunc) { {1000000, "Interpreter"}}); CompilationContext cctx; - auto err = myPartitioner.Partition(cctx); - EXPECT_FALSE(errToBool(std::move(err))); + auto dagList = myPartitioner.partition(cctx); + EXPECT_TRUE((bool)dagList); } /// Create a mock backend and rewrite the isOpSupported function @@ -612,16 +596,14 @@ TEST_F(PartitionerTest, SimpleHeterogeneousPartitioning) { backends.emplace_back(&backendWithoutSub1); std::vector devices = { {3072, "Interpreter"}, {3072, "Interpreter"}, {3072, "CPU"}}; - auto partitioner = - Partitioner(&mod_, devices, backends, /* saturateHost */ true); + Partitioner partitioner(&mod_, devices, backends, /* saturateHost */ true); CompilationContext cctx; - auto err = partitioner.Partition(cctx); - EXPECT_FALSE(errToBool(std::move(err))); - DAGListTy dagList = std::move(partitioner.getPartitionResult()); + auto dagList = partitioner.partition(cctx); + EXPECT_TRUE((bool)dagList); EXPECT_EQ(mod_.getFunctions().size(), 3); - EXPECT_EQ(dagList.size(), 1); + EXPECT_EQ(dagList->size(), 1); ASSERT_TRUE(checkSaveNode(mod_)); - heterogeneousPartitionValidation(dagList, mod_); + heterogeneousPartitionValidation(dagList.get(), mod_); mod_.clear(); } @@ -634,15 +616,14 @@ TEST_F(PartitionerTest, heterogeneousPartitioningWithNonSupportedNodes) { std::vector devices = {{3072, "Interpreter", "Mul"}, {3072, "Interpreter", "Mul"}, {3072, "CPU", "Sub"}}; - auto partitioner = Partitioner(&mod_, devices); + Partitioner partitioner(&mod_, devices); CompilationContext cctx; - auto err = partitioner.Partition(cctx); - EXPECT_FALSE(errToBool(std::move(err))); - DAGListTy dagList = std::move(partitioner.getPartitionResult()); + auto dagList = partitioner.partition(cctx); + EXPECT_TRUE((bool)dagList); EXPECT_EQ(mod_.getFunctions().size(), 3); - EXPECT_EQ(dagList.size(), 1); + EXPECT_EQ(dagList->size(), 1); ASSERT_TRUE(checkSaveNode(mod_)); - heterogeneousPartitionValidation(dagList, mod_); + heterogeneousPartitionValidation(dagList.get(), mod_); mod_.clear(); } @@ -658,15 +639,14 @@ TEST_F(PartitionerTest, heterogeneousPartitioningWithSupportedNodes) { {3072, "Interpreter", "", "Sub,Add,Save"}, {3072, "Interpreter", "", "Sub,Add,Save"}, {3072, "CPU", "", "Mul,Add,Save"}}; - auto partitioner = Partitioner(&mod_, devices); + Partitioner partitioner(&mod_, devices); 
CompilationContext cctx; - auto err = partitioner.Partition(cctx); - EXPECT_FALSE(errToBool(std::move(err))); - DAGListTy dagList = std::move(partitioner.getPartitionResult()); + auto dagList = partitioner.partition(cctx); + EXPECT_TRUE((bool)dagList); EXPECT_EQ(mod_.getFunctions().size(), 3); - EXPECT_EQ(dagList.size(), 1); + EXPECT_EQ(dagList->size(), 1); ASSERT_TRUE(checkSaveNode(mod_)); - heterogeneousPartitionValidation(dagList, mod_); + heterogeneousPartitionValidation(dagList.get(), mod_); mod_.clear(); } @@ -694,17 +674,16 @@ TEST_F(PartitionerTest, logicalIDTest0) { {1500, "Interpreter"}}; // Create two backends which support different ops, then do the partition by // assigning the ops to the corresponding abackends. - auto partitioner = Partitioner(&mod_, devices, /* saturateHost */ true); + Partitioner partitioner(&mod_, devices, /* saturateHost */ true); CompilationContext cctx; - auto err = partitioner.Partition(cctx); - EXPECT_FALSE(errToBool(std::move(err))); - DAGListTy dagList = std::move(partitioner.getPartitionResult()); + auto dagList = partitioner.partition(cctx); + EXPECT_TRUE((bool)dagList); // Check there are 3 partitions. EXPECT_EQ(mod_.getFunctions().size(), 3); - EXPECT_EQ(dagList.size(), 1); + EXPECT_EQ(dagList->size(), 1); ASSERT_TRUE(checkSaveNode(mod_)); - for (auto &dag : dagList) { + for (auto &dag : dagList.get()) { // Check number of logical devices; llvm::SmallSet usedID; for (auto &node : dag.nodes) { @@ -729,17 +708,15 @@ TEST_F(PartitionerTest, logicalIDTest1) { backends.emplace_back(&backendWithoutMul1); backends.emplace_back(&backendWithoutSub1); std::vector devices = {{3072, "Interpreter"}, {3072, "CPU"}}; - auto partitioner = - Partitioner(&mod_, devices, backends, /* saturateHost */ true); + Partitioner partitioner(&mod_, devices, backends, /* saturateHost */ true); CompilationContext cctx; - auto err = partitioner.Partition(cctx); - EXPECT_FALSE(errToBool(std::move(err))); - DAGListTy dagList = std::move(partitioner.getPartitionResult()); + auto dagList = partitioner.partition(cctx); + EXPECT_TRUE((bool)dagList); EXPECT_EQ(mod_.getFunctions().size(), 3); - EXPECT_EQ(dagList.size(), 1); + EXPECT_EQ(dagList->size(), 1); ASSERT_TRUE(checkSaveNode(mod_)); - for (auto &dag : dagList) { + for (auto &dag : dagList.get()) { // Check number of logical devices; llvm::SmallSet usedID; for (auto &node : dag.nodes) { @@ -908,8 +885,8 @@ TEST_F(PartitionerTest, memoryUsageValidation1) { {500, "Interpreter"}}; Partitioner myPartitioner(&mod_, devices); CompilationContext cctx; - auto err = myPartitioner.Partition(cctx); - EXPECT_TRUE(errToBool(std::move(err))); + auto dagList = myPartitioner.partition(cctx); + EXPECT_FALSE((bool)dagList); } /// This one test dagValidation in partitioner : p1->p2, p2->p1. @@ -937,8 +914,8 @@ TEST_F(PartitionerTest, dagValidation1) { partitionConfig.nodeToPartition = {{"add2", 0}}; auto partitioner = Partitioner(&mod_, devices, false, false, partitionConfig); CompilationContext cctx; - auto err = partitioner.Partition(cctx); - EXPECT_TRUE(errToBool(std::move(err))); + auto dagList = partitioner.partition(cctx); + EXPECT_FALSE((bool)dagList); } /// This one test dagValidation in partitioner: p0->p1, p1->p2, p2->p1. 
@@ -969,8 +946,8 @@ TEST_F(PartitionerTest, dagValidation2) { partitionConfig.nodeToPartition = {{"add0", 0}, {"add2", 2}}; auto partitioner = Partitioner(&mod_, devices, false, false, partitionConfig); CompilationContext cctx; - auto err = partitioner.Partition(cctx); - EXPECT_TRUE(errToBool(std::move(err))); + auto dagList = partitioner.partition(cctx); + EXPECT_FALSE((bool)dagList); } /// This one tests partition from a user-defined config. @@ -987,13 +964,36 @@ TEST_F(PartitionerTest, partitionFromConfig) { partitionConfig.backendNames = {"Interpreter", "CPU", "Interpreter"}; partitionConfig.partitionNames = {"p1", "p2", "p3"}; partitionConfig.nodeToPartition = {{"sub", 0}, {"mul", 1}}; - auto partitioner = Partitioner(&mod_, devices, false, false, partitionConfig); + Partitioner partitioner(&mod_, devices, false, false, partitionConfig); + CompilationContext cctx; + auto dagList = partitioner.partition(cctx); + EXPECT_TRUE((bool)dagList); + EXPECT_EQ(mod_.getFunctions().size(), 3); + EXPECT_EQ(dagList->size(), 1); + ASSERT_TRUE(checkSaveNode(mod_)); + heterogeneousPartitionValidation(dagList.get(), mod_); +} + +/// This one tests calling PartitionFromConfig directly. +TEST_F(PartitionerTest, partitionFromConfigDirectCall) { + createSimpleModule(mod_); + std::vector devices = { + {3072, "Interpreter"}, {3072, "Interpreter"}, {3072, "CPU"}}; + + // User-defined partition: 3 partitions (2 interpreter, 1 cpu), Mul nodes to + // CPU, others to Interpreter. + PartitionConfig partitionConfig; + partitionConfig.funcName = "test"; + partitionConfig.numOfPartitions = 3; + partitionConfig.backendNames = {"Interpreter", "CPU", "Interpreter"}; + partitionConfig.partitionNames = {"p1", "p2", "p3"}; + partitionConfig.nodeToPartition = {{"sub", 0}, {"mul", 1}}; + Partitioner partitioner(&mod_, devices); CompilationContext cctx; - auto err = partitioner.Partition(cctx); - EXPECT_FALSE(errToBool(std::move(err))); - DAGListTy dagList = std::move(partitioner.getPartitionResult()); + auto dagList = partitioner.partitionFromConfig(partitionConfig); + EXPECT_TRUE((bool)dagList); EXPECT_EQ(mod_.getFunctions().size(), 3); - EXPECT_EQ(dagList.size(), 1); + EXPECT_EQ(dagList->size(), 1); ASSERT_TRUE(checkSaveNode(mod_)); - heterogeneousPartitionValidation(dagList, mod_); + heterogeneousPartitionValidation(dagList.get(), mod_); } diff --git a/tests/unittests/RecommendationSystemTest.cpp b/tests/unittests/RecommendationSystemTest.cpp index fc05679595..4dc97dfd4e 100644 --- a/tests/unittests/RecommendationSystemTest.cpp +++ b/tests/unittests/RecommendationSystemTest.cpp @@ -949,13 +949,12 @@ class RecommendationSystemTest : public BackendTest { // Use the same precision transformation for compilation. CompilationContext cctx; cctx.precisionConfig = precConfig_; - EXIT_ON_ERR(myPartitioner.Partition(cctx)); - - DAGListTy myList = std::move(myPartitioner.getPartitionResult()); + auto myList = myPartitioner.partition(cctx); + ASSERT_TRUE((bool)myList); std::cout << "Partitions = " << pMod->getFunctions().size() << std::endl; ASSERT_LE(pMod->getFunctions().size(), numDevices); - ASSERT_EQ(myList.size(), 1); - DAG &dag = myList.front(); + ASSERT_EQ(myList->size(), 1); + DAG &dag = myList->front(); // Run the partitioned graph and compare the results.
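Usage sketch (not part of the diff): the minimal caller below, modeled on the HostManager.cpp and PartitionerTest.cpp changes above, shows how the new llvm::Expected-based interface is consumed. The wrapper function addPartitionedNetwork() and its exact parameters are illustrative assumptions, not code from this patch.

#include "glow/Partitioner/Partitioner.h"
#include "glow/Support/Error.h"

using namespace glow;
using namespace glow::runtime;

// Hypothetical helper: partition a module for a set of devices and hand the
// resulting DAG list back to the caller, propagating failures via llvm::Error.
llvm::Error addPartitionedNetwork(Module *module,
                                  const std::vector<DeviceInfo> &deviceInfo,
                                  bool saturateHost, DAGListTy &outNodeList) {
  // The Partitioner no longer stores the partition result internally.
  Partitioner partitioner(module, deviceInfo, saturateHost);
  CompilationContext cctx;

  // partition() dispatches to the user-defined, quantization-profiling, or
  // heterogeneous flow and returns the DAG list or an error.
  DAGListTy nodeList;
  ASSIGN_VALUE_OR_RETURN_ERR(nodeList, partitioner.partition(cctx));

  // Dumping the DAG now takes the result explicitly instead of reading
  // member state.
  partitioner.dumpDAG("DAG.dot", nodeList);

  outNodeList = std::move(nodeList);
  return llvm::Error::success();
}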