diff --git a/include/glow/Partitioner/Partitioner.h b/include/glow/Partitioner/Partitioner.h
index 6f4fe9ce9a..83b80f5cd5 100644
--- a/include/glow/Partitioner/Partitioner.h
+++ b/include/glow/Partitioner/Partitioner.h
@@ -16,7 +16,7 @@
 #ifndef GLOW_PARTITIONER_PARTITIONER_H
 #define GLOW_PARTITIONER_PARTITIONER_H
 
-#include "glow/Partitioner/PartitionerUtils.h"
+#include "glow/Partitioner/PartitionerTypes.h"
 #include "glow/Support/Error.h"
 
 namespace glow {
@@ -26,9 +26,6 @@ using namespace runtime;
 
 /// Given a module, partitions each of the its functions into multiple ones
 /// based on memory constraints and minimizes the communication cost.
 class Partitioner {
-  using MemUsageMap = std::unordered_map<Node *, uint64_t>;
-  using ComputeTimeMap = std::unordered_map<Node *, float>;
-
   /// The module that needs to be decomposed.
   Module *module_;
@@ -59,12 +56,6 @@ class Partitioner {
   /// Total memory (bytes) requested by one module.
   uint64_t memSize_;
 
-  /// The map of each operator and the corresponding memory size.
-  MemUsageMap memUsage_;
-
-  /// The map of each operator and the compute runtime.
-  ComputeTimeMap computeTime_;
-
   /// Flag to set if the Partitioner should attempt to saturate the host, and
   /// use all available devices.
   bool saturateHost_;
@@ -81,12 +72,6 @@ class Partitioner {
   /// update the memSize.
   static Function *selectRepFunc(Module *parent, uint64_t &memSize);
 
-  /// Get the minimal memory requirement for each op in the function \p F
-  void initOpMemUsage(Function *F);
-
-  /// Inititalize the minimal compute time for each op in the function \p F.
-  void initOpComputeTime(Function *F);
-
   /// After getting the initial partitions, adjust the partitions to minimize
   /// communication and computation cost.
   void partitionsAdjust(NodeToFunctionMap &partitions,
@@ -180,20 +165,6 @@ class Partitioner {
   /// a function family and they have the same partition, we only dump the one
   /// function's partition.
   void dumpDAG(llvm::StringRef dotFilename) const;
-
-  /// Get function for computeTime_
-  float getComputeTime(Node *N) const {
-    auto it = computeTime_.find(N);
-    assert(it != computeTime_.end());
-    return it == computeTime_.end() ? 0.0 : it->second;
-  }
-
-  /// Get function for memUsage_
-  uint64_t getMemUsage(Node *N) const {
-    auto it = memUsage_.find(N);
-    assert(it != memUsage_.end());
-    return it == memUsage_.end() ? 0 : it->second;
-  }
 };
 } // namespace glow
 #endif // GLOW_PARTITIONER_PARTITIONER_H
diff --git a/include/glow/Partitioner/PartitionerTypes.h b/include/glow/Partitioner/PartitionerTypes.h
index 5006b573df..de517e80c4 100644
--- a/include/glow/Partitioner/PartitionerTypes.h
+++ b/include/glow/Partitioner/PartitionerTypes.h
@@ -65,6 +65,17 @@ struct BackendInfo {
   size_t num = 0;
   /// The memory constraints for this backend.
   uint64_t memSize;
+  /// The sramCapacity, peakCompute, peakDramBw, peakSramBw, and peakPCIeBw
+  /// fields below come from DeviceInfo_. Available SRAM capacity in bytes.
+  uint64_t sramCapacity;
+  /// Peak compute on device in ops/second. Assumes all ops are in int8.
+  float peakCompute;
+  /// Peak memory bandwidth from DRAM on device in bytes/second.
+  float peakDramBw;
+  /// Peak memory bandwidth from SRAM on device in bytes/second.
+  float peakSramBw;
+  /// Peak ingress/egress PCI-E bandwidth from device in bytes/second.
+  float peakPCIeBw;
   /// Backend pointer.
   Backend *backend = nullptr;
   /// The non-supported nodes kind.
diff --git a/include/glow/Partitioner/PartitionerUtils.h b/include/glow/Partitioner/PartitionerUtils.h
index 3a466333df..7bc265075f 100644
--- a/include/glow/Partitioner/PartitionerUtils.h
+++ b/include/glow/Partitioner/PartitionerUtils.h
@@ -36,9 +36,15 @@ std::vector<Node *> getOutUsersWithOnePredecessor(const NodesSet &nodes);
 /// in the set \p nodes.
 uint64_t getOutMemPerNode(const NodesSet &nodes, const Node *node);
 
-/// Given a node, \return the NodeSet of inputs of this node.
+/// Given a node, \returns the NodeSet of inputs of this node.
 NodesSet getInputs(const Node *node);
 
+/// \returns the estimated compute time of \p node based on \p backendInfo.
+float getNodeComputeTime(const Node *node, const BackendInfo &backendInfo);
+
+/// Given a node, \returns the memory usage of its Storage inputs.
+uint64_t getNodeMemUsage(const Node *node);
+
 /// Given nodes set \p currNodes and its memory usage info \p info, \returns the
 /// new memory usage if \p newNode is added into \p currNodes.
 GraphMemInfo updateGraphMemInfoByAddingNode(const NodesSet &currNodes,
diff --git a/lib/Partitioner/Partitioner.cpp b/lib/Partitioner/Partitioner.cpp
index 50a157f8af..23d6af2b7f 100644
--- a/lib/Partitioner/Partitioner.cpp
+++ b/lib/Partitioner/Partitioner.cpp
@@ -17,6 +17,7 @@
 #include "glow/Partitioner/Partitioner.h"
 #include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"
 #include "glow/Partitioner/PartitionerOptimizer.h"
+#include "glow/Partitioner/PartitionerUtils.h"
 #include "glow/Partitioner/PartitionerValidation.h"
 #include "glow/Support/Support.h"
 
@@ -174,227 +175,6 @@ Function *Partitioner::selectRepFunc(Module *parent, uint64_t &memSize) {
   return ret;
 }
 
-/// Get the minimal memory requirement (constant) for each op in the function.
-void Partitioner::initOpMemUsage(Function *F) {
-  memUsage_.clear();
-  for (auto &node : F->getNodes()) {
-    int n = node.getNumInputs();
-    uint64_t size = 0;
-    if (node.getKind() == Kinded::Kind::SaveNodeKind) {
-      memUsage_[&node] = size;
-      continue;
-    }
-    for (int i = 0; i < n; i++) {
-      Storage *in = llvm::dyn_cast<Storage>(node.getNthInput(i).getNode());
-      if (in) {
-        auto ty = in->getType();
-        size += ty->getSizeInBytes();
-      }
-    }
-    memUsage_[&node] = size;
-  }
-}
-
-/// Get the minimal compute time for each op in the function.
-void Partitioner::initOpComputeTime(Function *F) {
-  computeTime_.clear();
-
-  // This code assumes all ops are BW limited from SRAM; except
-  // if the input does not fit in SRAM -- then it is DRAM BW limited
-  float peakDramBw = deviceInfo_[0].peakDramBw;
-  float peakSramBw = deviceInfo_[0].peakSramBw;
-  uint64_t sramCapacity = deviceInfo_[0].sramCapacity;
-  float peakCompute = deviceInfo_[0].peakCompute;
-
-  for (auto &node : F->getNodes()) {
-    // compute memory side bytes for inputs from DRAM, SRAM.
-    // TODO: think about whether this is better off computed inside a Node.
-
-    int n = node.getNumInputs();
-    uint64_t sizeDram = 0;
-    uint64_t sizeSram = 0;
-    if (node.getKind() == Kinded::Kind::SaveNodeKind) {
-      computeTime_[&node] = 0.0f;
-      continue;
-    }
-
-    // The memory bytes for embedding table lookups is data dependent,
-    // so it needs to be calculated as per the number of indices accessed.
-    if (node.getKind() == Kinded::Kind::SparseLengthsWeightedSumNodeKind) {
-      auto *SLWSN = llvm::dyn_cast<SparseLengthsWeightedSumNode>(&node);
-      // compute how many entries of the embedding table we look up
-      auto numLookups = SLWSN->getIndices().dims().front();
-      // compute how many bytes we read per lookup
-      auto tableSize = SLWSN->getData().getType()->getSizeInBytes();
-      auto numRows = SLWSN->getData().dims().front();
-      auto sizePerLookup = tableSize / numRows;
-      // compute total bytes read
-      uint64_t sizeInput = numLookups * sizePerLookup;
-
-      // tables are usually large and fit in DRAM
-      sizeDram += sizeInput;
-      // we also read the indices, weights and lengths arrays
-      sizeSram += SLWSN->getIndices().getType()->getSizeInBytes();
-      sizeSram += SLWSN->getWeights().getType()->getSizeInBytes();
-      sizeSram += SLWSN->getLengths().getType()->getSizeInBytes();
-    } else if (node.getKind() == Kinded::Kind::SparseLengthsSumNodeKind) {
-      auto *SLSN = llvm::dyn_cast<SparseLengthsSumNode>(&node);
-      // compute how many entries of the embedding table we look up
-      auto numLookups = SLSN->getIndices().dims().front();
-      // compute how many bytes we read per lookup
-      auto tableSize = SLSN->getData().getType()->getSizeInBytes();
-      auto numRows = SLSN->getData().dims().front();
-      auto sizePerLookup = tableSize / numRows;
-      // compute total bytes read
-      uint64_t sizeInput = numLookups * sizePerLookup;
-
-      // tables are usually large and fit in DRAM
-      sizeDram += sizeInput;
-      // we also read the indices and lengths arrays
-      sizeSram += SLSN->getIndices().getType()->getSizeInBytes();
-      sizeSram += SLSN->getLengths().getType()->getSizeInBytes();
-    } else if (node.getKind() ==
-               Kinded::Kind::
-                   FusedRowwiseQuantizedSparseLengthsWeightedSumNodeKind) {
-      auto *FRQSLWSN =
-          llvm::dyn_cast<FusedRowwiseQuantizedSparseLengthsWeightedSumNode>(
-              &node);
-      // compute how many entries of the embedding table we look up
-      auto numLookups = FRQSLWSN->getIndices().dims().front();
-      // compute how many bytes we read per lookup
-      auto tableSize = FRQSLWSN->getData().getType()->getSizeInBytes();
-      auto numRows = FRQSLWSN->getData().dims().front();
-      auto sizePerLookup = tableSize / numRows;
-      // compute total bytes read
-      uint64_t sizeInput = numLookups * sizePerLookup;
-
-      // tables are usually large and fit in DRAM
-      sizeDram += sizeInput;
-
-      // we also read the indices, weights and lengths arrays
-      sizeSram += FRQSLWSN->getIndices().getType()->getSizeInBytes();
-      sizeSram += FRQSLWSN->getWeights().getType()->getSizeInBytes();
-      sizeSram += FRQSLWSN->getLengths().getType()->getSizeInBytes();
-    } else if (node.getKind() ==
-               Kinded::Kind::FusedRowwiseQuantizedSparseLengthsSumNodeKind) {
-      auto *FRQSLSN =
-          llvm::dyn_cast<FusedRowwiseQuantizedSparseLengthsSumNode>(&node);
-      // compute how many entries of the embedding table we look up
-      auto numLookups = FRQSLSN->getIndices().dims().front();
-      // compute how many bytes we read per lookup
-      auto tableSize = FRQSLSN->getData().getType()->getSizeInBytes();
-      auto numRows = FRQSLSN->getData().dims().front();
-      auto sizePerLookup = tableSize / numRows;
-      // compute total bytes read
-      uint64_t sizeInput = numLookups * sizePerLookup;
-
-      // tables are usually large and fit in DRAM
-      sizeDram += sizeInput;
-
-      // we also read the indices and lengths arrays
-      sizeSram += FRQSLSN->getIndices().getType()->getSizeInBytes();
-      sizeSram += FRQSLSN->getLengths().getType()->getSizeInBytes();
-    } else {
-      // for all other ops, iterate through all inputs and get size in bytes
-      for (int i = 0; i < n; i++) {
-        auto ty = node.getNthInput(i).getType();
-        uint64_t sizeInput = ty->getSizeInBytes();
-        if (sizeInput > sramCapacity) {
-          sizeDram += sizeInput;
-        } else {
-          sizeSram += sizeInput;
-        }
-      }
-    }
-
-    // Repeat for outputs
-    for (size_t i = 0, e = node.getNumResults(); i < e; i++) {
-      auto myty = node.getType(i);
-      uint64_t sizeOutput = myty->getSizeInBytes();
-      if (sizeOutput > sramCapacity) {
-        sizeDram += sizeOutput;
-      } else {
-        sizeSram += sizeOutput;
-      }
-    }
-
-    // Calculate compute ops. Currently only computed for Matmul, Conv, FC
-    // TODO: think about whether this is better off computed inside a Node.
-    uint64_t totalOps = 0;
-    switch (node.getKind()) {
-    case Kinded::Kind::MatMulNodeKind: {
-      auto *MMN = llvm::dyn_cast<MatMulNode>(&node);
-      auto lhsDims = MMN->getLHS().dims();
-      auto rhsDims = MMN->getRHS().dims();
-      totalOps = 2 * lhsDims[0] * lhsDims[1] * rhsDims[1];
-      break;
-    }
-    case Kinded::Kind::FullyConnectedNodeKind: {
-      auto *FCN = llvm::dyn_cast<FullyConnectedNode>(&node);
-      auto inputDims = FCN->getInput().dims();
-      auto wtDims = FCN->getWeights().dims();
-      totalOps = 2 * inputDims[0] * inputDims[1] * wtDims[0];
-      break;
-    }
-#ifdef GLOW_WITH_HABANA
-    case Kinded::Kind::HabanaFullyConnectedNodeKind: {
-      auto *FCN = llvm::dyn_cast<HabanaFullyConnectedNode>(&node);
-      auto inputDims = FCN->getInput().dims();
-      auto wtDims = FCN->getWeights().dims();
-      totalOps = 2 * inputDims[0] * inputDims[1] * wtDims[0];
-      break;
-    }
-#endif
-    case Kinded::Kind::ConvolutionNodeKind: {
-      auto *CN = llvm::dyn_cast<ConvolutionNode>(&node);
-      auto resultDims = CN->getResult().dims();
-      // Get the product of batch, output height, output dims, output channels
-      totalOps = resultDims[0];
-      for (size_t i = 1, e = resultDims.size(); i < e; i++) {
-        totalOps *= resultDims[i];
-      }
-      // Multiply in kernel height, kernel width
-      auto kernelDims = CN->getKernels();
-      totalOps *= kernelDims[0] * kernelDims[1];
-      // Multiply in input channels/groups
-      auto inputChannels = CN->getInput().dims()[1];
-      auto nGroups = CN->getGroup();
-      totalOps *= (inputChannels * 1.0 / nGroups);
-      break;
-    }
-#ifdef GLOW_WITH_HABANA
-    case Kinded::Kind::HabanaConvolutionNodeKind: {
-      auto *CN = llvm::dyn_cast<HabanaConvolutionNode>(&node);
-      auto resultDims = CN->getResult().dims();
-      // Get the product of batch, output height, output dims, output channels
-      totalOps = resultDims[0];
-      for (size_t i = 1, e = resultDims.size(); i < e; i++) {
-        totalOps *= resultDims[i];
-      }
-      // Multiply in kernel height, kernel width
-      auto kernelDims = CN->getKernels();
-      totalOps *= kernelDims[0] * kernelDims[1];
-      // Multiply in input channels/groups
-      auto inputChannels = CN->getInput().dims()[1];
-      auto nGroups = CN->getGroup();
-      totalOps *= (inputChannels * 1.0 / nGroups);
-      break;
-    }
-#endif
-    default:
-      break;
-    }
-
-    // Compute compute roofline as max of flops, DRAM, SRAM BW
-    // See https://bit.ly/2UdJ3mz
-    // Add epsilons to prevent seg faults on uninitialized peak values.
-    computeTime_[&node] =
-        std::max(totalOps * 1.0f / std::max(peakCompute, 1e-6f),
-                 std::max(sizeDram * 1.0f / std::max(peakDramBw, 1e-6f),
-                          sizeSram * 1.0f / std::max(peakSramBw, 1e-6f)));
-  }
-}
-
 void Partitioner::partitionsAdjust(NodeToFunctionMap &partitions,
                                    uint64_t availableMemory) {
   // For each partition, create a node set.
@@ -743,6 +523,10 @@ void Partitioner::getBackendMap(
     // is the same.
    // TODO : will improve the algorithm for different memory size.
    backendInfo.memSize = deviceInfo_[i].availableMemory;
+    backendInfo.peakDramBw = deviceInfo_[i].peakDramBw;
+    backendInfo.peakSramBw = deviceInfo_[i].peakSramBw;
+    backendInfo.sramCapacity = deviceInfo_[i].sramCapacity;
+    backendInfo.peakCompute = deviceInfo_[i].peakCompute;
    backendInfo.nonSupportedNodesKinds =
        generateNodeKindsSet(deviceInfo_[i].nonSupportedNodes);
    backendInfo.supportedNodesKinds =
@@ -827,7 +611,8 @@ llvm::Error Partitioner::loadBalancedPartitioning(Function *F,
   // Compute total roofline time
   float totalRooflineTime = 0;
   for (auto &n : F->getNodes()) {
-    totalRooflineTime += getComputeTime(&n);
+    totalRooflineTime +=
+        getNodeComputeTime(&n, backendMap_[deviceInfo_[0].backendName]);
   }
 
   float timePerPartition = totalRooflineTime / numDevices;
@@ -869,8 +654,9 @@ llvm::Error Partitioner::loadBalancedPartitioning(Function *F,
      }
    }
 
-    auto curOpTime = getComputeTime(N);
-    auto curOpMemory = getMemUsage(N);
+    auto curOpTime =
+        getNodeComputeTime(N, backendMap_[deviceInfo_[0].backendName]);
+    auto curOpMemory = getNodeMemUsage(N);
 
    // Find a partition to put this node into
    int curPartition = maxLogicalDeviceId;
@@ -1001,12 +787,7 @@ llvm::Error Partitioner::Partition(CompilationContext &cctx) {
      RETURN_IF_ERR(::glow::optimizeFunction(func, *backend, cctx));
    }
 
-    // Step 2.2 : get the min memory usage and the roofline memory bandwidth
-    // estimate for each op.
-    initOpMemUsage(func);
-    initOpComputeTime(func);
-
-    // Step 2.3 : apply graph partitioning algrithm to find out the partition.
+    // Step 2.2 : apply graph partitioning algorithm to find out the partition.
    NodeToFunctionMap partitionMap =
        selectPartitions(func, availMem, i->second);
    mapping.insert(partitionMap);
diff --git a/lib/Partitioner/PartitionerUtils.cpp b/lib/Partitioner/PartitionerUtils.cpp
index cf002a87af..277944488f 100644
--- a/lib/Partitioner/PartitionerUtils.cpp
+++ b/lib/Partitioner/PartitionerUtils.cpp
@@ -171,6 +171,212 @@ NodesSet getInputs(const Node *node) {
   return result;
 }
 
+uint64_t getNodeMemUsage(const Node *node) {
+  if (node->getKind() == Kinded::Kind::SaveNodeKind) {
+    return 0;
+  }
+  uint64_t size = 0;
+  for (size_t i = 0, e = node->getNumInputs(); i < e; i++) {
+    Storage *in = llvm::dyn_cast<Storage>(node->getNthInput(i).getNode());
+    if (in) {
+      auto ty = in->getType();
+      size += ty->getSizeInBytes();
+    }
+  }
+  return size;
+}
+
+float getNodeComputeTime(const Node *node, const BackendInfo &backendInfo) {
+  // This code assumes all ops are BW limited from SRAM; except
+  // if the input does not fit in SRAM -- then it is DRAM BW limited
+  float peakDramBw = backendInfo.peakDramBw;
+  float peakSramBw = backendInfo.peakSramBw;
+  uint64_t sramCapacity = backendInfo.sramCapacity;
+  float peakCompute = backendInfo.peakCompute;
+
+  // compute memory side bytes for inputs from DRAM, SRAM.
+  // TODO: think about whether this is better off computed inside a Node.
+
+  int n = node->getNumInputs();
+  uint64_t sizeDram = 0;
+  uint64_t sizeSram = 0;
+  if (node->getKind() == Kinded::Kind::SaveNodeKind) {
+    return 0.0f;
+  }
+  // The memory bytes for embedding table lookups is data dependent,
+  // so it needs to be calculated as per the number of indices accessed.
+  if (node->getKind() == Kinded::Kind::SparseLengthsWeightedSumNodeKind) {
+    auto *SLWSN = llvm::dyn_cast<SparseLengthsWeightedSumNode>(node);
+    // compute how many entries of the embedding table we look up
+    auto numLookups = SLWSN->getIndices().dims().front();
+    // compute how many bytes we read per lookup
+    auto tableSize = SLWSN->getData().getType()->getSizeInBytes();
+    auto numRows = SLWSN->getData().dims().front();
+    auto sizePerLookup = tableSize / numRows;
+    // compute total bytes read
+    uint64_t sizeInput = numLookups * sizePerLookup;
+
+    // tables are usually large and fit in DRAM
+    sizeDram += sizeInput;
+    // we also read the indices, weights and lengths arrays
+    sizeSram += SLWSN->getIndices().getType()->getSizeInBytes();
+    sizeSram += SLWSN->getWeights().getType()->getSizeInBytes();
+    sizeSram += SLWSN->getLengths().getType()->getSizeInBytes();
+  } else if (node->getKind() == Kinded::Kind::SparseLengthsSumNodeKind) {
+    auto *SLSN = llvm::dyn_cast<SparseLengthsSumNode>(node);
+    // compute how many entries of the embedding table we look up
+    auto numLookups = SLSN->getIndices().dims().front();
+    // compute how many bytes we read per lookup
+    auto tableSize = SLSN->getData().getType()->getSizeInBytes();
+    auto numRows = SLSN->getData().dims().front();
+    auto sizePerLookup = tableSize / numRows;
+    // compute total bytes read
+    uint64_t sizeInput = numLookups * sizePerLookup;
+
+    // tables are usually large and fit in DRAM
+    sizeDram += sizeInput;
+    // we also read the indices and lengths arrays
+    sizeSram += SLSN->getIndices().getType()->getSizeInBytes();
+    sizeSram += SLSN->getLengths().getType()->getSizeInBytes();
+  } else if (node->getKind() ==
+             Kinded::Kind::
+                 FusedRowwiseQuantizedSparseLengthsWeightedSumNodeKind) {
+    auto *FRQSLWSN =
+        llvm::dyn_cast<FusedRowwiseQuantizedSparseLengthsWeightedSumNode>(node);
+    // compute how many entries of the embedding table we look up
+    auto numLookups = FRQSLWSN->getIndices().dims().front();
+    // compute how many bytes we read per lookup
+    auto tableSize = FRQSLWSN->getData().getType()->getSizeInBytes();
+    auto numRows = FRQSLWSN->getData().dims().front();
+    auto sizePerLookup = tableSize / numRows;
+    // compute total bytes read
+    uint64_t sizeInput = numLookups * sizePerLookup;
+
+    // tables are usually large and fit in DRAM
+    sizeDram += sizeInput;
+
+    // we also read the indices, weights and lengths arrays
+    sizeSram += FRQSLWSN->getIndices().getType()->getSizeInBytes();
+    sizeSram += FRQSLWSN->getWeights().getType()->getSizeInBytes();
+    sizeSram += FRQSLWSN->getLengths().getType()->getSizeInBytes();
+  } else if (node->getKind() ==
+             Kinded::Kind::FusedRowwiseQuantizedSparseLengthsSumNodeKind) {
+    auto *FRQSLSN =
+        llvm::dyn_cast<FusedRowwiseQuantizedSparseLengthsSumNode>(node);
+    // compute how many entries of the embedding table we look up
+    auto numLookups = FRQSLSN->getIndices().dims().front();
+    // compute how many bytes we read per lookup
+    auto tableSize = FRQSLSN->getData().getType()->getSizeInBytes();
+    auto numRows = FRQSLSN->getData().dims().front();
+    auto sizePerLookup = tableSize / numRows;
+    // compute total bytes read
+    uint64_t sizeInput = numLookups * sizePerLookup;
+
+    // tables are usually large and fit in DRAM
+    sizeDram += sizeInput;
+
+    // we also read the indices and lengths arrays
+    sizeSram += FRQSLSN->getIndices().getType()->getSizeInBytes();
+    sizeSram += FRQSLSN->getLengths().getType()->getSizeInBytes();
+  } else {
+    // for all other ops, iterate through all inputs and get size in bytes
+    for (int i = 0; i < n; i++) {
+      auto ty = node->getNthInput(i).getType();
+      uint64_t sizeInput = ty->getSizeInBytes();
+      if (sizeInput > sramCapacity) {
+        sizeDram += sizeInput;
+      } else {
+        sizeSram += sizeInput;
+      }
+    }
+  }
+
+  // Repeat for outputs
+  for (size_t i = 0, e = node->getNumResults(); i < e; i++) {
+    auto myty = node->getType(i);
+    uint64_t sizeOutput = myty->getSizeInBytes();
+    if (sizeOutput > sramCapacity) {
+      sizeDram += sizeOutput;
+    } else {
+      sizeSram += sizeOutput;
+    }
+  }
+
+  // Calculate compute ops. Currently only computed for Matmul, Conv, FC
+  // TODO: think about whether this is better off computed inside a Node.
+  uint64_t totalOps = 0;
+  switch (node->getKind()) {
+  case Kinded::Kind::MatMulNodeKind: {
+    auto *MMN = llvm::dyn_cast<MatMulNode>(node);
+    auto lhsDims = MMN->getLHS().dims();
+    auto rhsDims = MMN->getRHS().dims();
+    totalOps = 2 * lhsDims[0] * lhsDims[1] * rhsDims[1];
+    break;
+  }
+  case Kinded::Kind::FullyConnectedNodeKind: {
+    auto *FCN = llvm::dyn_cast<FullyConnectedNode>(node);
+    auto inputDims = FCN->getInput().dims();
+    auto wtDims = FCN->getWeights().dims();
+    totalOps = 2 * inputDims[0] * inputDims[1] * wtDims[0];
+    break;
+  }
+#ifdef GLOW_WITH_HABANA
+  case Kinded::Kind::HabanaFullyConnectedNodeKind: {
+    auto *FCN = llvm::dyn_cast<HabanaFullyConnectedNode>(node);
+    auto inputDims = FCN->getInput().dims();
+    auto wtDims = FCN->getWeights().dims();
+    totalOps = 2 * inputDims[0] * inputDims[1] * wtDims[0];
+    break;
+  }
+#endif
+  case Kinded::Kind::ConvolutionNodeKind: {
+    auto *CN = llvm::dyn_cast<ConvolutionNode>(node);
+    auto resultDims = CN->getResult().dims();
+    // Get the product of batch, output height, output width, output channels
+    totalOps = resultDims[0];
+    for (size_t i = 1, e = resultDims.size(); i < e; i++) {
+      totalOps *= resultDims[i];
+    }
+    // Multiply in kernel height, kernel width
+    auto kernelDims = CN->getKernels();
+    totalOps *= kernelDims[0] * kernelDims[1];
+    // Multiply in input channels/groups
+    auto inputChannels = CN->getInput().dims()[1];
+    auto nGroups = CN->getGroup();
+    totalOps *= (inputChannels * 1.0 / nGroups);
+    break;
+  }
+#ifdef GLOW_WITH_HABANA
+  case Kinded::Kind::HabanaConvolutionNodeKind: {
+    auto *CN = llvm::dyn_cast<HabanaConvolutionNode>(node);
+    auto resultDims = CN->getResult().dims();
+    // Get the product of batch, output height, output width, output channels
+    totalOps = resultDims[0];
+    for (size_t i = 1, e = resultDims.size(); i < e; i++) {
+      totalOps *= resultDims[i];
+    }
+    // Multiply in kernel height, kernel width
+    auto kernelDims = CN->getKernels();
+    totalOps *= kernelDims[0] * kernelDims[1];
+    // Multiply in input channels/groups
+    auto inputChannels = CN->getInput().dims()[1];
+    auto nGroups = CN->getGroup();
+    totalOps *= (inputChannels * 1.0 / nGroups);
+    break;
+  }
+#endif
+  default:
+    break;
+  }
+
+  // Compute compute roofline as max of flops, DRAM, SRAM BW
+  // See https://bit.ly/2UdJ3mz
+  // Add epsilons to prevent seg faults on uninitialized peak values.
+  return std::max(totalOps * 1.0f / std::max(peakCompute, 1e-6f),
+                  std::max(sizeDram * 1.0f / std::max(peakDramBw, 1e-6f),
+                           sizeSram * 1.0f / std::max(peakSramBw, 1e-6f)));
+}
+
 /// Given nodes set \p currNodes and its memory usage info \p info, \returns the
 /// new memory usage if \p newNode is added into \p currNodes.
 GraphMemInfo updateGraphMemInfoByAddingNode(const NodesSet &currNodes,
diff --git a/tests/unittests/PartitionerTest.cpp b/tests/unittests/PartitionerTest.cpp
index eacd0de727..9430f12d85 100644
--- a/tests/unittests/PartitionerTest.cpp
+++ b/tests/unittests/PartitionerTest.cpp
@@ -17,6 +17,7 @@
 #include "glow/ExecutionEngine/ExecutionEngine2.h"
 #include "glow/Graph/Graph.h"
 #include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"
+#include "glow/Partitioner/PartitionerUtils.h"
 
 #include "gtest/gtest.h"
 
@@ -428,17 +429,6 @@ TEST_F(PartitionerTest, Basic1Roofline) {
     nodeNamesMap[&node] = node.getName();
   }
 
-  std::vector<DeviceInfo> devices = {
-      {3072, "Interpreter", "", "", 100, 10, 0.1, 1, 0.05},
-      {3072, "Interpreter", "", "", 100, 10, 0.1, 1, 0.05},
-      {3072, "Interpreter", "", "", 100, 10, 0.1, 1, 0.05}};
-  Partitioner myPartitioner(&EEP.getModule(), devices);
-
-  auto err = myPartitioner.Partition(cctx);
-  EXPECT_FALSE(errToBool(std::move(err)));
-
-  DAGListTy dagList = std::move(myPartitioner.getPartitionResult());
-
   // check compute costs
   std::unordered_map<std::string, float> expectedComputeTime{
       {"initial_sigmoid", 128},
@@ -460,38 +450,17 @@ TEST_F(PartitionerTest, Basic1Roofline) {
       {"fc_add_bias4", 96},
   };
 
+  BackendInfo backendInfo;
+  backendInfo.sramCapacity = 100;
+  backendInfo.peakCompute = 10;
+  backendInfo.peakDramBw = 0.1;
+  backendInfo.peakSramBw = 1;
+  backendInfo.peakPCIeBw = 0.05;
   for (auto const &p : nodeNamesMap) {
     auto *N = p.first;
-    EXPECT_EQ(myPartitioner.getComputeTime(N), expectedComputeTime[p.second]);
+    EXPECT_EQ(getNodeComputeTime(N, backendInfo),
+              expectedComputeTime[p.second]);
   }
-
-  // check memUsage
-  std::unordered_map<std::string, uint64_t> expectedMemUsage{
-      {"initial_sigmoid", 0},
-      {"left_sigmoid2", 0},
-      {"fc_add_bias3", 64},
-      {"right_sigmoid1", 0},
-      {"mul", 0},
-      {"fc_add_bias2", 32},
-      {"ret", 0},
-      {"fc_dot", 2176},
-      {"left_sigmoid1", 0},
-      {"fc_add_bias", 64},
-      {"fc_dot1", 1024},
-      {"right_sigmoid2", 0},
-      {"fc_add_bias1", 64},
-      {"fc_dot2", 512},
-      {"fc_dot3", 1024},
-      {"fc_dot4", 512},
-      {"fc_add_bias4", 32},
-  };
-  for (auto const &p : nodeNamesMap) {
-    auto *N = p.first;
-    EXPECT_EQ(myPartitioner.getMemUsage(N), expectedMemUsage[p.second]);
-  }
-
-  EXPECT_EQ(EEP.getModule().getFunctions().size(), 3);
-  EXPECT_EQ(dagList.size(), 1);
 }
 
 TEST_F(PartitionerTest, SelectRepFunc) {