[Partitioner] Add cost functions to partitioner #2441

Merged
merged 1 commit on Mar 6, 2019
10 changes: 10 additions & 0 deletions include/glow/Partitioner/Partitioner.h
@@ -25,6 +25,7 @@ namespace glow {
using namespace runtime;

using MemUsageMapTy = std::unordered_map<Node *, size_t>;
using ComputeTimeMapTy = std::unordered_map<Node *, float>;
using NodesSetTy = std::set<Node *>;
using PartitionCostMapTy = llvm::DenseMap<Function *, GraphMemInfo>;

@@ -97,6 +98,9 @@ class Partitioner {
/// The map of each operator and the corresponding memory size.
MemUsageMapTy memUsage_;

/// The map of each operator and the corresponding compute time.
ComputeTimeMapTy computeTime_;

/// Get the representative function (the one with the largest input) and
/// update the memSize.
static Function *selectRepFunc(Module *parent, size_t &memSize);
@@ -105,6 +109,9 @@
/// function.
void initOpMemUsage();

/// Initialize the minimal compute time for each op in the function.
void initOpComputeTime();

/// Combine the partitions if necessary: if all outside uses of the nodes in
/// partition1 are in partition2, and the sum of memory consumption of
/// partition1 and partition2 is less than availableMemory, combine partition1
@@ -140,6 +147,9 @@

/// Decompose each function in a module and return a list of DAGNodes.
DAGNodeList &Partition();

/// Getter for computeTime_.
ComputeTimeMapTy getComputeTime() const { return computeTime_; }
};
} // namespace glow
#endif // GLOW_PARTITIONER_PARTITIONER_H
11 changes: 11 additions & 0 deletions include/glow/Runtime/RuntimeTypes.h
@@ -48,6 +48,17 @@ using ResultCBTy = std::function<void(
struct DeviceInfo {
/// Available memory on device in bytes.
size_t availableMemory;
/// Available SRAM capacity in bytes.
size_t sramCapacity;
/// Peak compute on device in ops/second. Assumes all ops are in int8.
/// TODO: distinguish between data types with different peak flops.
float peakCompute;
Contributor:
Out of curiosity: we assume here that DRAM and SRAM are the only two layers of the memory hierarchy, and that is fine for now. But do we need to support more levels/kinds of the memory hierarchy in the general case (e.g., different caches)?

@nrsatish (Contributor, Author), Feb 28, 2019:
Ideally yes. In general, it would be good to get the characteristics of the architecture by querying some API. The current design is a first step toward that.
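A minimal sketch of what such a multi-level description could look like (the types below are hypothetical and not part of this PR):

#include <cstddef>
#include <vector>

/// Hypothetical: one entry per level of the memory hierarchy, ordered from
/// fastest to slowest (e.g. L2 cache, SRAM, DRAM).
struct MemoryLevelInfo {
  size_t capacity; // bytes
  float peakBw;    // bytes/second
};

/// Hypothetical generalization of DeviceInfo that replaces the hard-coded
/// DRAM/SRAM pair with an arbitrary list of memory levels.
struct GeneralDeviceInfo {
  std::vector<MemoryLevelInfo> memoryLevels;
  float peakCompute; // ops/second
  float peakPCIeBw;  // bytes/second
};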

/// Peak memory bandwidth from DRAM on device in bytes/second.
float peakDramBw;
/// Peak memory bandwidth from SRAM on device in bytes/second.
float peakSramBw;
/// Peak ingress/egress PCI-E bandwidth from device in bytes/second.
float peakPCIeBw;
};

/// Individual Node in the DAG for a given network. This contains all the
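For illustration, a DeviceInfo for a hypothetical accelerator might be populated as follows (all numbers are invented for the example):

#include "glow/Runtime/RuntimeTypes.h"

glow::runtime::DeviceInfo makeExampleDeviceInfo() {
  glow::runtime::DeviceInfo info;
  info.availableMemory = 16ULL * 1024 * 1024 * 1024; // 16 GB of DRAM
  info.sramCapacity = 32 * 1024 * 1024;              // 32 MB of on-chip SRAM
  info.peakCompute = 100e12f;                        // 100 int8 TOPS
  info.peakDramBw = 100e9f;                          // 100 GB/s from DRAM
  info.peakSramBw = 1e12f;                           // 1 TB/s from SRAM
  info.peakPCIeBw = 16e9f;                           // 16 GB/s over PCI-E
  return info;
}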
126 changes: 122 additions & 4 deletions lib/Partitioner/Partitioner.cpp
@@ -81,6 +81,126 @@ void Partitioner::initOpMemUsage() {
}
}

/// Get the minimal compute time for each op in the function.
void Partitioner::initOpComputeTime() {
computeTime_.clear();

// This code assumes all ops are BW limited from SRAM; except
// if the input does not fit in SRAM -- then it is DRAM BW limited
float peakDramBw = deviceInfo_[0].peakDramBw;
float peakSramBw = deviceInfo_[0].peakSramBw;
size_t sramCapacity = deviceInfo_[0].sramCapacity;
float peakCompute = deviceInfo_[0].peakCompute;

for (auto &node : F_->getNodes()) {
/// compute memory side bytes for inputs from DRAM, SRAM.
/// TODO: think about whether this is better off computed inside a Node.

int n = node.getNumInputs();
uint64_t sizeDram = 0;
uint64_t sizeSram = 0;
if (node.getKind() == Kinded::Kind::SaveNodeKind) {
computeTime_[&node] = 0.0f;
continue;
}

/// The memory bytes for embedding table lookups are data dependent,
/// so they need to be calculated based on the number of indices accessed.
if (node.getKind() == Kinded::Kind::SparseLengthsWeightedSumNodeKind) {
auto *SLWSN = llvm::dyn_cast<SparseLengthsWeightedSumNode>(&node);
/// compute how many entries of the embedding table we look up
auto numLookups = SLWSN->getIndices().getNode()->dims(0).front();
/// compute how many bytes we read per lookup
auto tableSize = SLWSN->getData().getNode()->getType(0)->getSizeInBytes();
auto numRows = SLWSN->getData().getNode()->dims(0).front();
auto sizePerLookup = tableSize / numRows;
/// compute total bytes read
uint64_t sizeInput = numLookups * sizePerLookup;
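/// For example (hypothetical numbers): a {1000, 64} float32 table has
/// tableSize = 256000 B and sizePerLookup = 256 B, so numLookups = 50
/// reads sizeInput = 12800 B of table data.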

/// does the table fit in SRAM or DRAM
if (tableSize > sramCapacity) {
sizeDram += sizeInput;
} else {
sizeSram += sizeInput;
}

/// we also read the indices, weights and lengths arrays
sizeSram += SLWSN->getIndices().getNode()->getType(0)->getSizeInBytes();
sizeSram += SLWSN->getWeights().getNode()->getType(0)->getSizeInBytes();
sizeSram += SLWSN->getLengths().getNode()->getType(0)->getSizeInBytes();
} else {
/// for all other ops, iterate through all inputs and get size in bytes
for (int i = 0; i < n; i++) {
auto ty = node.getNthInput(i).getNode()->getType(0);
uint64_t sizeInput = ty->getSizeInBytes();
if (sizeInput > sramCapacity) {
sizeDram += sizeInput;
} else {
sizeSram += sizeInput;
}
}
}

// Repeat for outputs
if (node.getNumResults() > 0) {
auto myty = node.getType(0);
uint64_t sizeOutput = myty->getSizeInBytes();
if (sizeOutput > sramCapacity) {
sizeDram += sizeOutput;
} else {
sizeSram += sizeOutput;
}
}

/// Calculate compute ops. Currently only computed for Matmul, Conv, FC
/// TODO: think about whether this is better off computed inside a Node.
uint64_t totalOps = 0;
Contributor:
Just want to double-check again here: in the future, do we need to add the computation for each node?

@nrsatish (Contributor, Author):
Yes, we do. At least for memory bytes, if not flops.

@nrsatish (Contributor, Author):
But for most ops, flops are less important. There are only a handful of ops here that will be at all compute bound.

switch (node.getKind()) {
case Kinded::Kind::MatMulNodeKind: {
auto *MMN = llvm::dyn_cast<MatMulNode>(&node);
Contributor:
I prefer using "switch". If we need to add more node types here, "switch" looks better :)

@nrsatish (Contributor, Author):
Makes sense.

auto lhsDims = MMN->getLHS().dims();
auto rhsDims = MMN->getRHS().dims();
totalOps = 2 * lhsDims[0] * lhsDims[1] * rhsDims[1];
break;
}
case Kinded::Kind::FullyConnectedNodeKind: {
auto *FCN = llvm::dyn_cast<FullyConnectedNode>(&node);
auto inputDims = FCN->getInput().dims();
auto wtDims = FCN->getWeights().dims();
totalOps = 2 * inputDims[0] * inputDims[1] * wtDims[1];
break;
}
case Kinded::Kind::ConvolutionNodeKind: {
auto *CN = llvm::dyn_cast<ConvolutionNode>(&node);
auto resultDims = CN->getResult().dims();
// Get the product of batch, output height, output width, output channels
totalOps = resultDims[0];
for (size_t i = 1, e = resultDims.size(); i < e; i++) {
totalOps *= resultDims[i];
Contributor:
Here "i" should be size_t, otherwise the type check will fail. Usually it can be written as "for (size_t i = 1, e = resultDims.size(); i < e; i++)".
}
// Multiply in kernel height, kernel width
auto kernelDims = CN->getKernels();
totalOps *= kernelDims[0] * kernelDims[1];
// Multiply in input channels/groups
auto inputChannels = CN->getInput().dims()[1];
auto nGroups = CN->getGroup();
totalOps *= (inputChannels * 1.0 / nGroups);
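// e.g. (hypothetical): result dims {1, 28, 28, 64}, 3x3 kernels, 16 input
// channels, group = 1 gives totalOps = 1*28*28*64 * 9 * 16 = 7,225,344.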
break;
}
default:
break;
}

/// Compute the roofline time estimate as the max of the compute (flops),
/// DRAM BW, and SRAM BW terms. See https://bit.ly/2UdJ3mz
/// Add epsilons to guard against division by zero on uninitialized peak values.
computeTime_[&node] =
std::max(totalOps * 1.0f / std::max(peakCompute, 1e-6f),
std::max(sizeDram * 1.0f / std::max(peakDramBw, 1e-6f),
sizeSram * 1.0f / std::max(peakSramBw, 1e-6f)));
}
}
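As a worked check of this formula, take the fc_dot MatMul ({1, 32} input times {32, 16} weights) under the hypothetical device parameters used in PartitionerTest below (sramCapacity = 100, peakCompute = 10, peakDramBw = 0.1, peakSramBw = 1):

// totalOps = 2 * 1 * 32 * 16 = 1024 ops
// sizeDram = 128 + 2048 = 2176 B (both fp32 inputs exceed the 100 B SRAM)
// sizeSram = 64 B (the {1, 16} fp32 output fits in SRAM)
// computeTime = max(1024 / 10, 2176 / 0.1, 64 / 1) = 21760 (DRAM-bound)

This matches the expected value for fc_dot in the unit test below.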

// Combine the partitions if necessary: if all outside uses of the nodes in
// partition1 are in partition2, and the sum of memory consumption of partition1
// and partition2 is less than availableMemory, combine partition1 and
@@ -403,7 +523,6 @@ DAGNodeList &Partitioner::Partition() {

// Find the representative function for running the partitioning algorithm.
F_ = selectRepFunc(module_, memSize_);

size_t availMem = deviceInfo_[0].availableMemory;

if (memSize_ < availMem) {
@@ -427,9 +546,8 @@
// Prepare 1: Get the min memory usage for each op.
initOpMemUsage();

// Prepare 2: TODO: get the minimal communication cost for any 2 ops (i.e. the
// output data size). Will calculate it on the fly. -- Will double check which
// way is better.
// Prepare 2: Get the roofline memory bandwidth estimate for each op.
initOpComputeTime();

// Partition
// Use BFS to do the initial partitioning. Starting from the final node, BFS
101 changes: 101 additions & 0 deletions tests/unittests/PartitionerTest.cpp
@@ -208,3 +208,104 @@ TEST_F(PartitionerTest, Basic2) {
EXPECT_TRUE(ref.isEqual(test));
}
}

/// This test checks the roofline cost computed from compute, memory, and
/// communication costs.
TEST_F(PartitionerTest, Basic1Roofline) {
auto *input =
mod_.createPlaceholder(ElemKind::FloatTy, {1, 32}, "input", false);
auto *w1 = mod_.createConstant(ElemKind::FloatTy, {32, 16}, "w1");
auto *b1 = mod_.createConstant(ElemKind::FloatTy, {16}, "b1");
ctx_.allocate(input);
w1->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
b1->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());

// Initial FC.
Node *I = F_->createFullyConnected("initial_fc", input, w1, b1);
I = F_->createSigmoid("initial_sigmoid", I);

// Left branch.
auto *w2 = mod_.createConstant(ElemKind::FloatTy, {16, 16}, "w2");
auto *b2 = mod_.createConstant(ElemKind::FloatTy, {16}, "b2");
w2->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
b2->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
Node *L = F_->createFullyConnected("left_fc1", I, w2, b2);
L = F_->createSigmoid("left_sigmoid1", L);
auto *w3 = mod_.createConstant(ElemKind::FloatTy, {16, 8}, "w3");
auto *b3 = mod_.createConstant(ElemKind::FloatTy, {8}, "b3");
w3->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
b3->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
L = F_->createFullyConnected("left_fc2", L, w3, b3);
L = F_->createSigmoid("left_sigmoid2", L);

// Right branch.
auto *w4 = mod_.createConstant(ElemKind::FloatTy, {16, 16}, "w4");
auto *b4 = mod_.createConstant(ElemKind::FloatTy, {16}, "b4");
w4->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
b4->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
Node *R = F_->createFullyConnected("right_fc1", I, w4, b4);
R = F_->createSigmoid("right_sigmoid1", R);
auto *w5 = mod_.createConstant(ElemKind::FloatTy, {16, 8}, "w5");
auto *b5 = mod_.createConstant(ElemKind::FloatTy, {8}, "b5");
w5->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
b5->getHandle<>().randomize(-2.0, 2.0, mod_.getPRNG());
R = F_->createFullyConnected("right_fc2", R, w5, b5);
R = F_->createSigmoid("right_sigmoid2", R);

// Join branches.
auto *mul = F_->createMul("mul", L, R);
auto *save = F_->createSave("ret", mul);
auto &res = *ctx_.allocate(save->getPlaceholder());

// Infer using the un-partitioned graph.
Tensor in(ElemKind::FloatTy, {1, 32});
ExecutionEngine EE;

EE.compile(CompilationMode::Infer, F_);
updateInputPlaceholders(ctx_, {input}, {&in});
EE.run(ctx_);
Tensor ref = res.clone();

std::unordered_map<Node *, std::string> nodeNamesMap;
for (auto &node : F_->getNodes()) {
nodeNamesMap[&node] = node.getName();
}

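// Each DeviceInfo entry is {availableMemory, sramCapacity, peakCompute,
// peakDramBw, peakSramBw, peakPCIeBw}.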
std::vector<DeviceInfo> devices = {{3072, 100, 10, 0.1, 1, 0.05},
{3072, 100, 10, 0.1, 1, 0.05},
{3072, 100, 10, 0.1, 1, 0.05}};
Partitioner myPartitioner(&mod_, devices);

DAGNodeList myList = std::move(myPartitioner.Partition());

// check compute costs
std::unordered_map<std::string, float> expectedComputeTime{
{"initial_sigmoid", 128},
{"left_sigmoid2", 64},
{"fc_add_bias3", 192},
{"right_sigmoid1", 128},
{"mul", 96},
{"fc_add_bias2", 96},
{"ret", 0},
{"fc_dot", 21760},
{"left_sigmoid1", 128},
{"fc_add_bias", 192},
{"fc_dot1", 10240},
{"right_sigmoid2", 64},
{"fc_add_bias1", 192},
{"fc_dot2", 5120},
{"fc_dot3", 10240},
{"fc_dot4", 5120},
{"fc_add_bias4", 96},
};
ASSERT_EQ(myPartitioner.getComputeTime().size(), expectedComputeTime.size());
for (auto &el : myPartitioner.getComputeTime()) {
Node *n = el.first;
float expected = expectedComputeTime[nodeNamesMap[n]];
float res = el.second;
ASSERT_EQ(expected, res);
}

ASSERT_EQ(mod_.getFunctions().size(), 3);
ASSERT_EQ(myList.roots.size(), 1);
}