Skip to content

Commit a7f06e1

Browse files
Edvard Ghazaryanfacebook-github-bot
Edvard Ghazaryan
authored and committed
Added statistics related to out-variant nodes
Summary: Added more statistics to the static runtime benchmark output.

Test Plan: caffe2/benchmarks/static_runtime:static_runtime_cpptest

Expected output example:

Static runtime ms per iter: 0.939483. Iters per second: 1064.41
Node #0: 0.195671 ms/iter, %wide_offset.1 : Tensor = aten::add(%wide.1, %self._mu, %4)
Node #1: 0.169457 ms/iter, %wide_normalized.1 : Tensor = aten::mul(%wide_offset.1, %self._sigma)
Node #2: 0.118218 ms/iter, %wide_preproc.1 : Tensor = aten::clamp(%wide_normalized.1, %5, %6)
Node #3: 0.038814 ms/iter, %user_emb_t.1 : Tensor = aten::transpose(%user_emb.1, %4, %7)
Node #4: 0.0860747 ms/iter, %dp_unflatten.1 : Tensor = aten::bmm(%ad_emb_packed.1, %user_emb_t.1)
Node #5: 0.0102666 ms/iter, %31 : Tensor = static_runtime::flatten_copy(%dp_unflatten.1, %4, %8)
Node #6: 0.000476333 ms/iter, %19 : Tensor[] = prim::ListConstruct(%31, %wide_preproc.1)
Node #7: 0.0707332 ms/iter, %input.1 : Tensor = aten::cat(%19, %4)
Node #8: 0.123695 ms/iter, %fc1.1 : Tensor = aten::addmm(%self._fc_b, %input.1, %29, %4, %4)
Node #9: 0.0309244 ms/iter, %23 : Tensor = aten::sigmoid(%fc1.1)
Node #10: 0.0046297 ms/iter, %24 : (Tensor) = prim::TupleConstruct(%23)
Time per node type:
0.195671 ms. 23.0483%. aten::add (1 nodes)
0.169457 ms. 19.9605%. aten::mul (1 nodes, out variant)
0.123695 ms. 14.5702%. aten::addmm (1 nodes, out variant)
0.118218 ms. 13.925%. aten::clamp (1 nodes, out variant)
0.0860747 ms. 10.1388%. aten::bmm (1 nodes, out variant)
0.0707332 ms. 8.33175%. aten::cat (1 nodes, out variant)
0.038814 ms. 4.57195%. aten::transpose (1 nodes)
0.0309244 ms. 3.64263%. aten::sigmoid (1 nodes, out variant)
0.0102666 ms. 1.20932%. static_runtime::flatten_copy (1 nodes, out variant)
0.0046297 ms. 0.545338%. prim::TupleConstruct (1 nodes, out variant)
0.000476333 ms. 0.0561079%. prim::ListConstruct (1 nodes, out variant)
0.848959 ms. in Total
StaticRuntime setup time: 0.018925 ms
Memory allocation time: 0.019808 ms
Memory deallocation time: 0.0120445 ms
Outputs deallocation time: 0.0864947 ms
Total memory managed: 19328 bytes
Total number of reused tensors: 3
Total number of 'out' variant nodes/total number of nodes: 9/11 (81.8182%)

Reviewed By: hlu1

Differential Revision: D28553029

fbshipit-source-id: 55e7eab50b4b475ae219896100bdf4f6678875a4
1 parent 056287a commit a7f06e1

File tree

3 files changed

+27
-3
lines changed

3 files changed

+27
-3
lines changed

torch/csrc/jit/runtime/static/impl.cpp

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -830,8 +830,12 @@ void StaticRuntime::benchmark(
830830
const double ms = p.second;
831831
std::cout << std::setw(15) << ms << " ms. " << std::setw(10)
832832
<< results.percent_per_node_type[kind] << "%. " << kind << " ("
833-
<< results.instances_per_node_type[kind] << " nodes)"
834-
<< std::endl;
833+
<< results.instances_per_node_type[kind] << " nodes";
834+
if (results.out_nodes.count(kind) == 0) {
835+
std::cout << ")" << std::endl;
836+
} else {
837+
std::cout << ", out variant)" << std::endl;
838+
}
835839
}
836840
std::cout << std::setw(15) << results.total_time << " ms. in Total"
837841
<< std::endl;
@@ -851,6 +855,12 @@ void StaticRuntime::benchmark(
851855
std::cout << "Total number of reused tensors: "
852856
<< planner_->total_reused_tensors() << std::endl;
853857
}
858+
std::cout << "Total number of 'out' variant nodes/total number of nodes: "
859+
<< results.out_nodes_count << "/" << results.total_nodes_count
860+
<< " ("
861+
<< 100.0 * (results.out_nodes_count) /
862+
static_cast<float>(results.total_nodes_count)
863+
<< "%)" << std::endl;
854864
}
855865
check_for_memory_leak();
856866
}
@@ -978,8 +988,13 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops(
978988
results.time_per_node[i] /= static_cast<float>(main_runs);
979989
results.time_per_node_type[kind] += results.time_per_node[i];
980990
results.instances_per_node_type[kind]++;
991+
if (nodes_[i].has_out_variant()) {
992+
results.out_nodes.insert(kind);
993+
results.out_nodes_count++;
994+
}
981995
results.total_time += results.time_per_node[i];
982996
}
997+
results.total_nodes_count = nodes_.size();
983998
results.memory_alloc_time /= static_cast<float>(main_runs);
984999
results.memory_dealloc_time /= static_cast<float>(main_runs);
9851000
results.output_dealloc_time /= static_cast<float>(main_runs);

torch/csrc/jit/runtime/static/impl.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,10 +205,13 @@ class TORCH_API StaticRuntime {
205205
float memory_dealloc_time{0.0};
206206
float output_dealloc_time{0.0};
207207
float total_time{0.0};
208+
size_t out_nodes_count{0};
209+
size_t total_nodes_count{0};
208210
std::vector<float> time_per_node;
209211
std::unordered_map<std::string, float> time_per_node_type;
210212
std::unordered_map<std::string, float> percent_per_node_type;
211213
std::unordered_map<std::string, int> instances_per_node_type;
214+
std::unordered_set<std::string> out_nodes;
212215
};
213216

214217
IndividualMetrics benchmark_individual_ops(

torch/csrc/jit/runtime/static/init.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@ void initStaticModuleBindings(PyObject* module) {
2626
"output_dealloc_time",
2727
&StaticRuntime::IndividualMetrics::output_dealloc_time)
2828
.def_readonly("total_time", &StaticRuntime::IndividualMetrics::total_time)
29+
.def_readonly(
30+
"out_nodes_count", &StaticRuntime::IndividualMetrics::out_nodes_count)
31+
.def_readonly(
32+
"total_nodes_count",
33+
&StaticRuntime::IndividualMetrics::total_nodes_count)
2934
.def_readonly(
3035
"time_per_node", &StaticRuntime::IndividualMetrics::time_per_node)
3136
.def_readonly(
@@ -36,7 +41,8 @@ void initStaticModuleBindings(PyObject* module) {
3641
&StaticRuntime::IndividualMetrics::percent_per_node_type)
3742
.def_readonly(
3843
"instances_per_node_type",
39-
&StaticRuntime::IndividualMetrics::instances_per_node_type);
44+
&StaticRuntime::IndividualMetrics::instances_per_node_type)
45+
.def_readonly("out_nodes", &StaticRuntime::IndividualMetrics::out_nodes);
4046
static_module
4147
.def(
4248
"__call__",

0 commit comments

Comments
 (0)