Added statistic related to out variant nodes

Edvard Ghazaryan · facebook-github-bot · commit a7f06e1e55b9 · 2021-05-20T13:57:07.000-07:00
Summary: added more statistic info for static runtime Test Plan: caffe2/benchmarks/static_runtime:static_runtime_cpptest Expected output example: Static runtime ms per iter: 0.939483. Iters per second: 1064.41 Node #0: 0.195671 ms/iter, %wide_offset.1 : Tensor = aten::add(%wide.1, %self._mu, %4) Node #1: 0.169457 ms/iter, %wide_normalized.1 : Tensor = aten::mul(%wide_offset.1, %self._sigma) Node #2: 0.118218 ms/iter, %wide_preproc.1 : Tensor = aten::clamp(%wide_normalized.1, %5, %6) Node #3: 0.038814 ms/iter, %user_emb_t.1 : Tensor = aten::transpose(%user_emb.1, %4, %7) Node #4: 0.0860747 ms/iter, %dp_unflatten.1 : Tensor = aten::bmm(%ad_emb_packed.1, %user_emb_t.1) Node #5: 0.0102666 ms/iter, %31 : Tensor = static_runtime::flatten_copy(%dp_unflatten.1, %4, %8) Node #6: 0.000476333 ms/iter, %19 : Tensor[] = prim::ListConstruct(%31, %wide_preproc.1) Node #7: 0.0707332 ms/iter, %input.1 : Tensor = aten::cat(%19, %4) Node #8: 0.123695 ms/iter, %fc1.1 : Tensor = aten::addmm(%self._fc_b, %input.1, %29, %4, %4) Node #9: 0.0309244 ms/iter, %23 : Tensor = aten::sigmoid(%fc1.1) Node #10: 0.0046297 ms/iter, %24 : (Tensor) = prim::TupleConstruct(%23) Time per node type: 0.195671 ms. 23.0483%. aten::add (1 nodes) 0.169457 ms. 19.9605%. aten::mul (1 nodes, out variant) 0.123695 ms. 14.5702%. aten::addmm (1 nodes, out variant) 0.118218 ms. 13.925%. aten::clamp (1 nodes, out variant) 0.0860747 ms. 10.1388%. aten::bmm (1 nodes, out variant) 0.0707332 ms. 8.33175%. aten::cat (1 nodes, out variant) 0.038814 ms. 4.57195%. aten::transpose (1 nodes) 0.0309244 ms. 3.64263%. aten::sigmoid (1 nodes, out variant) 0.0102666 ms. 1.20932%. static_runtime::flatten_copy (1 nodes, out variant) 0.0046297 ms. 0.545338%. prim::TupleConstruct (1 nodes, out variant) 0.000476333 ms. 0.0561079%. prim::ListConstruct (1 nodes, out variant) 0.848959 ms. in Total StaticRuntime setup time: 0.018925 ms Memory allocation time: 0.019808 ms Memory deallocation time: 0.0120445 ms Outputs deallocation time: 0.0864947 ms Total memory managed: 19328 bytes Total number of reused tensors: 3 Total number of 'out' variant nodes/total number of nodes: 9/11 (81.8182%) Reviewed By: hlu1 Differential Revision: D28553029 fbshipit-source-id: 55e7eab50b4b475ae219896100bdf4f6678875a4
diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
@@ -830,8 +830,12 @@ void StaticRuntime::benchmark(
     const double ms = p.second;
     std::cout << std::setw(15) << ms << " ms. " << std::setw(10)
               << results.percent_per_node_type[kind] << "%. " << kind << " ("
-              << results.instances_per_node_type[kind] << " nodes)"
-              << std::endl;
+              << results.instances_per_node_type[kind] << " nodes";
+    if (results.out_nodes.count(kind) == 0) {
+      std::cout << ")" << std::endl;
+    } else {
+      std::cout << ", out variant)" << std::endl;
+    }
   }
   std::cout << std::setw(15) << results.total_time << " ms. in Total"
             << std::endl;
@@ -851,6 +855,12 @@ void StaticRuntime::benchmark(
       std::cout << "Total number of reused tensors: "
                 << planner_->total_reused_tensors() << std::endl;
     }
+    std::cout << "Total number of 'out' variant nodes/total number of nodes: "
+              << results.out_nodes_count << "/" << results.total_nodes_count
+              << " ("
+              << 100.0 * (results.out_nodes_count) /
+            static_cast<float>(results.total_nodes_count)
+              << "%)" << std::endl;
   }
   check_for_memory_leak();
 }
@@ -978,8 +988,13 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops(
     results.time_per_node[i] /= static_cast<float>(main_runs);
     results.time_per_node_type[kind] += results.time_per_node[i];
     results.instances_per_node_type[kind]++;
+    if (nodes_[i].has_out_variant()) {
+      results.out_nodes.insert(kind);
+      results.out_nodes_count++;
+    }
     results.total_time += results.time_per_node[i];
   }
+  results.total_nodes_count = nodes_.size();
   results.memory_alloc_time /= static_cast<float>(main_runs);
   results.memory_dealloc_time /= static_cast<float>(main_runs);
   results.output_dealloc_time /= static_cast<float>(main_runs);
diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h
@@ -205,10 +205,13 @@ class TORCH_API StaticRuntime {
     float memory_dealloc_time{0.0};
     float output_dealloc_time{0.0};
     float total_time{0.0};
+    size_t out_nodes_count{0};
+    size_t total_nodes_count{0};
     std::vector<float> time_per_node;
     std::unordered_map<std::string, float> time_per_node_type;
     std::unordered_map<std::string, float> percent_per_node_type;
     std::unordered_map<std::string, int> instances_per_node_type;
+    std::unordered_set<std::string> out_nodes;
   };
 
   IndividualMetrics benchmark_individual_ops(
diff --git a/torch/csrc/jit/runtime/static/init.cpp b/torch/csrc/jit/runtime/static/init.cpp
@@ -26,6 +26,11 @@ void initStaticModuleBindings(PyObject* module) {
           "output_dealloc_time",
           &StaticRuntime::IndividualMetrics::output_dealloc_time)
       .def_readonly("total_time", &StaticRuntime::IndividualMetrics::total_time)
+      .def_readonly(
+          "out_nodes_count", &StaticRuntime::IndividualMetrics::out_nodes_count)
+      .def_readonly(
+          "total_nodes_count",
+          &StaticRuntime::IndividualMetrics::total_nodes_count)
       .def_readonly(
           "time_per_node", &StaticRuntime::IndividualMetrics::time_per_node)
       .def_readonly(
@@ -36,7 +41,8 @@ void initStaticModuleBindings(PyObject* module) {
           &StaticRuntime::IndividualMetrics::percent_per_node_type)
       .def_readonly(
           "instances_per_node_type",
-          &StaticRuntime::IndividualMetrics::instances_per_node_type);
+          &StaticRuntime::IndividualMetrics::instances_per_node_type)
+      .def_readonly("out_nodes", &StaticRuntime::IndividualMetrics::out_nodes);
   static_module
       .def(
           "__call__",