@@ -37,7 +37,7 @@ void print_avg_std_dev(std::string type, std::vector<float>& runtimes, uint64_t
37
37
std::transform (runtimes.begin (), runtimes.end (), rt_diff.begin (), [avg_runtime](float x) { return x - avg_runtime; });
38
38
float rt_sq_sum = std::inner_product (rt_diff.begin (), rt_diff.end (), rt_diff.begin (), 0.0 );
39
39
float rt_std_dev = std::sqrt (rt_sq_sum / runtimes.size ());
40
-
40
+
41
41
std::vector<float > fps_diff (runtimes.size ());
42
42
std::transform (runtimes.begin (), runtimes.end (), fps_diff.begin (), [fps, batch_size](float x) { return ((1000 .f / x) * batch_size) - fps; });
43
43
float fps_sq_sum = std::inner_product (fps_diff.begin (), fps_diff.end (), fps_diff.begin (), 0.0 );
@@ -62,7 +62,7 @@ std::vector<float> benchmark_module(torch::jit::script::Module& mod, std::vector
62
62
cudaDeviceSynchronize ();
63
63
64
64
}
65
-
65
+
66
66
for (uint64_t i = 0 ; i < NUM_RUNS; i++) {
67
67
std::vector<torch::jit::IValue> inputs_ivalues;
68
68
auto in = at::rand (shape, {at::kCUDA });
@@ -71,7 +71,7 @@ std::vector<float> benchmark_module(torch::jit::script::Module& mod, std::vector
71
71
#endif
72
72
inputs_ivalues.push_back (in.clone ());
73
73
cudaDeviceSynchronize ();
74
-
74
+
75
75
execution_timer.start ();
76
76
mod.forward (inputs_ivalues);
77
77
cudaDeviceSynchronize ();
@@ -80,7 +80,7 @@ std::vector<float> benchmark_module(torch::jit::script::Module& mod, std::vector
80
80
auto time = execution_timer.milliseconds ();
81
81
execution_timer.reset ();
82
82
execution_runtimes.push_back (time);
83
-
83
+
84
84
c10::cuda::CUDACachingAllocator::emptyCache ();
85
85
}
86
86
return execution_runtimes;
@@ -91,9 +91,9 @@ int main(int argc, const char* argv[]) {
91
91
std::cerr << " usage: benchmark <path-to-exported-script-module> <input-size>\n " << std::endl;
92
92
return -1 ;
93
93
}
94
-
95
-
96
- torch::jit::script:: Module mod;
94
+
95
+
96
+ torch::jit::Module mod;
97
97
try {
98
98
// Deserialize the ScriptModule from a file using torch::jit::load().
99
99
mod = torch::jit::load (argv[1 ]);
@@ -104,16 +104,16 @@ int main(int argc, const char* argv[]) {
104
104
}
105
105
106
106
mod.to (at::kCUDA );
107
-
107
+
108
108
#ifdef HALF
109
109
mod.to (torch::kHalf );
110
110
for (auto layer : mod.named_modules ()) {
111
111
if (layer.name .find (" .bn" ) != std::string::npos) {
112
112
layer.value .to (torch::kFloat );
113
113
}
114
114
}
115
- #endif
116
-
115
+ #endif
116
+
117
117
std::vector<std::vector<int64_t >> dims;
118
118
for (int i = 2 ; i < argc; i++) {
119
119
auto arg = std::string (argv[i]);
@@ -128,7 +128,7 @@ int main(int argc, const char* argv[]) {
128
128
}
129
129
130
130
at::globalContext ().setBenchmarkCuDNN (true );
131
-
131
+
132
132
#ifdef JIT
133
133
auto jit_runtimes = benchmark_module (mod, dims[0 ]);
134
134
print_avg_std_dev (" JIT" , jit_runtimes, dims[0 ][0 ]);
@@ -140,11 +140,11 @@ int main(int argc, const char* argv[]) {
140
140
#ifdef HALF
141
141
extra_info.op_precision = at::kHalf ;
142
142
#endif
143
-
143
+
144
144
auto trt_mod = trtorch::CompileGraph (mod, extra_info);
145
145
auto trt_runtimes = benchmark_module (trt_mod, dims[0 ]);
146
146
print_avg_std_dev (" JIT/TRT" , trt_runtimes, dims[0 ][0 ]);
147
147
#endif
148
-
148
+
149
149
std::cout << " ok\n " ;
150
150
}
0 commit comments