Skip to content

Commit aa5b494

Browse files
GregoryComerfacebook-github-bot
authored andcommitted
Add EXECUTORCH_THREADPOOL_SIZE options, default to using only performance cores (#14090)
Summary: Allow build-time configuration of the thread pool size and default to a performance heuristic. There are 2 modes that we want to support: * Heuristic-based. Choose the number of threads according to a performance heuristic. Use threads equal to the number of detected performance cores, but we can continue to iterate on this by adding fine-grained heuristics for specific chipsets in the future. * All cores (threads=cores). This is the current behavior. We need to maintain this as an option for some use cases. With this PR, the default (for OSS) is to use performance cores. From testing with CV models on ~10 representative devices across the performance spectrum, this gives anywhere from parity with the existing perf to up to a 13x speedup (measured on Pixel 6). Many common devices (S20, S22, iPhone 15 Pro) show a 2-4x speedup. #### Specifying Threadpool Size To specify the threadpool size, I've added two preprocessor options (and corresponding CMake options): * `EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES` - Use threads = detected perf cores. * `EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES` - Use threads = logical cores. Test Plan: I've verified that the logic functions correctly in OSS by building the executor_runner on M1 Mac and observing the existing logging in cpuinfo_utils. Measuring MobileNet V3 (exported from examples) on XNNPACK, time to run 100 iterations drops from ~450ms to ~230ms on M1 Pro with this change. Rollback Plan: Reviewed By: kimishpatel Differential Revision: D81965471 Pulled By: GregoryComer
1 parent f2eb38e commit aa5b494

File tree

6 files changed

+101
-2
lines changed

6 files changed

+101
-2
lines changed

extension/threadpool/CMakeLists.txt

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,16 @@ if(NOT CMAKE_CXX_STANDARD)
2020
set(CMAKE_CXX_STANDARD 17)
2121
endif()
2222

23+
# Threadpool size specifiers. Mutual exclusion is checked in default.cmake.
24+
# Default to using performance cores if
25+
# EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES isn't set.
26+
set(_threadpool_size_flag)
27+
if(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES)
28+
set(_threadpool_size_flag "EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES")
29+
else()
30+
set(_threadpool_size_flag "EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES")
31+
endif()
32+
2333
add_library(
2434
extension_threadpool threadpool.cpp threadpool_guard.cpp thread_parallel.cpp
2535
cpuinfo_utils.cpp
@@ -36,7 +46,9 @@ target_include_directories(
3646
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include>
3747
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include>
3848
)
39-
target_compile_definitions(extension_threadpool PUBLIC ET_USE_THREADPOOL)
49+
target_compile_definitions(
50+
extension_threadpool PUBLIC ET_USE_THREADPOOL ${_threadpool_size_flag}
51+
)
4052
target_compile_options(extension_threadpool PUBLIC ${_common_compile_options})
4153

4254
# Install libraries

extension/threadpool/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ def define_common_targets():
2222
name = "threadpool_lib",
2323
srcs = _THREADPOOL_SRCS,
2424
deps = [
25+
":cpuinfo_utils",
2526
"//executorch/runtime/core:core",
2627
"//executorch/runtime/core/portable_type/c10/c10:c10",
2728
],

extension/threadpool/test/threadpool_test.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
*/
88

99
#include <executorch/extension/threadpool/threadpool.h>
10+
#include <executorch/runtime/platform/runtime.h>
1011

1112
#include <mutex>
1213
#include <numeric>
@@ -71,6 +72,8 @@ void run_lambda_with_size(
7172
} // namespace
7273

7374
TEST(ThreadPoolTest, ParallelAdd) {
75+
executorch::runtime::runtime_init();
76+
7477
std::vector<int32_t> a, b, c, c_ref;
7578
size_t vector_size = 100;
7679
size_t grain_size = 10;
@@ -111,6 +114,8 @@ TEST(ThreadPoolTest, ParallelAdd) {
111114

112115
// Test parallel reduction where we acquire lock within lambda
113116
TEST(ThreadPoolTest, ParallelReduce) {
117+
executorch::runtime::runtime_init();
118+
114119
std::vector<int32_t> a;
115120
int32_t c = 0, c_ref = 0;
116121
size_t vector_size = 100;
@@ -144,6 +149,8 @@ TEST(ThreadPoolTest, ParallelReduce) {
144149
// Copied from
145150
// caffe2/aten/src/ATen/test/test_thread_pool_guard.cpp
146151
TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) {
152+
executorch::runtime::runtime_init();
153+
147154
auto threadpool_ptr = ::executorch::extension::threadpool::get_pthreadpool();
148155

149156
ASSERT_NE(threadpool_ptr, nullptr);
@@ -173,6 +180,8 @@ TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) {
173180
}
174181

175182
TEST(TestNoThreadPoolGuard, TestRunWithGuard) {
183+
executorch::runtime::runtime_init();
184+
176185
const std::vector<int64_t> array = {1, 2, 3};
177186

178187
auto pool = ::executorch::extension::threadpool::get_threadpool();

extension/threadpool/threadpool.cpp

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#include <executorch/extension/threadpool/cpuinfo_utils.h>
910
#include <executorch/extension/threadpool/threadpool.h>
1011

1112
#include <algorithm>
@@ -14,9 +15,26 @@
1415

1516
#include <executorch/extension/threadpool/threadpool_guard.h>
1617
#include <executorch/runtime/platform/assert.h>
18+
#include <executorch/runtime/platform/runtime.h>
1719

1820
#include <cpuinfo.h>
1921

22+
// At most one mode should be set.
23+
#if ( \
24+
defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) && \
25+
defined(EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES))
26+
#error Multiple \
27+
threadpool size specifiers are set.At most one of \
28+
EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES, \
29+
and EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES may be defined.
30+
#endif
31+
32+
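// Default to EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES if no mode is set.
// Note: the CMake build always defines one of the two modes (performance
// cores unless overridden), so this fallback only applies to builds that
// pass neither macro.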
33+
#if !defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) && \
34+
!defined(EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES)
35+
#define EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES 1
36+
#endif
37+
2038
namespace executorch::extension::threadpool {
2139

2240
#if !(defined(WIN32))
@@ -96,12 +114,25 @@ void ThreadPool::run(
96114
// get_threadpool is not thread safe due to leak_corrupted_threadpool
97115
// Make this part threadsafe: TODO(kimishpatel)
98116
ThreadPool* get_threadpool() {
117+
executorch::runtime::runtime_init();
118+
99119
if (!cpuinfo_initialize()) {
100120
ET_LOG(Error, "cpuinfo initialization failed");
101121
return nullptr; // NOLINT(facebook-hte-NullableReturn)
102122
}
103123

104-
int num_threads = cpuinfo_get_processors_count();
124+
// Choose the number of threads according to the EXECUTORCH_THREADPOOL_
125+
// options. See the description in threadpool.h.
126+
127+
#if defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES)
128+
// Use threads=cores.
129+
static int num_threads = cpuinfo_get_processors_count();
130+
#else
131+
// Set threads equal to the number of performance cores.
132+
static int num_threads =
133+
::executorch::extension::cpuinfo::get_num_performant_cores();
134+
#endif
135+
105136
/*
106137
* For llvm-tsan, holding limit for the number of locks for a single thread
107138
* is 63 (because of comparison < 64 instead of <=). pthreadpool's worst

extension/threadpool/threadpool.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,22 @@
1414

1515
#include <pthreadpool.h>
1616

17+
/*
18+
* Threadpool Options:
19+
*
20+
* Threadpool size has a sizable effect on performance. By default, the
21+
* threadpool will be sized according to the number of performance cores. This
22+
* behavior can be overridden with the following build-time options. Note that
23+
* these options are mutually exclusive.
24+
*
25+
* - EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES (flag) - Sizes the threadpool
26+
* equal to the number of performance cores on the system. This is the default
27+
* behavior.
28+
* - EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES (flag) - Sizes the threadpool
29+
* equal to the number of logical cores on the system. This is the historical
30+
* behavior.
31+
*/
32+
1733
namespace executorch::extension::threadpool {
1834

1935
class ThreadPool final {

tools/cmake/preset/default.cmake

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,36 @@ define_overridable_option(
176176
${_default_executorch_build_cpuinfo}
177177
)
178178

179+
# Threadpool size options. At most one can be specified. Note that the default
180+
# is set in the threadpool CMakeLists.txt so users can pick an alternate mode
181+
# without needing to explicitly set the default to off.
182+
define_overridable_option(
183+
EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES
184+
"Set the number of threads used for CPU parallel computation equal to the number of performant CPU cores."
185+
BOOL
186+
OFF
187+
)
188+
define_overridable_option(
189+
EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES
190+
"Set the number of threads used for CPU parallel computation equal to the number of logical CPU cores."
191+
BOOL
192+
OFF
193+
)
194+
195+
check_required_options_on(
196+
IF_ON EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES REQUIRES
197+
EXECUTORCH_BUILD_PTHREADPOOL EXECUTORCH_BUILD_CPUINFO
198+
)
199+
check_required_options_on(
200+
IF_ON EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES REQUIRES
201+
EXECUTORCH_BUILD_PTHREADPOOL EXECUTORCH_BUILD_CPUINFO
202+
)
203+
204+
check_conflicting_options_on(
205+
IF_ON EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES CONFLICTS_WITH
206+
EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES
207+
)
208+
179209
# TODO(jathu): move this to platform specific presets when created
180210
set(_default_executorch_build_executor_runner ON)
181211
if(APPLE AND "${SDK_NAME}" STREQUAL "iphoneos")

0 commit comments

Comments
 (0)