Skip to content

Commit 67bee31

Browse files
htejun authored and Kernel Patches Daemon committed
sched_ext: Allow BPF schedulers to switch all eligible tasks into sched_ext
Currently, to use sched_ext, each task has to be put into sched_ext using sched_setscheduler(2). However, some BPF schedulers and use cases might prefer to service all eligible tasks.

This patch adds a new kfunc helper, scx_bpf_switch_all(), that BPF schedulers can call from ops.init() to switch all SCHED_NORMAL, SCHED_BATCH and SCHED_IDLE tasks into sched_ext. This has the benefit that the scheduler swaps are transparent to the users and applications. As we know that CFS is not being used when scx_bpf_switch_all() is used, we can also disable hot path entry points with static_branches.

Both the simple and qmap example schedulers are updated to switch all tasks by default to ease testing. '-p' option is added which enables the original behavior of switching only tasks which are explicitly on SCHED_EXT.

v2: In the example schedulers, switch all tasks by default.

Signed-off-by: Tejun Heo <[email protected]>
Suggested-by: Barret Rhoden <[email protected]>
Reviewed-by: David Vernet <[email protected]>
Acked-by: Josh Don <[email protected]>
Acked-by: Hao Luo <[email protected]>
Acked-by: Barret Rhoden <[email protected]>
1 parent 3df08c1 commit 67bee31

File tree

8 files changed

+85
-7
lines changed

8 files changed

+85
-7
lines changed

kernel/sched/core.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1236,7 +1236,7 @@ bool sched_can_stop_tick(struct rq *rq)
12361236
* if there's more than one we need the tick for involuntary
12371237
* preemption.
12381238
*/
1239-
if (rq->nr_running > 1)
1239+
if (!scx_switched_all() && rq->nr_running > 1)
12401240
return false;
12411241

12421242
/*
@@ -5723,8 +5723,10 @@ void scheduler_tick(void)
57235723
wq_worker_tick(curr);
57245724

57255725
#ifdef CONFIG_SMP
5726-
rq->idle_balance = idle_cpu(cpu);
5727-
trigger_load_balance(rq);
5726+
if (!scx_switched_all()) {
5727+
rq->idle_balance = idle_cpu(cpu);
5728+
trigger_load_balance(rq);
5729+
}
57285730
#endif
57295731
}
57305732

kernel/sched/ext.c

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,10 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
8080
DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
8181
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
8282
static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
83+
static bool scx_switch_all_req;
84+
static bool scx_switching_all;
85+
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
86+
8387
static struct sched_ext_ops scx_ops;
8488
static bool scx_warned_zero_slice;
8589

@@ -2068,6 +2072,8 @@ bool task_should_scx(struct task_struct *p)
20682072
{
20692073
if (!scx_enabled() || scx_ops_disabling())
20702074
return false;
2075+
if (READ_ONCE(scx_switching_all))
2076+
return true;
20712077
return p->policy == SCHED_EXT;
20722078
}
20732079

@@ -2195,6 +2201,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
21952201
*/
21962202
mutex_lock(&scx_ops_enable_mutex);
21972203

2204+
static_branch_disable(&__scx_switched_all);
2205+
WRITE_ONCE(scx_switching_all, false);
2206+
21982207
/* avoid racing against fork */
21992208
cpus_read_lock();
22002209
percpu_down_write(&scx_fork_rwsem);
@@ -2378,6 +2387,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
23782387
*/
23792388
cpus_read_lock();
23802389

2390+
scx_switch_all_req = false;
23812391
if (scx_ops.init) {
23822392
ret = SCX_CALL_OP_RET(SCX_KF_INIT, init);
23832393
if (ret) {
@@ -2493,6 +2503,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
24932503
* transitions here are synchronized against sched_ext_free() through
24942504
* scx_tasks_lock.
24952505
*/
2506+
WRITE_ONCE(scx_switching_all, scx_switch_all_req);
2507+
24962508
scx_task_iter_init(&sti);
24972509
while ((p = scx_task_iter_next_filtered_locked(&sti))) {
24982510
if (READ_ONCE(p->__state) != TASK_DEAD) {
@@ -2524,6 +2536,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
25242536
goto err_disable;
25252537
}
25262538

2539+
if (scx_switch_all_req)
2540+
static_branch_enable_cpuslocked(&__scx_switched_all);
2541+
25272542
cpus_read_unlock();
25282543
mutex_unlock(&scx_ops_enable_mutex);
25292544

@@ -2558,6 +2573,9 @@ static int scx_debug_show(struct seq_file *m, void *v)
25582573
mutex_lock(&scx_ops_enable_mutex);
25592574
seq_printf(m, "%-30s: %s\n", "ops", scx_ops.name);
25602575
seq_printf(m, "%-30s: %ld\n", "enabled", scx_enabled());
2576+
seq_printf(m, "%-30s: %d\n", "switching_all",
2577+
READ_ONCE(scx_switching_all));
2578+
seq_printf(m, "%-30s: %ld\n", "switched_all", scx_switched_all());
25612579
seq_printf(m, "%-30s: %s\n", "enable_state",
25622580
scx_ops_enable_state_str[scx_ops_enable_state()]);
25632581
seq_printf(m, "%-30s: %lu\n", "nr_rejected",
@@ -2809,6 +2827,29 @@ __diag_push();
28092827
__diag_ignore_all("-Wmissing-prototypes",
28102828
"Global functions as their definitions will be in vmlinux BTF");
28112829

2830+
/**
2831+
* scx_bpf_switch_all - Switch all tasks into SCX
2832+
*
2833+
* Switch all existing and future non-dl/rt tasks to SCX. This can only be
2834+
* called from ops.init(), and actual switching is performed asynchronously.
2835+
*/
2836+
void scx_bpf_switch_all(void)
2837+
{
2838+
if (!scx_kf_allowed(SCX_KF_INIT))
2839+
return;
2840+
2841+
scx_switch_all_req = true;
2842+
}
2843+
2844+
BTF_SET8_START(scx_kfunc_ids_init)
2845+
BTF_ID_FLAGS(func, scx_bpf_switch_all)
2846+
BTF_SET8_END(scx_kfunc_ids_init)
2847+
2848+
static const struct btf_kfunc_id_set scx_kfunc_set_init = {
2849+
.owner = THIS_MODULE,
2850+
.set = &scx_kfunc_ids_init,
2851+
};
2852+
28122853
/**
28132854
* scx_bpf_create_dsq - Create a custom DSQ
28142855
* @dsq_id: DSQ to create
@@ -3312,6 +3353,8 @@ static int __init register_ext_kfuncs(void)
33123353
* check using scx_kf_allowed().
33133354
*/
33143355
if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
3356+
&scx_kfunc_set_init)) ||
3357+
(ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
33153358
&scx_kfunc_set_sleepable)) ||
33163359
(ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
33173360
&scx_kfunc_set_enqueue_dispatch)) ||

kernel/sched/ext.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,9 @@ extern unsigned long scx_watchdog_timeout;
7575
extern unsigned long scx_watchdog_timestamp;
7676

7777
DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled);
78+
DECLARE_STATIC_KEY_FALSE(__scx_switched_all);
7879
#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled)
80+
#define scx_switched_all() static_branch_unlikely(&__scx_switched_all)
7981

8082
static inline bool task_on_scx(const struct task_struct *p)
8183
{
@@ -115,6 +117,8 @@ static inline void scx_notify_sched_tick(void)
115117
static inline const struct sched_class *next_active_class(const struct sched_class *class)
116118
{
117119
class++;
120+
if (scx_switched_all() && class == &fair_sched_class)
121+
class++;
118122
if (!scx_enabled() && class == &ext_sched_class)
119123
class++;
120124
return class;
@@ -137,6 +141,7 @@ static inline const struct sched_class *next_active_class(const struct sched_cla
137141
#else /* CONFIG_SCHED_CLASS_EXT */
138142

139143
#define scx_enabled() false
144+
#define scx_switched_all() false
140145

141146
static inline bool task_on_scx(const struct task_struct *p) { return false; }
142147
static inline void scx_pre_fork(struct task_struct *p) {}

tools/sched_ext/scx_common.bpf.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ void ___scx_bpf_error_format_checker(const char *fmt, ...) {}
5353
___scx_bpf_error_format_checker(fmt, ##args); \
5454
})
5555

56+
void scx_bpf_switch_all(void) __ksym;
5657
s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
5758
bool scx_bpf_consume(u64 dsq_id) __ksym;
5859
u32 scx_bpf_dispatch_nr_slots(void) __ksym;

tools/sched_ext/scx_qmap.bpf.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
char _license[] SEC("license") = "GPL";
2626

2727
const volatile u64 slice_ns = SCX_SLICE_DFL;
28+
const volatile bool switch_partial;
2829
const volatile u32 stall_user_nth;
2930
const volatile u32 stall_kernel_nth;
3031
const volatile s32 disallow_tgid;
@@ -239,6 +240,13 @@ s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p,
239240
return -ENOMEM;
240241
}
241242

243+
s32 BPF_STRUCT_OPS(qmap_init)
244+
{
245+
if (!switch_partial)
246+
scx_bpf_switch_all();
247+
return 0;
248+
}
249+
242250
void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
243251
{
244252
uei_record(&uei, ei);
@@ -251,6 +259,7 @@ struct sched_ext_ops qmap_ops = {
251259
.dequeue = (void *)qmap_dequeue,
252260
.dispatch = (void *)qmap_dispatch,
253261
.prep_enable = (void *)qmap_prep_enable,
262+
.init = (void *)qmap_init,
254263
.exit = (void *)qmap_exit,
255264
.timeout_ms = 5000U,
256265
.name = "qmap",

tools/sched_ext/scx_qmap.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,14 @@ const char help_fmt[] =
1818
"\n"
1919
"See the top-level comment in .bpf.c for more details.\n"
2020
"\n"
21-
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-d PID]\n"
21+
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-d PID] [-p]\n"
2222
"\n"
2323
" -s SLICE_US Override slice duration\n"
2424
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
2525
" -t COUNT Stall every COUNT'th user thread\n"
2626
" -T COUNT Stall every COUNT'th kernel thread\n"
2727
" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
28+
"  -p            Switch only tasks on SCHED_EXT policy instead of all\n"
2829
" -h Display this help and exit\n";
2930

3031
static volatile int exit_req;
@@ -48,7 +49,7 @@ int main(int argc, char **argv)
4849
skel = scx_qmap__open();
4950
SCX_BUG_ON(!skel, "Failed to open skel");
5051

51-
while ((opt = getopt(argc, argv, "s:e:t:T:d:h")) != -1) {
52+
while ((opt = getopt(argc, argv, "s:e:t:T:d:ph")) != -1) {
5253
switch (opt) {
5354
case 's':
5455
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -67,6 +68,9 @@ int main(int argc, char **argv)
6768
if (skel->rodata->disallow_tgid < 0)
6869
skel->rodata->disallow_tgid = getpid();
6970
break;
71+
case 'p':
72+
skel->rodata->switch_partial = true;
73+
break;
7074
default:
7175
fprintf(stderr, help_fmt, basename(argv[0]));
7276
return opt != 'h';

tools/sched_ext/scx_simple.bpf.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
char _license[] SEC("license") = "GPL";
1717

18+
const volatile bool switch_partial;
19+
1820
struct user_exit_info uei;
1921

2022
struct {
@@ -43,6 +45,13 @@ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
4345
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
4446
}
4547

48+
s32 BPF_STRUCT_OPS(simple_init)
49+
{
50+
if (!switch_partial)
51+
scx_bpf_switch_all();
52+
return 0;
53+
}
54+
4655
void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
4756
{
4857
uei_record(&uei, ei);
@@ -51,6 +60,7 @@ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
5160
SEC(".struct_ops.link")
5261
struct sched_ext_ops simple_ops = {
5362
.enqueue = (void *)simple_enqueue,
63+
.init = (void *)simple_init,
5464
.exit = (void *)simple_exit,
5565
.name = "simple",
5666
};

tools/sched_ext/scx_simple.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@ const char help_fmt[] =
1717
"\n"
1818
"See the top-level comment in .bpf.c for more details.\n"
1919
"\n"
20-
"Usage: %s\n"
20+
"Usage: %s [-p]\n"
2121
"\n"
22+
"  -p            Switch only tasks on SCHED_EXT policy instead of all\n"
2223
" -h Display this help and exit\n";
2324

2425
static volatile int exit_req;
@@ -62,8 +63,11 @@ int main(int argc, char **argv)
6263
skel = scx_simple__open();
6364
SCX_BUG_ON(!skel, "Failed to open skel");
6465

65-
while ((opt = getopt(argc, argv, "h")) != -1) {
66+
while ((opt = getopt(argc, argv, "ph")) != -1) {
6667
switch (opt) {
68+
case 'p':
69+
skel->rodata->switch_partial = true;
70+
break;
6771
default:
6872
fprintf(stderr, help_fmt, basename(argv[0]));
6973
return opt != 'h';

0 commit comments

Comments
 (0)