
Commit 178331f

axboe authored and gregkh committed
io-wq: remove GFP_ATOMIC allocation off schedule out path
[ Upstream commit d3e9f73 ]

Daniel reports that the v5.14-rc4-rt4 kernel throws a BUG when running
stress-ng:

| [ 90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
| [ 90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
| [ 90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G W 5.14.0-rc4-rt4+ #89
| [ 90.202559] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
| [ 90.202561] Call Trace:
| [ 90.202577]  dump_stack_lvl+0x34/0x44
| [ 90.202584]  ___might_sleep.cold+0x87/0x94
| [ 90.202588]  rt_spin_lock+0x19/0x70
| [ 90.202593]  ___slab_alloc+0xcb/0x7d0
| [ 90.202598]  ? newidle_balance.constprop.0+0xf5/0x3b0
| [ 90.202603]  ? dequeue_entity+0xc3/0x290
| [ 90.202605]  ? io_wqe_dec_running.isra.0+0x98/0xe0
| [ 90.202610]  ? pick_next_task_fair+0xb9/0x330
| [ 90.202612]  ? __schedule+0x670/0x1410
| [ 90.202615]  ? io_wqe_dec_running.isra.0+0x98/0xe0
| [ 90.202618]  kmem_cache_alloc_trace+0x79/0x1f0
| [ 90.202621]  io_wqe_dec_running.isra.0+0x98/0xe0
| [ 90.202625]  io_wq_worker_sleeping+0x37/0x50
| [ 90.202628]  schedule+0x30/0xd0
| [ 90.202630]  schedule_timeout+0x8f/0x1a0
| [ 90.202634]  ? __bpf_trace_tick_stop+0x10/0x10
| [ 90.202637]  io_wqe_worker+0xfd/0x320
| [ 90.202641]  ? finish_task_switch.isra.0+0xd3/0x290
| [ 90.202644]  ? io_worker_handle_work+0x670/0x670
| [ 90.202646]  ? io_worker_handle_work+0x670/0x670
| [ 90.202649]  ret_from_fork+0x22/0x30

which is due to the RT kernel not liking a GFP_ATOMIC allocation inside
a raw spinlock. Besides that not working on RT, doing any kind of
allocation from inside schedule() is kind of nasty and should be avoided
if at all possible.

This particular path happens when an io-wq worker goes to sleep, and we
need a new worker to handle pending work. We currently allocate a small
data item to hold the information we need to create a new worker, but we
can instead include this data in the io_worker struct itself and just
protect it with a single bit lock. We only really need one per worker
anyway, as we will have run pending work between two sleep cycles.

https://lore.kernel.org/lkml/[email protected]/

Reported-by: Daniel Wagner <[email protected]>
Tested-by: Daniel Wagner <[email protected]>
Acked-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
Signed-off-by: Sasha Levin <[email protected]>
1 parent a0bfcc2 commit 178331f
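
For context on the approach described above (a per-worker slot guarded by a single bit lock instead of a GFP_ATOMIC allocation), here is a minimal userspace sketch of the same ownership scheme. It is an illustration only, not the kernel code: the struct and function names (fake_worker, queue_create, run_create_cb) are invented for this example, and the kernel uses test_and_set_bit_lock()/clear_bit_unlock() on worker->create_state rather than C11 atomics.

/*
 * Illustrative userspace model of the ownership scheme described above,
 * written with C11 atomics instead of the kernel's bitops. The names
 * fake_worker, queue_create and run_create_cb are invented for this
 * sketch; only the idea mirrors worker->create_state in the patch.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_worker {
	atomic_ulong create_state;	/* bit 0: create request pending/claimed */
	int create_index;		/* payload, only valid while bit 0 is held */
};

/* Sleep side: queue at most one create request, with no allocation at all. */
static bool queue_create(struct fake_worker *w, int index)
{
	/*
	 * Cheap test first, then claim with acquire semantics, analogous to
	 * test_bit() followed by test_and_set_bit_lock() in the patch.
	 */
	if (atomic_load_explicit(&w->create_state, memory_order_relaxed) & 1UL)
		return false;
	if (atomic_fetch_or_explicit(&w->create_state, 1UL,
				     memory_order_acquire) & 1UL)
		return false;
	w->create_index = index;	/* safe: this thread now owns the slot */
	return true;
}

/*
 * Wake side: consume the request, then release the slot, analogous to
 * clear_bit_unlock() at the end of create_worker_cb().
 */
static void run_create_cb(struct fake_worker *w)
{
	printf("create worker for acct index %d\n", w->create_index);
	atomic_fetch_and_explicit(&w->create_state, ~1UL, memory_order_release);
}

int main(void)
{
	struct fake_worker w;

	atomic_init(&w.create_state, 0);
	w.create_index = -1;

	if (queue_create(&w, 0))	/* accepted: the slot was free */
		run_create_cb(&w);
	if (queue_create(&w, 1))	/* accepted again: the callback released it */
		run_create_cb(&w);
	return 0;
}

Built with any C11 compiler, this prints one create message per request because the callback hands the slot back before the second request is made; a request issued while the bit is still set is simply dropped, mirroring the "only one pending create per worker" behavior the commit message relies on.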

File tree

1 file changed: 40 additions, 32 deletions


fs/io-wq.c

Lines changed: 40 additions & 32 deletions
@@ -53,6 +53,10 @@ struct io_worker {
 
 	struct completion ref_done;
 
+	unsigned long create_state;
+	struct callback_head create_work;
+	int create_index;
+
 	struct rcu_head rcu;
 };
 
@@ -273,24 +277,18 @@ static void io_wqe_inc_running(struct io_worker *worker)
 	atomic_inc(&acct->nr_running);
 }
 
-struct create_worker_data {
-	struct callback_head work;
-	struct io_wqe *wqe;
-	int index;
-};
-
 static void create_worker_cb(struct callback_head *cb)
 {
-	struct create_worker_data *cwd;
+	struct io_worker *worker;
 	struct io_wq *wq;
 	struct io_wqe *wqe;
 	struct io_wqe_acct *acct;
 	bool do_create = false, first = false;
 
-	cwd = container_of(cb, struct create_worker_data, work);
-	wqe = cwd->wqe;
+	worker = container_of(cb, struct io_worker, create_work);
+	wqe = worker->wqe;
 	wq = wqe->wq;
-	acct = &wqe->acct[cwd->index];
+	acct = &wqe->acct[worker->create_index];
 	raw_spin_lock_irq(&wqe->lock);
 	if (acct->nr_workers < acct->max_workers) {
 		if (!acct->nr_workers)
@@ -300,33 +298,42 @@ static void create_worker_cb(struct callback_head *cb)
 	}
 	raw_spin_unlock_irq(&wqe->lock);
 	if (do_create) {
-		create_io_worker(wq, wqe, cwd->index, first);
+		create_io_worker(wq, wqe, worker->create_index, first);
 	} else {
 		atomic_dec(&acct->nr_running);
 		io_worker_ref_put(wq);
 	}
-	kfree(cwd);
+	clear_bit_unlock(0, &worker->create_state);
+	io_worker_release(worker);
 }
 
-static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker,
+				   struct io_wqe_acct *acct)
 {
-	struct create_worker_data *cwd;
 	struct io_wq *wq = wqe->wq;
 
 	/* raced with exit, just ignore create call */
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
 		goto fail;
+	if (!io_worker_get(worker))
+		goto fail;
+	/*
+	 * create_state manages ownership of create_work/index. We should
+	 * only need one entry per worker, as the worker going to sleep
+	 * will trigger the condition, and waking will clear it once it
+	 * runs the task_work.
+	 */
+	if (test_bit(0, &worker->create_state) ||
+	    test_and_set_bit_lock(0, &worker->create_state))
+		goto fail_release;
 
-	cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC);
-	if (cwd) {
-		init_task_work(&cwd->work, create_worker_cb);
-		cwd->wqe = wqe;
-		cwd->index = acct->index;
-		if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL))
-			return;
-
-		kfree(cwd);
-	}
+	init_task_work(&worker->create_work, create_worker_cb);
+	worker->create_index = acct->index;
+	if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+		return;
+	clear_bit_unlock(0, &worker->create_state);
+fail_release:
+	io_worker_release(worker);
 fail:
 	atomic_dec(&acct->nr_running);
 	io_worker_ref_put(wq);
@@ -344,7 +351,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
 	if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
 		atomic_inc(&acct->nr_running);
 		atomic_inc(&wqe->wq->worker_refs);
-		io_queue_worker_create(wqe, acct);
+		io_queue_worker_create(wqe, worker, acct);
 	}
 }
 
@@ -1010,12 +1017,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 
 static bool io_task_work_match(struct callback_head *cb, void *data)
 {
-	struct create_worker_data *cwd;
+	struct io_worker *worker;
 
 	if (cb->func != create_worker_cb)
 		return false;
-	cwd = container_of(cb, struct create_worker_data, work);
-	return cwd->wqe->wq == data;
+	worker = container_of(cb, struct io_worker, create_work);
+	return worker->wqe->wq == data;
 }
 
 void io_wq_exit_start(struct io_wq *wq)
@@ -1032,12 +1039,13 @@ static void io_wq_exit_workers(struct io_wq *wq)
 		return;
 
 	while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
-		struct create_worker_data *cwd;
+		struct io_worker *worker;
 
-		cwd = container_of(cb, struct create_worker_data, work);
-		atomic_dec(&cwd->wqe->acct[cwd->index].nr_running);
+		worker = container_of(cb, struct io_worker, create_work);
+		atomic_dec(&worker->wqe->acct[worker->create_index].nr_running);
 		io_worker_ref_put(wq);
-		kfree(cwd);
+		clear_bit_unlock(0, &worker->create_state);
+		io_worker_release(worker);
 	}
 
 	rcu_read_lock();
