
Commit 2554db9

pdxChen authored and torvalds committed
sched/wait: Break up long wake list walk
We encountered workloads that have very long wake-up lists on large systems. A waker takes a long time to traverse the entire wake list and execute all the wake functions.

We saw page wait lists that are up to 3700+ entries long in tests of large 4- and 8-socket systems. It took 0.8 sec to traverse such a list during wakeup. Any other CPU that contends for the list spinlock will spin for a long time. It is a result of the NUMA-balancing migration of hot pages that are shared by many threads.

Multiple waking CPUs are queued up behind the lock, and the last one queued has to wait until all the other CPUs have done all their wakeups.

The page wait list is traversed with interrupts disabled, which caused various problems. This was the original cause that triggered the NMI watchdog timer in: https://patchwork.kernel.org/patch/9800303/ . Only extending the NMI watchdog timer there helped.

This patch bookmarks the waker's scan position in the wake list and breaks up the wakeup walk, to allow access to the list before the waker resumes its walk down the rest of the wait list. It lowers the interrupt and rescheduling latency.

This patch also provides a performance boost when combined with the next patch to break up the page wakeup list walk. We saw a 22% improvement in the will-it-scale file pread2 test on a Xeon Phi system running 256 threads.

[ v2: Merged in Linus' changes to remove the bookmark_wake_function, and simplify access to flags. ]

Reported-by: Kan Liang <[email protected]>
Tested-by: Kan Liang <[email protected]>
Signed-off-by: Tim Chen <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 46c1e79 commit 2554db9
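The core mechanism is easy to see outside the kernel. What follows is a minimal user-space sketch of the same idea, not the kernel API: the names are hypothetical, a pthread mutex stands in for the wait-queue spinlock, and the exclusive-wakeup accounting is left out. A walker wakes a bounded chunk of entries, parks a dummy bookmark node where it stopped, drops the lock so contending threads can get at the list, then re-takes the lock and resumes from the bookmark.

#include <pthread.h>

#define FLAG_BOOKMARK  0x04   /* hypothetical analogue of WQ_FLAG_BOOKMARK */
#define WALK_BREAK_CNT 64     /* analogue of WAITQUEUE_WALK_BREAK_CNT */

struct node {                             /* stand-in for wait_queue_entry */
        unsigned flags;
        struct node *prev, *next;         /* circular list, like list_head */
};

struct waitq {                            /* stand-in for wait_queue_head */
        pthread_mutex_t lock;             /* a spinlock, in the kernel */
        struct node head;                 /* sentinel; head.next is the first entry */
};

static void node_del(struct node *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
}

static void node_add_before(struct node *n, struct node *pos)
{
        n->prev = pos->prev;
        n->next = pos;
        pos->prev->next = n;
        pos->prev = n;
}

/*
 * One locked section: wake at most WALK_BREAK_CNT entries, then park the
 * bookmark where the walk stopped.  Returns nonzero while unfinished.
 */
static int walk_chunk(struct waitq *q, struct node *bookmark,
                      void (*wake_fn)(struct node *))
{
        struct node *curr, *next;
        int cnt = 0;

        pthread_mutex_lock(&q->lock);
        if (bookmark->flags & FLAG_BOOKMARK) {
                curr = bookmark->next;            /* resume past the bookmark */
                node_del(bookmark);
                bookmark->flags = 0;
        } else {
                curr = q->head.next;              /* fresh walk from the front */
        }

        for (; curr != &q->head; curr = next) {
                next = curr->next;                /* wake_fn may unlink curr */
                if (curr->flags & FLAG_BOOKMARK)  /* skip other walkers' bookmarks */
                        continue;
                wake_fn(curr);
                if (++cnt > WALK_BREAK_CNT && next != &q->head) {
                        bookmark->flags = FLAG_BOOKMARK;
                        node_add_before(bookmark, next); /* park where we stopped */
                        break;
                }
        }
        pthread_mutex_unlock(&q->lock);           /* contenders can run here */
        return bookmark->flags & FLAG_BOOKMARK;
}

/* Analogue of the patch's __wake_up_common_lock(): walk in bounded chunks. */
static void wake_all(struct waitq *q, void (*wake_fn)(struct node *))
{
        struct node bookmark = { 0, NULL, NULL };

        while (walk_chunk(q, &bookmark, wake_fn))
                ;                                 /* resume until the list is done */
}

The interesting property is what happens between chunks: any thread spinning on q->lock (in the kernel, with interrupts disabled) gets a chance to run before the walk resumes, which is exactly the latency bound the patch is after.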

File tree

2 files changed: +64 −15 lines changed


include/linux/wait.h

Lines changed: 1 addition & 0 deletions

@@ -18,6 +18,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int
 /* wait_queue_entry::flags */
 #define WQ_FLAG_EXCLUSIVE	0x01
 #define WQ_FLAG_WOKEN		0x02
+#define WQ_FLAG_BOOKMARK	0x04
 
 /*
  * A single wait-queue entry structure:
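(The new flag marks a dummy wait_queue_entry that only records a walker's position in the queue; as the kernel/sched/wait.c changes below show, such an entry carries no wake function and must be skipped by anyone walking the list.)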

kernel/sched/wait.c

Lines changed: 63 additions & 15 deletions

@@ -53,6 +53,12 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry
 }
 EXPORT_SYMBOL(remove_wait_queue);
 
+/*
+ * Scan threshold to break wait queue walk.
+ * This allows a waker to take a break from holding the
+ * wait queue lock during the wait queue walk.
+ */
+#define WAITQUEUE_WALK_BREAK_CNT 64
 
 /*
  * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
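To put the threshold in perspective with the numbers from the commit message (a back-of-envelope estimate, assuming wake-function cost is roughly uniform across entries): a 3700-entry list broken up every 64 entries means roughly 3700 / 64 ≈ 58 locked sections, so each interrupts-off lock hold is on the order of 0.8 s / 58 ≈ 14 ms instead of the full 0.8 s.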
@@ -63,18 +69,67 @@ EXPORT_SYMBOL(remove_wait_queue);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
-			int nr_exclusive, int wake_flags, void *key)
+static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
+			int nr_exclusive, int wake_flags, void *key,
+			wait_queue_entry_t *bookmark)
 {
 	wait_queue_entry_t *curr, *next;
+	int cnt = 0;
+
+	if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
+		curr = list_next_entry(bookmark, entry);
 
-	list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
+		list_del(&bookmark->entry);
+		bookmark->flags = 0;
+	} else
+		curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+
+	if (&curr->entry == &wq_head->head)
+		return nr_exclusive;
+
+	list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
 		unsigned flags = curr->flags;
-		int ret = curr->func(curr, mode, wake_flags, key);
+		int ret;
+
+		if (flags & WQ_FLAG_BOOKMARK)
+			continue;
+
+		ret = curr->func(curr, mode, wake_flags, key);
 		if (ret < 0)
 			break;
 		if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
 			break;
+
+		if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
+				(&next->entry != &wq_head->head)) {
+			bookmark->flags = WQ_FLAG_BOOKMARK;
+			list_add_tail(&bookmark->entry, &next->entry);
+			break;
+		}
+	}
+	return nr_exclusive;
+}
+
+static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+			int nr_exclusive, int wake_flags, void *key)
+{
+	unsigned long flags;
+	wait_queue_entry_t bookmark;
+
+	bookmark.flags = 0;
+	bookmark.private = NULL;
+	bookmark.func = NULL;
+	INIT_LIST_HEAD(&bookmark.entry);
+
+	spin_lock_irqsave(&wq_head->lock, flags);
+	nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
+
+	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+		spin_lock_irqsave(&wq_head->lock, flags);
+		nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
+						wake_flags, key, &bookmark);
+		spin_unlock_irqrestore(&wq_head->lock, flags);
 	}
 }

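Two details of the hunk above are worth noting. __wake_up_common() now returns the remaining nr_exclusive count so that the exclusive-wakeup budget carries across lock drops: each resumed chunk continues counting down where the previous one stopped. And because the bookmark is an on-stack dummy entry (its func is set to NULL), the walk loop must skip any entry flagged WQ_FLAG_BOOKMARK, since another waker's bookmark may be parked anywhere in the list.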
@@ -91,11 +146,7 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
 void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
 			int nr_exclusive, void *key)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&wq_head->lock, flags);
-	__wake_up_common(wq_head, mode, nr_exclusive, 0, key);
-	spin_unlock_irqrestore(&wq_head->lock, flags);
+	__wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
 }
 EXPORT_SYMBOL(__wake_up);

@@ -104,13 +155,13 @@ EXPORT_SYMBOL(__wake_up);
  */
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
 {
-	__wake_up_common(wq_head, mode, nr, 0, NULL);
+	__wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
 {
-	__wake_up_common(wq_head, mode, 1, 0, key);
+	__wake_up_common(wq_head, mode, 1, 0, key, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);

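Note that the _locked variants above keep calling __wake_up_common() directly and pass a NULL bookmark: their callers already hold wq_head->lock, so the lock cannot be dropped and re-taken on their behalf, and these paths never break up the walk.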
@@ -134,7 +185,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
 			int nr_exclusive, void *key)
 {
-	unsigned long flags;
 	int wake_flags = 1; /* XXX WF_SYNC */
 
 	if (unlikely(!wq_head))
@@ -143,9 +193,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
 	if (unlikely(nr_exclusive != 1))
 		wake_flags = 0;
 
-	spin_lock_irqsave(&wq_head->lock, flags);
-	__wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key);
-	spin_unlock_irqrestore(&wq_head->lock, flags);
+	__wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
