Skip to content

Commit 5165e4a

Browse files
authored
Merge pull request #26 from delphix/master
Merge branch 'master' into '6.0/stage'
2 parents d5e0f27 + 4f17842 commit 5165e4a

21 files changed

+909
-665
lines changed

docs/case_studies.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
Case Studies
2+
============
3+
4+
These are writeups of real-world problems solved with drgn.
5+
6+
.. toctree::
7+
8+
case_studies/kyber_stack_trace.rst
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
Using Stack Trace Variables to Find a Kyber Bug
2+
===============================================
3+
4+
| Author: Omar Sandoval
5+
| Date: June 9th, 2021
6+
7+
.. highlight:: pycon
8+
9+
Jakub Kicinski reported a crash in the `Kyber I/O scheduler
10+
<https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/block/kyber-iosched.c>`_
11+
when he was testing Linux 5.12. He captured a core dump and asked me to debug
12+
it. This is a quick writeup of that investigation.
13+
14+
First, we can get the task that crashed::
15+
16+
>>> task = per_cpu(prog["runqueues"], prog["crashing_cpu"]).curr
17+
18+
Then, we can get its stack trace::
19+
20+
>>> trace = prog.stack_trace(task)
21+
>>> trace
22+
#0 queued_spin_lock_slowpath (../kernel/locking/qspinlock.c:471:3)
23+
#1 queued_spin_lock (../include/asm-generic/qspinlock.h:85:2)
24+
#2 do_raw_spin_lock (../kernel/locking/spinlock_debug.c:113:2)
25+
#3 spin_lock (../include/linux/spinlock.h:354:2)
26+
#4 kyber_bio_merge (../block/kyber-iosched.c:573:2)
27+
#5 blk_mq_sched_bio_merge (../block/blk-mq-sched.h:37:9)
28+
#6 blk_mq_submit_bio (../block/blk-mq.c:2182:6)
29+
#7 __submit_bio_noacct_mq (../block/blk-core.c:1015:9)
30+
#8 submit_bio_noacct (../block/blk-core.c:1048:10)
31+
#9 submit_bio (../block/blk-core.c:1125:9)
32+
#10 submit_stripe_bio (../fs/btrfs/volumes.c:6553:2)
33+
#11 btrfs_map_bio (../fs/btrfs/volumes.c:6642:3)
34+
#12 btrfs_submit_data_bio (../fs/btrfs/inode.c:2440:8)
35+
#13 submit_one_bio (../fs/btrfs/extent_io.c:175:9)
36+
#14 submit_extent_page (../fs/btrfs/extent_io.c:3229:10)
37+
#15 __extent_writepage_io (../fs/btrfs/extent_io.c:3793:9)
38+
#16 __extent_writepage (../fs/btrfs/extent_io.c:3872:8)
39+
#17 extent_write_cache_pages (../fs/btrfs/extent_io.c:4514:10)
40+
#18 extent_writepages (../fs/btrfs/extent_io.c:4635:8)
41+
#19 do_writepages (../mm/page-writeback.c:2352:10)
42+
#20 __writeback_single_inode (../fs/fs-writeback.c:1467:8)
43+
#21 writeback_sb_inodes (../fs/fs-writeback.c:1732:3)
44+
#22 __writeback_inodes_wb (../fs/fs-writeback.c:1801:12)
45+
#23 wb_writeback (../fs/fs-writeback.c:1907:15)
46+
#24 wb_check_background_flush (../fs/fs-writeback.c:1975:10)
47+
#25 wb_do_writeback (../fs/fs-writeback.c:2063:11)
48+
#26 wb_workfn (../fs/fs-writeback.c:2091:20)
49+
#27 process_one_work (../kernel/workqueue.c:2275:2)
50+
#28 worker_thread (../kernel/workqueue.c:2421:4)
51+
#29 kthread (../kernel/kthread.c:292:9)
52+
#30 ret_from_fork+0x1f/0x2a (../arch/x86/entry/entry_64.S:294)
53+
54+
It looks like ``kyber_bio_merge()`` tried to lock an invalid spinlock. For
55+
reference, this is the source code of ``kyber_bio_merge()``:
56+
57+
.. code-block:: c
58+
:lineno-start: 563
59+
60+
static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
61+
unsigned int nr_segs)
62+
{
63+
struct kyber_hctx_data *khd = hctx->sched_data;
64+
struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
65+
struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
66+
unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
67+
struct list_head *rq_list = &kcq->rq_list[sched_domain];
68+
bool merged;
69+
70+
spin_lock(&kcq->lock);
71+
merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs);
72+
spin_unlock(&kcq->lock);
73+
74+
return merged;
75+
}
76+
77+
When printed, the ``kcq`` structure containing the spinlock indeed looks like
78+
garbage (omitted for brevity).
79+
80+
A crash course on the Linux kernel block layer: for each block device, there is
81+
a "software queue" (``struct blk_mq_ctx *ctx``) for each CPU and a "hardware
82+
queue" (``struct blk_mq_hw_ctx *hctx``) for each I/O queue provided by the
83+
device. Each hardware queue has one or more software queues assigned to it.
84+
Kyber keeps additional data per hardware queue (``struct kyber_hctx_data
85+
*khd``) and per software queue (``struct kyber_ctx_queue *kcq``).
86+
87+
Let's try to figure out where the bad ``kcq`` came from. It should be an
88+
element of the ``khd->kcqs`` array (``khd`` is optimized out, but we can
89+
recover it from ``hctx->sched_data``)::
90+
91+
>>> trace[4]["khd"]
92+
(struct kyber_hctx_data *)<absent>
93+
>>> hctx = trace[4]["hctx"]
94+
>>> khd = cast("struct kyber_hctx_data *", hctx.sched_data)
95+
>>> trace[4]["kcq"] - khd.kcqs
96+
(ptrdiff_t)1
97+
>>> hctx.nr_ctx
98+
(unsigned short)1
99+
100+
So the ``kcq`` is for the second software queue, but the hardware queue is only
101+
supposed to have one software queue. Let's see which CPU was assigned to the
102+
hardware queue::
103+
104+
>>> hctx.ctxs[0].cpu
105+
(unsigned int)6
106+
107+
Here's the problem: we're not running on CPU 6, we're running on CPU 19::
108+
109+
>>> prog["crashing_cpu"]
110+
(int)19
111+
112+
And CPU 19 is assigned to a different hardware queue that actually does have
113+
two software queues::
114+
115+
>>> ctx = per_cpu_ptr(hctx.queue.queue_ctx, 19)
116+
>>> other_hctx = ctx.hctxs[hctx.type]
117+
>>> other_hctx == hctx
118+
False
119+
>>> other_hctx.nr_ctx
120+
(unsigned short)2
121+
122+
The bug is that the caller gets the ``hctx`` for the current CPU, then
123+
``kyber_bio_merge()`` gets the ``ctx`` for the current CPU, and if the thread
124+
is migrated to another CPU in between, they won't match. The fix is to get a
125+
consistent view of the ``hctx`` and ``ctx``. The commit that fixes this is
126+
`here
127+
<https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=efed9a3337e341bd0989161b97453b52567bc59d>`_.

docs/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,4 @@ Table of Contents
3636
advanced_usage
3737
api_reference
3838
helpers
39+
case_studies

docs/user_guide.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ explicitly::
317317
Next Steps
318318
----------
319319

320-
Refer to the :doc:`api_reference`. Look through the :doc:`helpers`. Browse
321-
through the official `examples
320+
Refer to the :doc:`api_reference`. Look through the :doc:`helpers`. Read some
321+
:doc:`case_studies`. Browse through the official `examples
322322
<https://github.com/osandov/drgn/tree/main/examples>`_ and `tools
323323
<https://github.com/osandov/drgn/tree/main/tools>`_.

libdrgn/Makefile.am

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ libdrgnimpl_la_SOURCES = $(ARCH_DEFS:.defs=.c) \
7070
register_state.h \
7171
serialize.c \
7272
serialize.h \
73-
siphash.h \
7473
splay_tree.c \
7574
stack_trace.c \
7675
stack_trace.h \

libdrgn/binary_buffer.h

Lines changed: 65 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@
1414

1515
#include <assert.h>
1616
#include <byteswap.h>
17+
#include <inttypes.h>
1718
#include <stdbool.h>
1819
#include <stddef.h>
19-
#include <stdint.h>
2020
#include <string.h>
2121

2222
#include "util.h"
@@ -138,11 +138,11 @@ static inline bool binary_buffer_has_next(struct binary_buffer *bb)
138138
}
139139

140140
static inline struct drgn_error *
141-
binary_buffer_check_bounds(struct binary_buffer *bb, size_t n)
141+
binary_buffer_check_bounds(struct binary_buffer *bb, uint64_t n)
142142
{
143143
if (unlikely(bb->end - bb->pos < n)) {
144144
return binary_buffer_error_at(bb, bb->pos,
145-
"expected at least %zu byte%s, have %td",
145+
"expected at least %" PRIu64 " byte%s, have %td",
146146
n, n == 1 ? "" : "s",
147147
bb->end - bb->pos);
148148
}
@@ -151,7 +151,7 @@ binary_buffer_check_bounds(struct binary_buffer *bb, size_t n)
151151

152152
/** Advance the current buffer position by @p n bytes. */
153153
static inline struct drgn_error *binary_buffer_skip(struct binary_buffer *bb,
154-
size_t n)
154+
uint64_t n)
155155
{
156156
struct drgn_error *err;
157157
if ((err = binary_buffer_check_bounds(bb, n)))
@@ -347,59 +347,102 @@ binary_buffer_next_sint(struct binary_buffer *bb, size_t size, int64_t *ret)
347347
static inline struct drgn_error *
348348
binary_buffer_next_uleb128(struct binary_buffer *bb, uint64_t *ret)
349349
{
350-
int shift = 0;
351350
uint64_t value = 0;
352351
const char *pos = bb->pos;
353-
while (likely(pos < bb->end)) {
354-
uint8_t byte = *(uint8_t *)(pos++);
355-
if (unlikely(shift == 63 && byte > 1)) {
352+
uint8_t byte;
353+
/* No overflow possible for the first 9 bytes. */
354+
for (int shift = 0; shift < 63; shift += 7) {
355+
if (unlikely(pos >= bb->end)) {
356+
oob:
356357
return binary_buffer_error_at(bb, bb->pos,
357-
"ULEB128 number overflows unsigned 64-bit integer");
358+
"expected ULEB128 number");
358359
}
360+
byte = *(uint8_t *)(pos++);
359361
value |= (uint64_t)(byte & 0x7f) << shift;
360-
shift += 7;
361362
if (!(byte & 0x80)) {
363+
done:
362364
bb->prev = bb->pos;
363365
bb->pos = pos;
364366
*ret = value;
365367
return NULL;
366368
}
367369
}
368-
return binary_buffer_error_at(bb, bb->pos, "expected ULEB128 number");
370+
/* The 10th byte must be 0 or 1. */
371+
if (unlikely(pos >= bb->end))
372+
goto oob;
373+
byte = *(uint8_t *)(pos++);
374+
if (byte & 0x7e) {
375+
overflow:
376+
return binary_buffer_error_at(bb, bb->pos,
377+
"ULEB128 number overflows unsigned 64-bit integer");
378+
}
379+
value |= (uint64_t)byte << 63;
380+
/* Any remaining bytes must be 0. */
381+
while (byte & 0x80) {
382+
if (unlikely(pos >= bb->end))
383+
goto oob;
384+
byte = *(uint8_t *)(pos++);
385+
if (byte & 0x7f)
386+
goto overflow;
387+
}
388+
goto done;
369389
}
370390

371391
/**
372392
* Decode a Signed Little-Endian Base 128 (SLEB128) number at the current buffer
373393
* position and advance the position.
374394
*
375-
* If the number does not fit in a @c int64_t, an error is returned.
395+
* If the number does not fit in an @c int64_t, an error is returned.
376396
*
377397
* @param[out] ret Returned value.
378398
*/
379399
static inline struct drgn_error *
380400
binary_buffer_next_sleb128(struct binary_buffer *bb, int64_t *ret)
381401
{
382-
int shift = 0;
383-
int64_t value = 0;
402+
uint64_t value = 0;
384403
const char *pos = bb->pos;
385-
while (likely(pos < bb->end)) {
386-
uint8_t byte = *(uint8_t *)(pos++);
387-
if (unlikely(shift == 63 && byte != 0 && byte != 0x7f)) {
404+
uint8_t byte;
405+
/* No overflow possible for the first 9 bytes. */
406+
for (int shift = 0; shift < 63; shift += 7) {
407+
if (unlikely(pos >= bb->end)) {
408+
oob:
388409
return binary_buffer_error_at(bb, bb->pos,
389-
"SLEB128 number overflows signed 64-bit integer");
410+
"expected SLEB128 number");
390411
}
412+
byte = *(uint8_t *)(pos++);
391413
value |= (uint64_t)(byte & 0x7f) << shift;
392-
shift += 7;
393414
if (!(byte & 0x80)) {
415+
if (byte & 0x40)
416+
value |= ~(UINT64_C(1) << (shift + 7)) + 1;
417+
done:
394418
bb->prev = bb->pos;
395419
bb->pos = pos;
396-
if (shift < 64 && (byte & 0x40))
397-
value |= ~(UINT64_C(1) << shift) + 1;
398420
*ret = value;
399421
return NULL;
400422
}
401423
}
402-
return binary_buffer_error_at(bb, bb->pos, "expected SLEB128 number");
424+
/*
425+
* The least significant bit of the 10th byte must be the sign bit, and
426+
* any other bits must match it (sign extension).
427+
*/
428+
if (unlikely(pos >= bb->end))
429+
goto oob;
430+
byte = *(uint8_t *)(pos++);
431+
uint8_t sign = byte & 0x7f;
432+
if (sign != 0 && sign != 0x7f) {
433+
overflow:
434+
return binary_buffer_error_at(bb, bb->pos,
435+
"SLEB128 number overflows signed 64-bit integer");
436+
}
437+
value |= (uint64_t)byte << 63;
438+
while (byte & 0x80) {
439+
if (unlikely(pos >= bb->end))
440+
goto oob;
441+
byte = *(uint8_t *)(pos++);
442+
if ((byte & 0x7f) != sign)
443+
goto overflow;
444+
}
445+
goto done;
403446
}
404447

405448
/**

0 commit comments

Comments
 (0)