delphix
diff --git a/docs/case_studies.rst b/docs/case_studies.rst
Lines changed: 8 additions & 0 deletions
diff --git a/docs/case_studies/kyber_stack_trace.rst b/docs/case_studies/kyber_stack_trace.rst
Lines changed: 127 additions & 0 deletions
diff --git a/docs/index.rst b/docs/index.rst
Lines changed: 1 addition & 0 deletions
diff --git a/docs/user_guide.rst b/docs/user_guide.rst
Lines changed: 2 additions & 2 deletions
diff --git a/libdrgn/Makefile.am b/libdrgn/Makefile.am
Lines changed: 0 additions & 1 deletion
diff --git a/libdrgn/binary_buffer.h b/libdrgn/binary_buffer.h
Lines changed: 65 additions & 22 deletions
@@ -0,0 +1,8 @@
+Case Studies
+============
+
+These are writeups of real-world problems solved with drgn.
+
+.. toctree::
+
+    case_studies/kyber_stack_trace.rst
@@ -0,0 +1,127 @@
+Using Stack Trace Variables to Find a Kyber Bug
+===============================================
+
+| Author: Omar Sandoval
+| Date: June 9th, 2021
+
+.. highlight:: pycon
+
+Jakub Kicinski reported a crash in the `Kyber I/O scheduler
+<https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/block/kyber-iosched.c>`_
+when he was testing Linux 5.12. He captured a core dump and asked me to debug
+it. This is a quick writeup of that investigation.
+
+First, we can get the task that crashed::
+
+    >>> task = per_cpu(prog["runqueues"], prog["crashing_cpu"]).curr
+
+Then, we can get its stack trace::
+
+    >>> trace = prog.stack_trace(task)
+    >>> trace
+    #0  queued_spin_lock_slowpath (../kernel/locking/qspinlock.c:471:3)
+    #1  queued_spin_lock (../include/asm-generic/qspinlock.h:85:2)
+    #2  do_raw_spin_lock (../kernel/locking/spinlock_debug.c:113:2)
+    #3  spin_lock (../include/linux/spinlock.h:354:2)
+    #4  kyber_bio_merge (../block/kyber-iosched.c:573:2)
+    #5  blk_mq_sched_bio_merge (../block/blk-mq-sched.h:37:9)
+    #6  blk_mq_submit_bio (../block/blk-mq.c:2182:6)
+    #7  __submit_bio_noacct_mq (../block/blk-core.c:1015:9)
+    #8  submit_bio_noacct (../block/blk-core.c:1048:10)
+    #9  submit_bio (../block/blk-core.c:1125:9)
+    #10 submit_stripe_bio (../fs/btrfs/volumes.c:6553:2)
+    #11 btrfs_map_bio (../fs/btrfs/volumes.c:6642:3)
+    #12 btrfs_submit_data_bio (../fs/btrfs/inode.c:2440:8)
+    #13 submit_one_bio (../fs/btrfs/extent_io.c:175:9)
+    #14 submit_extent_page (../fs/btrfs/extent_io.c:3229:10)
+    #15 __extent_writepage_io (../fs/btrfs/extent_io.c:3793:9)
+    #16 __extent_writepage (../fs/btrfs/extent_io.c:3872:8)
+    #17 extent_write_cache_pages (../fs/btrfs/extent_io.c:4514:10)
+    #18 extent_writepages (../fs/btrfs/extent_io.c:4635:8)
+    #19 do_writepages (../mm/page-writeback.c:2352:10)
+    #20 __writeback_single_inode (../fs/fs-writeback.c:1467:8)
+    #21 writeback_sb_inodes (../fs/fs-writeback.c:1732:3)
+    #22 __writeback_inodes_wb (../fs/fs-writeback.c:1801:12)
+    #23 wb_writeback (../fs/fs-writeback.c:1907:15)
+    #24 wb_check_background_flush (../fs/fs-writeback.c:1975:10)
+    #25 wb_do_writeback (../fs/fs-writeback.c:2063:11)
+    #26 wb_workfn (../fs/fs-writeback.c:2091:20)
+    #27 process_one_work (../kernel/workqueue.c:2275:2)
+    #28 worker_thread (../kernel/workqueue.c:2421:4)
+    #29 kthread (../kernel/kthread.c:292:9)
+    #30 ret_from_fork+0x1f/0x2a (../arch/x86/entry/entry_64.S:294)
+
+It looks like ``kyber_bio_merge()`` tried to lock an invalid spinlock. For
+reference, this is the source code of ``kyber_bio_merge()``:
+
+.. code-block:: c
+   :lineno-start: 563
+
+   static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
+   			       unsigned int nr_segs)
+   {
+           struct kyber_hctx_data *khd = hctx->sched_data;
+           struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
+           struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
+           unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
+           struct list_head *rq_list = &kcq->rq_list[sched_domain];
+           bool merged;
+
+           spin_lock(&kcq->lock);
+           merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs);
+           spin_unlock(&kcq->lock);
+
+           return merged;
+   }
+
+When printed, the ``kcq`` structure containing the spinlock indeed looks like
+garbage (omitted for brevity).
+
+A crash course on the Linux kernel block layer: for each block device, there is
+a "software queue" (``struct blk_mq_ctx *ctx``) for each CPU and a "hardware
+queue" (``struct blk_mq_hw_ctx *hctx``) for each I/O queue provided by the
+device. Each hardware queue has one or more software queues assigned to it.
+Kyber keeps additional data per hardware queue (``struct kyber_hctx_data
+*khd``) and per software queue (``struct kyber_ctx_queue *kcq``).
+
+Let's try to figure out where the bad ``kcq`` came from. It should be an
+element of the ``khd->kcqs`` array (``khd`` is optimized out, but we can
+recover it from ``hctx->sched_data``)::
+
+    >>> trace[4]["khd"]
+    (struct kyber_hctx_data *)<absent>
+    >>> hctx = trace[4]["hctx"]
+    >>> khd = cast("struct kyber_hctx_data *", hctx.sched_data)
+    >>> trace[4]["kcq"] - khd.kcqs
+    (ptrdiff_t)1
+    >>> hctx.nr_ctx
+    (unsigned short)1
+
+So the ``kcq`` is at index 1, i.e., it is for the second software queue, but
+the hardware queue is only supposed to have one software queue (``nr_ctx`` is
+1). Let's see which CPU was assigned to the hardware queue::
+
+    >>> hctx.ctxs[0].cpu
+    (unsigned int)6
+
+Here's the problem: we're not running on CPU 6, we're running on CPU 19::
+
+    >>> prog["crashing_cpu"]
+    (int)19
+
+And CPU 19 is assigned to a different hardware queue that actually does have
+two software queues::
+
+    >>> ctx = per_cpu_ptr(hctx.queue.queue_ctx, 19)
+    >>> other_hctx = ctx.hctxs[hctx.type]
+    >>> other_hctx == hctx
+    False
+    >>> other_hctx.nr_ctx
+    (unsigned short)2
+
+The bug is that the caller gets the ``hctx`` for the current CPU, then
+``kyber_bio_merge()`` gets the ``ctx`` for the current CPU, and if the thread
+is migrated to another CPU in between, they won't match. The fix is to get a
+consistent view of the ``hctx`` and ``ctx``. This was fixed in Linux kernel
+`commit efed9a3337e3
+<https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=efed9a3337e341bd0989161b97453b52567bc59d>`_.
@@ -36,3 +36,4 @@ Table of Contents
     advanced_usage
     api_reference
     helpers
+    case_studies
@@ -317,7 +317,7 @@ explicitly::
 Next Steps
 ----------
 
-Refer to the :doc:`api_reference`. Look through the :doc:`helpers`. Browse
-through the official `examples
+Refer to the :doc:`api_reference`. Look through the :doc:`helpers`. Read some
+:doc:`case_studies`. Browse through the official `examples
 <https://github.com/osandov/drgn/tree/main/examples>`_ and `tools
 <https://github.com/osandov/drgn/tree/main/tools>`_.
@@ -70,7 +70,6 @@ libdrgnimpl_la_SOURCES = $(ARCH_DEFS:.defs=.c) \
 			 register_state.h \
 			 serialize.c \
 			 serialize.h \
-			 siphash.h \
 			 splay_tree.c \
 			 stack_trace.c \
 			 stack_trace.h \
 
@@ -14,9 +14,9 @@
 
 #include <assert.h>
 #include <byteswap.h>
+#include <inttypes.h>
 #include <stdbool.h>
 #include <stddef.h>
-#include <stdint.h>
 #include <string.h>
 
 #include "util.h"
@@ -138,11 +138,11 @@ static inline bool binary_buffer_has_next(struct binary_buffer *bb)
 }
 
 static inline struct drgn_error *
-binary_buffer_check_bounds(struct binary_buffer *bb, size_t n)
+binary_buffer_check_bounds(struct binary_buffer *bb, uint64_t n)
 {
 	if (unlikely(bb->end - bb->pos < n)) {
 		return binary_buffer_error_at(bb, bb->pos,
-					      "expected at least %zu byte%s, have %td",
+					      "expected at least %" PRIu64 " byte%s, have %td",
 					      n, n == 1 ? "" : "s",
 					      bb->end - bb->pos);
 	}
@@ -151,7 +151,7 @@ binary_buffer_check_bounds(struct binary_buffer *bb, size_t n)
 
 /** Advance the current buffer position by @p n bytes. */
 static inline struct drgn_error *binary_buffer_skip(struct binary_buffer *bb,
-						    size_t n)
+						    uint64_t n)
 {
 	struct drgn_error *err;
 	if ((err = binary_buffer_check_bounds(bb, n)))
@@ -347,59 +347,102 @@ binary_buffer_next_sint(struct binary_buffer *bb, size_t size, int64_t *ret)
 static inline struct drgn_error *
 binary_buffer_next_uleb128(struct binary_buffer *bb, uint64_t *ret)
 {
-	int shift = 0;
 	uint64_t value = 0;
 	const char *pos = bb->pos;
-	while (likely(pos < bb->end)) {
-		uint8_t byte = *(uint8_t *)(pos++);
-		if (unlikely(shift == 63 && byte > 1)) {
+	uint8_t byte;
+	/* The first 9 bytes hold at most 63 bits, so they cannot overflow. */
+	for (int shift = 0; shift < 63; shift += 7) {
+		if (unlikely(pos >= bb->end)) {
+oob:
 			return binary_buffer_error_at(bb, bb->pos,
-						      "ULEB128 number overflows unsigned 64-bit integer");
+						      "expected ULEB128 number");
 		}
+		byte = *(uint8_t *)(pos++);
 		value |= (uint64_t)(byte & 0x7f) << shift;
-		shift += 7;
 		if (!(byte & 0x80)) {
+done:
 			bb->prev = bb->pos;
 			bb->pos = pos;
 			*ret = value;
 			return NULL;
 		}
 	}
-	return binary_buffer_error_at(bb, bb->pos, "expected ULEB128 number");
+	/* The 10th byte supplies only bit 63: its payload must be 0 or 1. */
+	if (unlikely(pos >= bb->end))
+		goto oob;
+	byte = *(uint8_t *)(pos++);
+	if (byte & 0x7e) {
+overflow:
+		return binary_buffer_error_at(bb, bb->pos,
+					      "ULEB128 number overflows unsigned 64-bit integer");
+	}
+	value |= (uint64_t)byte << 63;
+	/* Any remaining bytes must have an all-zero payload (padding). */
+	while (byte & 0x80) {
+		if (unlikely(pos >= bb->end))
+			goto oob;
+		byte = *(uint8_t *)(pos++);
+		if (byte & 0x7f)
+			goto overflow;
+	}
+	goto done;
 }
 
 /**
  * Decode a Signed Little-Endian Base 128 (SLEB128) number at the current buffer
  * position and advance the position.
  *
- * If the number does not fit in a @c int64_t, an error is returned.
+ * If the number does not fit in an @c int64_t, an error is returned.
  *
  * @param[out] ret Returned value.
  */
 static inline struct drgn_error *
 binary_buffer_next_sleb128(struct binary_buffer *bb, int64_t *ret)
 {
-	int shift = 0;
-	int64_t value = 0;
+	uint64_t value = 0;
 	const char *pos = bb->pos;
-	while (likely(pos < bb->end)) {
-		uint8_t byte = *(uint8_t *)(pos++);
-		if (unlikely(shift == 63 && byte != 0 && byte != 0x7f)) {
+	uint8_t byte;
+	/* The first 9 bytes hold at most 63 bits, so they cannot overflow. */
+	for (int shift = 0; shift < 63; shift += 7) {
+		if (unlikely(pos >= bb->end)) {
+oob:
 			return binary_buffer_error_at(bb, bb->pos,
-						      "SLEB128 number overflows signed 64-bit integer");
+						      "expected SLEB128 number");
 		}
+		byte = *(uint8_t *)(pos++);
 		value |= (uint64_t)(byte & 0x7f) << shift;
-		shift += 7;
 		if (!(byte & 0x80)) {
+			if (byte & 0x40)
+				value |= ~(UINT64_C(1) << (shift + 7)) + 1;
+done:
 			bb->prev = bb->pos;
 			bb->pos = pos;
-			if (shift < 64 && (byte & 0x40))
-				value |= ~(UINT64_C(1) << shift) + 1;
 			*ret = value;
 			return NULL;
 		}
 	}
-	return binary_buffer_error_at(bb, bb->pos, "expected SLEB128 number");
+	/*
+	 * Bit 0 of the 10th byte becomes bit 63 (the sign bit). Its other
+	 * payload bits, and those of any further bytes, must sign-extend it.
+	 */
+	if (unlikely(pos >= bb->end))
+		goto oob;
+	byte = *(uint8_t *)(pos++);
+	uint8_t sign = byte & 0x7f;
+	if (sign != 0 && sign != 0x7f) {
+overflow:
+		return binary_buffer_error_at(bb, bb->pos,
+					      "SLEB128 number overflows signed 64-bit integer");
+	}
+	value |= (uint64_t)byte << 63;
+	while (byte & 0x80) {
+		if (unlikely(pos >= bb->end))
+			goto oob;
+		byte = *(uint8_t *)(pos++);
+		if ((byte & 0x7f) != sign)
+			goto overflow;
+	}
+	goto done;
 }
 
 /**