Skip to content

Commit eea5422

Browse files
committed
libdrgn: make Linux kernel stack unwinding more robust
drgn has a couple of issues unwinding stack traces for kernel core dumps: (1) it can't unwind the stack for the idle task (PID 0), which commonly appears in core dumps; and (2) it uses the PID in PRSTATUS, which is racy and can't actually be trusted. The solution for both of these is to look up the PRSTATUS note by CPU instead of by PID. For the live kernel, drgn refuses to unwind the stack of tasks in the "R" state. However, the "R" state means running *or runnable*, and in the latter case we can still unwind the stack. The solution for this is to look at the task's on_cpu instead of its state.
1 parent 146930a commit eea5422

File tree

7 files changed

+293
-142
lines changed

7 files changed

+293
-142
lines changed

libdrgn/arch_x86_64.c.in

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,9 @@ out:
257257

258258
static struct drgn_error *
259259
linux_kernel_set_initial_registers_x86_64(Dwfl_Thread *thread,
260-
const struct drgn_object *task_obj)
260+
const struct drgn_object *task_obj,
261+
const void *prstatus,
262+
size_t prstatus_size)
261263
{
262264
struct drgn_error *err;
263265
struct drgn_program *prog = task_obj->prog;
@@ -268,8 +270,44 @@ linux_kernel_set_initial_registers_x86_64(Dwfl_Thread *thread,
268270

269271
drgn_object_init(&sp_obj, prog);
270272

271-
/*
272-
*/
273+
if (prstatus) {
274+
/*
275+
* If the stack pointer in PRSTATUS is within this task's stack,
276+
* then we can use it. Otherwise, the task either wasn't running
277+
* or was in the middle of context switching. Either way, we
278+
* should use the saved registers instead.
279+
*/
280+
uint64_t thread_size;
281+
uint64_t stack;
282+
283+
err = linux_kernel_get_thread_size(prog, &thread_size);
284+
if (err)
285+
goto out;
286+
err = drgn_object_member_dereference(&sp_obj, task_obj,
287+
"stack");
288+
if (err)
289+
goto out;
290+
err = drgn_object_read_unsigned(&sp_obj, &stack);
291+
if (err)
292+
goto out;
293+
294+
if (prstatus_size < 272) {
295+
err = drgn_error_create(DRGN_ERROR_INVALID_ARGUMENT,
296+
"registers are truncated");
297+
goto out;
298+
}
299+
memcpy(&sp, (char *)prstatus + 264, sizeof(sp));
300+
if (drgn_program_bswap(prog))
301+
sp = bswap_64(sp);
302+
if (sp > stack && sp <= stack + thread_size) {
303+
err = prstatus_set_initial_registers_x86_64(prog,
304+
thread,
305+
prstatus,
306+
prstatus_size);
307+
goto out;
308+
}
309+
}
310+
273311
err = drgn_object_member_dereference(&sp_obj, task_obj, "thread");
274312
if (err)
275313
goto out;

libdrgn/linux_kernel.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -234,8 +234,8 @@ struct drgn_error *read_vmcoreinfo_fallback(struct drgn_memory_reader *reader,
234234
return err;
235235
}
236236

237-
static struct drgn_error *
238-
linux_kernel_get_thread_size(struct drgn_program *prog, uint64_t *ret)
237+
struct drgn_error *linux_kernel_get_thread_size(struct drgn_program *prog,
238+
uint64_t *ret)
239239
{
240240
struct drgn_error *err;
241241
struct drgn_qualified_type thread_union_type;

libdrgn/linux_kernel.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ struct drgn_error *proc_kallsyms_symbol_addr(const char *name,
2424

2525
struct drgn_error *read_vmcoreinfo_fallback(struct drgn_memory_reader *reader,
2626
struct vmcoreinfo *ret);
27+
struct drgn_error *linux_kernel_get_thread_size(struct drgn_program *prog,
28+
uint64_t *ret);
2729

2830
struct drgn_error *linux_kernel_object_find(const char *name, size_t name_len,
2931
const char *filename,

libdrgn/platform.h

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,35 @@ struct drgn_architecture_info {
6565
Dwfl_Thread *,
6666
const void *,
6767
size_t);
68+
/*
69+
* Get a task's registers from the task_struct or PRSTATUS note as
70+
* appropriate.
71+
*
72+
* The given PRSTATUS note is for the CPU that the task is assigned to,
73+
* which may or may not be for the given task. This callback must
74+
* determine that (typically by checking whether the stack pointer in
75+
* PRSTATUS lies within the task's stack).
76+
*
77+
* We find the PRSTATUS note by CPU rather than by PID for two reasons:
78+
*
79+
* 1. The PID is populated by the kernel from "current" (the current
80+
* task) via a non-maskable interrupt (NMI). During a context switch,
81+
* the stack pointer and current are not updated atomically, so if
82+
* the NMI arrives in the middle of a context switch, the stack
83+
* pointer may not actually be that of current. Therefore, the stack
84+
* pointer in PRSTATUS may not actually be for the PID in PRSTATUS.
85+
*
86+
* We go through all of this trouble because blindly trusting the PID
87+
* could result in a stack trace for the wrong task, which we want to
88+
* avoid at all costs.
89+
*
90+
* 2. There is an idle task with PID 0 for each CPU, so for an idle task
91+
* we have no choice but to find the note by CPU.
92+
*/
6893
struct drgn_error *(*linux_kernel_set_initial_registers)(Dwfl_Thread *,
69-
const struct drgn_object *);
94+
const struct drgn_object *,
95+
const void *prstatus,
96+
size_t prstatus_size);
7097
struct drgn_error *(*linux_kernel_get_page_offset)(struct drgn_program *,
7198
uint64_t *);
7299
struct drgn_error *(*linux_kernel_get_vmemmap)(struct drgn_program *,

libdrgn/program.c

Lines changed: 99 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "type_index.h"
2929
#include "vector.h"
3030

31+
DEFINE_VECTOR_FUNCTIONS(drgn_prstatus_vector)
3132
DEFINE_HASH_TABLE_FUNCTIONS(drgn_prstatus_map, hash_pair_int_type,
3233
hash_table_scalar_eq)
3334

@@ -75,15 +76,19 @@ void drgn_program_init(struct drgn_program *prog,
7576
drgn_type_index_init(&prog->tindex);
7677
drgn_object_index_init(&prog->oindex);
7778
prog->core_fd = -1;
78-
drgn_prstatus_map_init(&prog->prstatus_cache);
7979
if (platform)
8080
drgn_program_set_platform(prog, platform);
8181
}
8282

8383
void drgn_program_deinit(struct drgn_program *prog)
8484
{
8585
free(prog->task_state_chars);
86-
drgn_prstatus_map_deinit(&prog->prstatus_cache);
86+
if (prog->prstatus_cached) {
87+
if (prog->flags & DRGN_PROGRAM_IS_LINUX_KERNEL)
88+
drgn_prstatus_vector_deinit(&prog->prstatus_vector);
89+
else
90+
drgn_prstatus_map_deinit(&prog->prstatus_map);
91+
}
8792
free(prog->pgtable_it);
8893

8994
drgn_object_index_deinit(&prog->oindex);
@@ -736,41 +741,64 @@ drgn_program_load_debug_info(struct drgn_program *prog, const char **paths,
736741
struct drgn_error *drgn_program_cache_prstatus_entry(struct drgn_program *prog,
737742
char *data, size_t size)
738743
{
739-
struct drgn_prstatus_map_entry entry;
740-
size_t pr_pid_offset;
741-
uint32_t pr_pid;
744+
if (prog->flags & DRGN_PROGRAM_IS_LINUX_KERNEL) {
745+
struct string *entry;
742746

743-
pr_pid_offset = drgn_program_is_64_bit(prog) ? 32 : 24;
747+
entry = drgn_prstatus_vector_append_entry(&prog->prstatus_vector);
748+
if (!entry)
749+
return &drgn_enomem;
750+
entry->str = data;
751+
entry->len = size;
752+
} else {
753+
struct drgn_prstatus_map_entry entry;
754+
size_t pr_pid_offset;
755+
uint32_t pr_pid;
744756

745-
if (size < pr_pid_offset + sizeof(pr_pid))
746-
return NULL;
757+
pr_pid_offset = drgn_program_is_64_bit(prog) ? 32 : 24;
758+
if (size < pr_pid_offset + sizeof(pr_pid))
759+
return NULL;
747760

748-
memcpy(&pr_pid, data + pr_pid_offset, sizeof(pr_pid));
749-
if (drgn_program_bswap(prog))
750-
pr_pid = bswap_32(pr_pid);
751-
if (!pr_pid)
752-
return NULL;
761+
memcpy(&pr_pid, data + pr_pid_offset, sizeof(pr_pid));
762+
if (drgn_program_bswap(prog))
763+
pr_pid = bswap_32(pr_pid);
753764

754-
entry.key = pr_pid;
755-
entry.value.str = data;
756-
entry.value.len = size;
757-
if (drgn_prstatus_map_insert(&prog->prstatus_cache, &entry,
758-
NULL) == -1) {
759-
return &drgn_enomem;
765+
entry.key = pr_pid;
766+
entry.value.str = data;
767+
entry.value.len = size;
768+
if (drgn_prstatus_map_insert(&prog->prstatus_map, &entry,
769+
NULL) == -1)
770+
return &drgn_enomem;
760771
}
761772
return NULL;
762773
}
763774

764775
static struct drgn_error *drgn_program_cache_prstatus(struct drgn_program *prog)
765776
{
777+
struct drgn_error *err;
766778
size_t phnum, i;
767779

780+
if (prog->prstatus_cached)
781+
return NULL;
782+
783+
if (prog->flags & DRGN_PROGRAM_IS_LINUX_KERNEL)
784+
drgn_prstatus_vector_init(&prog->prstatus_vector);
785+
else
786+
drgn_prstatus_map_init(&prog->prstatus_map);
787+
768788
#ifdef WITH_LIBKDUMPFILE
769-
if (prog->kdump_ctx)
770-
return drgn_program_cache_prstatus_kdump(prog);
789+
if (prog->kdump_ctx) {
790+
err = drgn_program_cache_prstatus_kdump(prog);
791+
goto out;
792+
}
771793
#endif
772-
if (elf_getphdrnum(prog->core, &phnum) != 0)
773-
return drgn_error_libelf();
794+
if (!prog->core) {
795+
err = NULL;
796+
goto out;
797+
}
798+
if (elf_getphdrnum(prog->core, &phnum) != 0) {
799+
err = drgn_error_libelf();
800+
goto out;
801+
}
774802
for (i = 0; i < phnum; i++) {
775803
GElf_Phdr phdr_mem, *phdr;
776804
Elf_Data *data;
@@ -779,23 +807,26 @@ static struct drgn_error *drgn_program_cache_prstatus(struct drgn_program *prog)
779807
size_t name_offset, desc_offset;
780808

781809
phdr = gelf_getphdr(prog->core, i, &phdr_mem);
782-
if (!phdr)
783-
return drgn_error_libelf();
810+
if (!phdr) {
811+
err = drgn_error_libelf();
812+
goto out;
813+
}
784814
if (phdr->p_type != PT_NOTE)
785815
continue;
786816

787817
data = elf_getdata_rawchunk(prog->core, phdr->p_offset,
788818
phdr->p_filesz,
789819
note_header_type(phdr));
790-
if (!data)
791-
return drgn_error_libelf();
820+
if (!data) {
821+
err = drgn_error_libelf();
822+
goto out;
823+
}
792824

793825
offset = 0;
794826
while (offset < data->d_size &&
795827
(offset = gelf_getnote(data, offset, &nhdr, &name_offset,
796828
&desc_offset))) {
797829
const char *name;
798-
struct drgn_error *err;
799830

800831
name = (char *)data->d_buf + name_offset;
801832
if (strncmp(name, "CORE", nhdr.n_namesz) != 0 ||
@@ -806,26 +837,56 @@ static struct drgn_error *drgn_program_cache_prstatus(struct drgn_program *prog)
806837
(char *)data->d_buf + desc_offset,
807838
nhdr.n_descsz);
808839
if (err)
809-
return err;
840+
goto out;
810841
}
811842
}
843+
844+
err = NULL;
845+
out:
846+
if (err) {
847+
if (prog->flags & DRGN_PROGRAM_IS_LINUX_KERNEL)
848+
drgn_prstatus_vector_deinit(&prog->prstatus_vector);
849+
else
850+
drgn_prstatus_map_deinit(&prog->prstatus_map);
851+
} else {
852+
prog->prstatus_cached = true;
853+
}
854+
return err;
855+
}
856+
857+
struct drgn_error *drgn_program_find_prstatus_by_cpu(struct drgn_program *prog,
858+
uint32_t cpu,
859+
struct string *ret)
860+
{
861+
struct drgn_error *err;
862+
863+
assert(prog->flags & DRGN_PROGRAM_IS_LINUX_KERNEL);
864+
err = drgn_program_cache_prstatus(prog);
865+
if (err)
866+
return err;
867+
868+
if (cpu < prog->prstatus_vector.size) {
869+
*ret = prog->prstatus_vector.data[cpu];
870+
} else {
871+
ret->str = NULL;
872+
ret->len = 0;
873+
}
812874
return NULL;
813875
}
814876

815-
struct drgn_error *drgn_program_find_prstatus(struct drgn_program *prog,
816-
uint32_t tid, struct string *ret)
877+
struct drgn_error *drgn_program_find_prstatus_by_tid(struct drgn_program *prog,
878+
uint32_t tid,
879+
struct string *ret)
817880
{
818881
struct drgn_error *err;
819882
struct drgn_prstatus_map_iterator it;
820883

821-
if (!prog->prstatus_cached) {
822-
err = drgn_program_cache_prstatus(prog);
823-
if (err)
824-
return err;
825-
prog->prstatus_cached = true;
826-
}
884+
assert(!(prog->flags & DRGN_PROGRAM_IS_LINUX_KERNEL));
885+
err = drgn_program_cache_prstatus(prog);
886+
if (err)
887+
return err;
827888

828-
it = drgn_prstatus_map_search(&prog->prstatus_cache, &tid);
889+
it = drgn_prstatus_map_search(&prog->prstatus_map, &tid);
829890
if (!it.entry) {
830891
ret->str = NULL;
831892
ret->len = 0;

libdrgn/program.h

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "object_index.h"
2323
#include "platform.h"
2424
#include "type_index.h"
25+
#include "vector.h"
2526

2627
/**
2728
* @ingroup Internals
@@ -53,6 +54,7 @@ struct vmcoreinfo {
5354
bool pgtable_l5_enabled;
5455
};
5556

57+
DEFINE_VECTOR_TYPE(drgn_prstatus_vector, struct string)
5658
DEFINE_HASH_MAP_TYPE(drgn_prstatus_map, uint32_t, struct string)
5759

5860
struct drgn_dwarf_info_cache;
@@ -92,7 +94,16 @@ struct drgn_program {
9294
*/
9395
pid_t pid;
9496
struct drgn_dwarf_info_cache *_dicache;
95-
struct drgn_prstatus_map prstatus_cache;
97+
union {
98+
/*
99+
* For the Linux kernel, PRSTATUS notes indexed by CPU. See @ref
100+
* drgn_architecture_info::linux_kernel_set_initial_registers
101+
* for why we don't use the PID map.
102+
*/
103+
struct drgn_prstatus_vector prstatus_vector;
104+
/* For userspace programs, PRSTATUS notes indexed by PID. */
105+
struct drgn_prstatus_map prstatus_map;
106+
};
96107
/* See @ref drgn_object_stack_trace(). */
97108
struct drgn_error *stack_trace_err;
98109
/* See @ref drgn_object_stack_trace_next_thread(). */
@@ -172,16 +183,29 @@ static inline bool drgn_program_is_64_bit(struct drgn_program *prog)
172183

173184
struct drgn_error *drgn_program_get_dwfl(struct drgn_program *prog, Dwfl **ret);
174185

186+
/**
187+
* Find the @c NT_PRSTATUS note for the given CPU.
188+
*
189+
* This is only valid for the Linux kernel.
190+
*
191+
* @param[out] ret Returned note data. If not found, <tt>ret->str</tt> is set to
192+
* @c NULL and <tt>ret->len</tt> is set to zero.
193+
*/
194+
struct drgn_error *drgn_program_find_prstatus_by_cpu(struct drgn_program *prog,
195+
uint32_t cpu,
196+
struct string *ret);
197+
175198
/**
176199
* Find the @c NT_PRSTATUS note for the given thread ID.
177200
*
178-
* This assumes that <tt>prog->core</tt> is not @c NULL.
201+
* This is only valid for userspace programs.
179202
*
180203
* @param[out] ret Returned note data. If not found, <tt>ret->str</tt> is set to
181204
* @c NULL and <tt>ret->len</tt> is set to zero.
182205
*/
183-
struct drgn_error *drgn_program_find_prstatus(struct drgn_program *prog,
184-
uint32_t tid, struct string *ret);
206+
struct drgn_error *drgn_program_find_prstatus_by_tid(struct drgn_program *prog,
207+
uint32_t tid,
208+
struct string *ret);
185209

186210
/**
187211
* Cache the @c NT_PRSTATUS note provided by @p data in @p prog.

0 commit comments

Comments
 (0)