Skip to content

Commit a27f3d5

Browse files
committed
runtime: fix hangs in TestDebugCall*
This fixes a few different issues that led to hangs and general flakiness in the TestDebugCall* tests. 1. This fixes missing wake-ups in two error paths of the SIGTRAP signal handler. If the goroutine was in an unknown state, or if there was an unknown debug call status, we currently don't wake the injection coordinator. These are terminal states, so this resulted in a hang. 2. This adds a retry if the target goroutine is in a transient state that prevents us from injecting a call. The most common failure mode here is that the target goroutine is in _Grunnable, but this was previously masked because it deadlocked the test. 3. Related to 2, this switches the "ready" signal from the target goroutine from a blocking channel send to a non-blocking channel send. This makes it much less likely that we'll catch this goroutine while it's in the runtime performing that send. 4. This increases GOMAXPROCS from 2 to 8 during these tests. With the current setting of 2, we can have at most the non-preemptible goroutine we're injecting a call in to and the goroutine that's trying to make it exit. If anything else comes along, it can deadlock. One particular case I observed was in TestDebugCallGC, where runtime.GC() returns before the forEachP that prepares sweeping on all goroutines has finished. When this happens, the forEachP blocks on the non-preemptible loop, which means we now have at least three goroutines that need to run. Fixes #25519. Updates #29124. Change-Id: I7bc41dc0b865b7d0bb379cb654f9a1218bc37428 Reviewed-on: https://go-review.googlesource.com/c/154112 Run-TryBot: Austin Clements <[email protected]> Reviewed-by: Michael Knyszek <[email protected]>
1 parent e167587 commit a27f3d5

File tree

2 files changed

+48
-18
lines changed

2 files changed

+48
-18
lines changed

src/runtime/debug_test.go

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,17 @@ func startDebugCallWorker(t *testing.T) (g *runtime.G, after func()) {
3333
skipUnderDebugger(t)
3434

3535
// This can deadlock if there aren't enough threads or if a GC
36-
// tries to interrupt an atomic loop (see issue #10958).
37-
ogomaxprocs := runtime.GOMAXPROCS(2)
36+
// tries to interrupt an atomic loop (see issue #10958). We
37+
// use 8 Ps so there's room for the debug call worker,
38+
// something that's trying to preempt the call worker, and the
39+
// goroutine that's trying to stop the call worker.
40+
ogomaxprocs := runtime.GOMAXPROCS(8)
3841
ogcpercent := debug.SetGCPercent(-1)
3942

40-
ready := make(chan *runtime.G)
43+
// ready is a buffered channel so debugCallWorker won't block
44+
// on sending to it. This makes it less likely we'll catch
45+
// debugCallWorker while it's in the runtime.
46+
ready := make(chan *runtime.G, 1)
4147
var stop uint32
4248
done := make(chan error)
4349
go debugCallWorker(ready, &stop, done)
@@ -67,6 +73,10 @@ func debugCallWorker(ready chan<- *runtime.G, stop *uint32, done chan<- error) {
6773
close(done)
6874
}
6975

76+
// Don't inline this function, since we want to test adjusting
77+
// pointers in the arguments.
78+
//
79+
//go:noinline
7080
func debugCallWorker2(stop *uint32, x *int) {
7181
for atomic.LoadUint32(stop) == 0 {
7282
// Strongly encourage x to live in a register so we
@@ -193,7 +203,7 @@ func TestDebugCallUnsafePoint(t *testing.T) {
193203

194204
// This can deadlock if there aren't enough threads or if a GC
195205
// tries to interrupt an atomic loop (see issue #10958).
196-
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2))
206+
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(8))
197207
defer debug.SetGCPercent(debug.SetGCPercent(-1))
198208

199209
// Test that the runtime refuses call injection at unsafe points.
@@ -215,7 +225,7 @@ func TestDebugCallPanic(t *testing.T) {
215225
skipUnderDebugger(t)
216226

217227
// This can deadlock if there aren't enough threads.
218-
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2))
228+
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(8))
219229

220230
ready := make(chan *runtime.G)
221231
var stop uint32

src/runtime/export_debug_test.go

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -50,19 +50,31 @@ func InjectDebugCall(gp *g, fn, args interface{}, tkill func(tid int) error) (in
5050
h.gp = gp
5151
h.fv, h.argp, h.argSize = fv, argp, argSize
5252
h.handleF = h.handle // Avoid allocating closure during signal
53-
noteclear(&h.done)
5453

5554
defer func() { testSigtrap = nil }()
56-
testSigtrap = h.inject
57-
if err := tkill(tid); err != nil {
58-
return nil, err
59-
}
60-
// Wait for completion.
61-
notetsleepg(&h.done, -1)
62-
if len(h.err) != 0 {
63-
return nil, h.err
55+
for i := 0; ; i++ {
56+
testSigtrap = h.inject
57+
noteclear(&h.done)
58+
h.err = ""
59+
60+
if err := tkill(tid); err != nil {
61+
return nil, err
62+
}
63+
// Wait for completion.
64+
notetsleepg(&h.done, -1)
65+
if h.err != "" {
66+
switch h.err {
67+
case "retry _Grunnable", "executing on Go runtime stack":
68+
// These are transient states. Try to get out of them.
69+
if i < 100 {
70+
Gosched()
71+
continue
72+
}
73+
}
74+
return nil, h.err
75+
}
76+
return h.panic, nil
6477
}
65-
return h.panic, nil
6678
}
6779

6880
type debugCallHandler struct {
@@ -99,12 +111,18 @@ func (h *debugCallHandler) inject(info *siginfo, ctxt *sigctxt, gp2 *g) bool {
99111
h.savedRegs.fpstate = nil
100112
// Set PC to debugCallV1.
101113
ctxt.set_rip(uint64(funcPC(debugCallV1)))
114+
// Call injected. Switch to the debugCall protocol.
115+
testSigtrap = h.handleF
116+
case _Grunnable:
117+
// Ask InjectDebugCall to pause for a bit and then try
118+
// again to interrupt this goroutine.
119+
h.err = plainError("retry _Grunnable")
120+
notewakeup(&h.done)
102121
default:
103122
h.err = plainError("goroutine in unexpected state at call inject")
104-
return true
123+
notewakeup(&h.done)
105124
}
106-
// Switch to the debugCall protocol and resume execution.
107-
testSigtrap = h.handleF
125+
// Resume execution.
108126
return true
109127
}
110128

@@ -149,6 +167,7 @@ func (h *debugCallHandler) handle(info *siginfo, ctxt *sigctxt, gp2 *g) bool {
149167
sp := ctxt.rsp()
150168
reason := *(*string)(unsafe.Pointer(uintptr(sp)))
151169
h.err = plainError(reason)
170+
// Don't wake h.done. We need to transition to status 16 first.
152171
case 16:
153172
// Restore all registers except RIP and RSP.
154173
rip, rsp := ctxt.rip(), ctxt.rsp()
@@ -162,6 +181,7 @@ func (h *debugCallHandler) handle(info *siginfo, ctxt *sigctxt, gp2 *g) bool {
162181
notewakeup(&h.done)
163182
default:
164183
h.err = plainError("unexpected debugCallV1 status")
184+
notewakeup(&h.done)
165185
}
166186
// Resume execution.
167187
return true

0 commit comments

Comments
 (0)