Skip to content

Commit 1da1030

Browse files
committed
runtime: fix false deadlock crash
Fixes #6070. Update #6055. R=golang-dev, nightlyone, rsc CC=golang-dev https://golang.org/cl/12602043
1 parent 7eb6a6f commit 1da1030

File tree

2 files changed

+102
-15
lines changed

2 files changed

+102
-15
lines changed

src/pkg/net/tcp_test.go

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,3 +494,78 @@ func TestTCPReadWriteMallocs(t *testing.T) {
494494
t.Fatalf("Got %v allocs, want %v", mallocs, maxMallocs)
495495
}
496496
}
497+
498+
func TestTCPStress(t *testing.T) {
499+
const conns = 2
500+
const msgs = 1e4
501+
const msgLen = 512
502+
503+
sendMsg := func(c Conn, buf []byte) bool {
504+
n, err := c.Write(buf)
505+
if n != len(buf) || err != nil {
506+
t.Logf("Write failed: %v", err)
507+
return false
508+
}
509+
return true
510+
}
511+
recvMsg := func(c Conn, buf []byte) bool {
512+
for read := 0; read != len(buf); {
513+
n, err := c.Read(buf)
514+
read += n
515+
if err != nil {
516+
t.Logf("Read failed: %v", err)
517+
return false
518+
}
519+
}
520+
return true
521+
}
522+
523+
ln, err := Listen("tcp", "127.0.0.1:0")
524+
if err != nil {
525+
t.Fatalf("Listen failed: %v", err)
526+
}
527+
defer ln.Close()
528+
// Acceptor.
529+
go func() {
530+
for {
531+
c, err := ln.Accept()
532+
if err != nil {
533+
break
534+
}
535+
// Server connection.
536+
go func(c Conn) {
537+
defer c.Close()
538+
var buf [msgLen]byte
539+
for m := 0; m < msgs; m++ {
540+
if !recvMsg(c, buf[:]) || !sendMsg(c, buf[:]) {
541+
break
542+
}
543+
}
544+
}(c)
545+
}
546+
}()
547+
done := make(chan bool)
548+
for i := 0; i < conns; i++ {
549+
// Client connection.
550+
go func() {
551+
defer func() {
552+
done <- true
553+
}()
554+
c, err := Dial("tcp", ln.Addr().String())
555+
if err != nil {
556+
t.Logf("Dial failed: %v", err)
557+
return
558+
}
559+
defer c.Close()
560+
var buf [msgLen]byte
561+
for m := 0; m < msgs; m++ {
562+
if !sendMsg(c, buf[:]) || !recvMsg(c, buf[:]) {
563+
break
564+
}
565+
}
566+
}()
567+
}
568+
for i := 0; i < conns; i++ {
569+
<-done
570+
}
571+
}

src/pkg/runtime/proc.c

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ struct Sched {
3030

3131
M* midle; // idle m's waiting for work
3232
int32 nmidle; // number of idle m's waiting for work
33-
int32 mlocked; // number of locked m's waiting for work
33+
int32 nmidlelocked; // number of locked m's waiting for work
3434
int32 mcount; // number of m's that have been created
3535

3636
P* pidle; // idle P's
@@ -95,7 +95,7 @@ static void stoplockedm(void);
9595
static void startlockedm(G*);
9696
static void sysmon(void);
9797
static uint32 retake(int64);
98-
static void inclocked(int32);
98+
static void incidlelocked(int32);
9999
static void checkdead(void);
100100
static void exitsyscall0(G*);
101101
static void park0(G*);
@@ -1019,7 +1019,7 @@ stoplockedm(void)
10191019
p = releasep();
10201020
handoffp(p);
10211021
}
1022-
inclocked(1);
1022+
incidlelocked(1);
10231023
// Wait until another thread schedules lockedg again.
10241024
runtime·notesleep(&m->park);
10251025
runtime·noteclear(&m->park);
@@ -1042,7 +1042,7 @@ startlockedm(G *gp)
10421042
if(mp->nextp)
10431043
runtime·throw("startlockedm: m has p");
10441044
// directly handoff current P to the locked m
1045-
inclocked(-1);
1045+
incidlelocked(-1);
10461046
p = releasep();
10471047
mp->nextp = p;
10481048
runtime·notewakeup(&mp->park);
@@ -1485,7 +1485,7 @@ void
14851485
p = releasep();
14861486
handoffp(p);
14871487
if(g->isbackground) // do not consider blocked scavenger for deadlock detection
1488-
inclocked(1);
1488+
incidlelocked(1);
14891489

14901490
// Resave for traceback during blocked call.
14911491
save(runtime·getcallerpc(&dummy), runtime·getcallersp(&dummy));
@@ -1505,7 +1505,7 @@ runtime·exitsyscall(void)
15051505
m->locks++; // see comment in entersyscall
15061506

15071507
if(g->isbackground) // do not consider blocked scavenger for deadlock detection
1508-
inclocked(-1);
1508+
incidlelocked(-1);
15091509

15101510
if(exitsyscallfast()) {
15111511
// There's a cpu for us, so we can run.
@@ -2159,10 +2159,10 @@ releasep(void)
21592159
}
21602160

21612161
static void
2162-
inclocked(int32 v)
2162+
incidlelocked(int32 v)
21632163
{
21642164
runtime·lock(&runtime·sched);
2165-
runtime·sched.mlocked += v;
2165+
runtime·sched.nmidlelocked += v;
21662166
if(v > 0)
21672167
checkdead();
21682168
runtime·unlock(&runtime·sched);
@@ -2177,12 +2177,12 @@ checkdead(void)
21772177
int32 run, grunning, s;
21782178

21792179
// -1 for sysmon
2180-
run = runtime·sched.mcount - runtime·sched.nmidle - runtime·sched.mlocked - 1;
2180+
run = runtime·sched.mcount - runtime·sched.nmidle - runtime·sched.nmidlelocked - 1;
21812181
if(run > 0)
21822182
return;
21832183
if(run < 0) {
2184-
runtime·printf("checkdead: nmidle=%d mlocked=%d mcount=%d\n",
2185-
runtime·sched.nmidle, runtime·sched.mlocked, runtime·sched.mcount);
2184+
runtime·printf("checkdead: nmidle=%d nmidlelocked=%d mcount=%d\n",
2185+
runtime·sched.nmidle, runtime·sched.nmidlelocked, runtime·sched.mcount);
21862186
runtime·throw("checkdead: inconsistent counts");
21872187
}
21882188
grunning = 0;
@@ -2238,7 +2238,18 @@ sysmon(void)
22382238
if(lastpoll != 0 && lastpoll + 10*1000*1000 > now) {
22392239
runtime·cas64(&runtime·sched.lastpoll, lastpoll, now);
22402240
gp = runtime·netpoll(false); // non-blocking
2241-
injectglist(gp);
2241+
if(gp) {
2242+
// Need to decrement number of idle locked M's
2243+
// (pretending that one more is running) before injectglist.
2244+
// Otherwise it can lead to the following situation:
2245+
// injectglist grabs all P's but before it starts M's to run the P's,
2246+
// another M returns from syscall, finishes running its G,
2247+
// observes that there is no work to do and no other running M's
2248+
// and reports deadlock.
2249+
incidlelocked(-1);
2250+
injectglist(gp);
2251+
incidlelocked(1);
2252+
}
22422253
}
22432254
// retake P's blocked in syscalls
22442255
// and preempt long running G's
@@ -2284,15 +2295,16 @@ retake(int64 now)
22842295
if(p->runqhead == p->runqtail &&
22852296
runtime·atomicload(&runtime·sched.nmspinning) + runtime·atomicload(&runtime·sched.npidle) > 0)
22862297
continue;
2287-
// Need to increment number of locked M's before the CAS.
2298+
// Need to decrement number of idle locked M's
2299+
// (pretending that one more is running) before the CAS.
22882300
// Otherwise the M from which we retake can exit the syscall,
22892301
// increment nmidle and report deadlock.
2290-
inclocked(-1);
2302+
incidlelocked(-1);
22912303
if(runtime·cas(&p->status, s, Pidle)) {
22922304
n++;
22932305
handoffp(p);
22942306
}
2295-
inclocked(1);
2307+
incidlelocked(1);
22962308
} else if(s == Prunning) {
22972309
// Preempt G if it's running for more than 10ms.
22982310
if(pd->when + 10*1000*1000 > now)

0 commit comments

Comments
 (0)