Skip to content

Commit a9d4b2d

Browse files
Bryan C. Millsgopherbot
Bryan C. Mills
authored andcommitted
net: avoid internal hooks in TestDialTimeout
TestDialTimeout has historically been very flaky (#11872, #13144, #22896, and now #56876), apparently in part due to implementation details of the socktest package it relies on. In reviewing CL 467335, I noticed that TestDialTimeout is the last remaining use of testHookDialChannel (added for #5349), and that that hook no longer has any effect for Unix and Windows. As an experiment, I tried removing both that hook and the call to time.Sleep in the socktest filter, and to my surprise the test continued to pass. That greatly undermined my confidence in the test, since it appears that the “timeout” behavior it observes is caused by the socktest filter injecting an error rather than anything in the net package proper actually timing out. To restore confidence in the test, I think it should be written against only the public API of the net package, and should test the publicly-documented behaviors. This change implements that approach. Notably, when a timeout is set on a Dial call, that does not guarantee that the listener will actually call Accept on the connection before the timeout occurs: the kernel's network stack may preemptively accept and buffer the connection on behalf of the listener. To avoid test flakiness, the test must tolerate (and leave open) those spurious connections: when the kernel has accepted enough of them, it will start to block new connections until the buffered connections have been accepted, and the expected timeout behavior will occur. This also allows the test to run much more quickly and in parallel: since we are relying on real timeouts instead of injected calls to time.Sleep, we can set the timeouts to be much shorter and run concurrently with other public-API tests without introducing races. Fixes #56876. Change-Id: I90dcb2ed70976e70857ca29c253ed760cb078a4c Reviewed-on: https://go-review.googlesource.com/c/go/+/524055 TryBot-Bypass: Bryan Mills <[email protected]> Reviewed-by: Damien Neil <[email protected]> Auto-Submit: Bryan Mills <[email protected]> Run-TryBot: Bryan Mills <[email protected]>
1 parent 52a5881 commit a9d4b2d

File tree

5 files changed

+97
-60
lines changed

5 files changed

+97
-60
lines changed

src/net/hook_plan9.go

Lines changed: 0 additions & 9 deletions
This file was deleted.

src/net/hook_unix.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ package net
99
import "syscall"
1010

1111
var (
12-
testHookDialChannel = func() {} // for golang.org/issue/5349
1312
testHookCanceledDial = func() {} // for golang.org/issue/16523
1413

1514
// Placeholders for socket system calls.

src/net/hook_windows.go

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,9 @@ package net
77
import (
88
"internal/syscall/windows"
99
"syscall"
10-
"time"
1110
)
1211

1312
var (
14-
testHookDialChannel = func() { time.Sleep(time.Millisecond) } // see golang.org/issue/5349
15-
1613
// Placeholders for socket system calls.
1714
wsaSocketFunc func(int32, int32, int32, *syscall.WSAProtocolInfo, uint32, uint32) (syscall.Handle, error) = windows.WSASocket
1815
connectFunc func(syscall.Handle, syscall.Sockaddr) error = syscall.Connect

src/net/ipsock_plan9.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,6 @@ func dialPlan9(ctx context.Context, net string, laddr, raddr Addr) (fd *netFD, e
181181
}
182182
resc := make(chan res)
183183
go func() {
184-
testHookDialChannel()
185184
fd, err := dialPlan9Blocking(ctx, net, laddr, raddr)
186185
select {
187186
case resc <- res{fd, err}:

src/net/timeout_test.go

Lines changed: 97 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@ import (
1111
"fmt"
1212
"internal/testenv"
1313
"io"
14-
"net/internal/socktest"
1514
"os"
1615
"runtime"
16+
"strings"
1717
"sync"
1818
"testing"
1919
"time"
@@ -22,62 +22,113 @@ import (
2222
var dialTimeoutTests = []struct {
2323
timeout time.Duration
2424
delta time.Duration // for deadline
25-
26-
guard time.Duration
2725
}{
2826
// Tests that dial timeouts, deadlines in the past work.
29-
{-5 * time.Second, 0, -5 * time.Second},
30-
{0, -5 * time.Second, -5 * time.Second},
31-
{-5 * time.Second, 5 * time.Second, -5 * time.Second}, // timeout over deadline
32-
{-1 << 63, 0, time.Second},
33-
{0, -1 << 63, time.Second},
34-
35-
{50 * time.Millisecond, 0, 100 * time.Millisecond},
36-
{0, 50 * time.Millisecond, 100 * time.Millisecond},
37-
{50 * time.Millisecond, 5 * time.Second, 100 * time.Millisecond}, // timeout over deadline
27+
{-5 * time.Second, 0},
28+
{0, -5 * time.Second},
29+
{-5 * time.Second, 5 * time.Second}, // timeout over deadline
30+
{-1 << 63, 0},
31+
{0, -1 << 63},
32+
33+
{1 * time.Millisecond, 0},
34+
{0, 1 * time.Millisecond},
35+
{1 * time.Millisecond, 5 * time.Second}, // timeout over deadline
3836
}
3937

4038
func TestDialTimeout(t *testing.T) {
41-
// Cannot use t.Parallel - modifies global hooks.
42-
origTestHookDialChannel := testHookDialChannel
43-
defer func() { testHookDialChannel = origTestHookDialChannel }()
44-
defer sw.Set(socktest.FilterConnect, nil)
45-
46-
for i, tt := range dialTimeoutTests {
47-
switch runtime.GOOS {
48-
case "plan9", "windows":
49-
testHookDialChannel = func() { time.Sleep(tt.guard) }
50-
if runtime.GOOS == "plan9" {
51-
break
52-
}
53-
fallthrough
54-
default:
55-
sw.Set(socktest.FilterConnect, func(so *socktest.Status) (socktest.AfterFilter, error) {
56-
time.Sleep(tt.guard)
57-
return nil, errTimedout
58-
})
59-
}
39+
switch runtime.GOOS {
40+
case "plan9":
41+
t.Skipf("not supported on %s", runtime.GOOS)
42+
}
43+
44+
t.Parallel()
6045

61-
d := Dialer{Timeout: tt.timeout}
62-
if tt.delta != 0 {
63-
d.Deadline = time.Now().Add(tt.delta)
46+
ln := newLocalListener(t, "tcp")
47+
defer func() {
48+
if err := ln.Close(); err != nil {
49+
t.Error(err)
6450
}
51+
}()
6552

66-
// This dial never starts to send any TCP SYN
67-
// segment because of above socket filter and
68-
// test hook.
69-
c, err := d.Dial("tcp", "127.0.0.1:0")
70-
if err == nil {
71-
err = fmt.Errorf("unexpectedly established: tcp:%s->%s", c.LocalAddr(), c.RemoteAddr())
53+
// We expect the kernel to spuriously accept some number of connections on
54+
// behalf of the listener, even when it hasn't called Accept yet.
55+
var bufferedConns []Conn
56+
t.Cleanup(func() {
57+
t.Logf("ignored %d spurious connections", len(bufferedConns))
58+
for _, c := range bufferedConns {
7259
c.Close()
7360
}
61+
})
7462

75-
if perr := parseDialError(err); perr != nil {
76-
t.Errorf("#%d: %v", i, perr)
77-
}
78-
if nerr, ok := err.(Error); !ok || !nerr.Timeout() {
79-
t.Fatalf("#%d: %v", i, err)
80-
}
63+
for _, tt := range dialTimeoutTests {
64+
t.Run(fmt.Sprintf("%v/%v", tt.timeout, tt.delta), func(t *testing.T) {
65+
// We don't run these subtests in parallel because (at least on Linux)
66+
// that empirically causes many of the Dial calls to fail with
67+
// ECONNREFUSED instead of a timeout error.
68+
d := Dialer{Timeout: tt.timeout}
69+
if tt.delta != 0 {
70+
d.Deadline = time.Now().Add(tt.delta)
71+
}
72+
73+
var (
74+
beforeDial time.Time
75+
afterDial time.Time
76+
err error
77+
)
78+
for err == nil {
79+
beforeDial = time.Now()
80+
var c Conn
81+
c, err = d.Dial(ln.Addr().Network(), ln.Addr().String())
82+
afterDial = time.Now()
83+
if err == nil {
84+
// The connection was accepted before the timeout took effect; leave
85+
// the connection open and try again. Eventually we will have so many
86+
// open connections that the kernel stops buffering new ones, in which
87+
// case the Dial calls should start to time out and return errors.
88+
bufferedConns = append(bufferedConns, c)
89+
}
90+
}
91+
92+
if strings.Contains(err.Error(), "connection reset by peer") && (testenv.Builder() == "" || runtime.GOOS == "freebsd") {
93+
// After we set up the connection on Unix, we make a call to
94+
// getsockopt to retrieve its status. Empirically, on some platforms
95+
// (notably FreeBSD 13), we may see ECONNRESET from that call instead
96+
// of a timeout when the listener's accept queue is full.
97+
//
98+
// We don't retry ECONNRESET errors in the saturation loop above,
99+
// because there is no upper bound on how often they will occur.
100+
// Empirically, with a 1ms timeout a single run of the test could
101+
// provoke upward of 100k ECONNRESETS, running for over 15s before
102+
// it finally trigged a timeout.
103+
//
104+
// We record this as a skipped subtest rather than a passing test so
105+
// that we can (potentially, one day) analyze it as such: this test
106+
// didn't fail, but it also didn't successfully provoke the intended
107+
// timeout behavior.
108+
//
109+
// We don't allow this on Go builders other than the freebsd builder
110+
// because we're not aware of any other platforms with this behavior,
111+
// and if the test suddenly starts skipping on other platforms we want
112+
// to know about it so that we can fix either the test or our Dial
113+
// implementation.
114+
t.Logf("Dial: %v", err)
115+
t.Skipf("skipping due to ECONNRESET with full accept queue")
116+
}
117+
118+
if d.Deadline.IsZero() || afterDial.Before(d.Deadline) {
119+
delay := afterDial.Sub(beforeDial)
120+
if delay < tt.timeout {
121+
t.Errorf("Dial returned after %v; want ≥%v", delay, tt.timeout)
122+
}
123+
}
124+
125+
if perr := parseDialError(err); perr != nil {
126+
t.Errorf("unexpected error from Dial: %v", perr)
127+
}
128+
if nerr, ok := err.(Error); !ok || !nerr.Timeout() {
129+
t.Errorf("Dial: %v, want timeout", err)
130+
}
131+
})
81132
}
82133
}
83134

0 commit comments

Comments
 (0)