Skip to content

net/http: a "bad" https connection which uses Header.Set could potentially block all https requests more than 15 mins in production environments #33006

Closed
@kerberosYao

Description

@kerberosYao

What version of Go are you using (go version)?

$ go version go1.12.6 linux/amd64

Does this issue reproduce with the latest release?

Yes

What operating system and processor architecture are you using (go env)?

go env Output
$ go env
GOARCH="amd64"
GOBIN=""
GOCACHE="/home/zhang/.cache/go-build"
GOEXE=""
GOFLAGS=""
GOHOSTARCH="amd64"
GOHOSTOS="linux"
GOOS="linux"
GOPATH="/home/zhang/.gvm/pkgsets/go1.12.6/global"
GOPROXY=""
GORACE=""
GOROOT="/home/zhang/.gvm/gos/go1.12.6"
GOTMPDIR=""
GOTOOLDIR="/home/zhang/.gvm/gos/go1.12.6/pkg/tool/linux_amd64"
GCCGO="gccgo"
CC="gcc"
CXX="g++"
CGO_ENABLED="1"
GOMOD=""
CGO_CFLAGS="-g -O2"
CGO_CPPFLAGS=""
CGO_CXXFLAGS="-g -O2"
CGO_FFLAGS="-g -O2"
CGO_LDFLAGS="-g -O2"
PKG_CONFIG="pkg-config"
GOGCCFLAGS="-fPIC -m64 -pthread -fmessage-length=0 -fdebug-prefix-map=/tmp/go-build353201341=/tmp/go-build -gno-record-gcc-switches"

What did you do?

step1: Run the demo below to simulate the situation described in the title
step2: after step1 is successful , disconnect 10.210.180.138 server's power or network

package main

import (
	"flag"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	_ "net/http/pprof"
	"strings"
	"sync/atomic"
	"time"
)

var url = flag.String("url", "https://10.210.180.138:9908", "url")

func main() {

	flag.Parse()

	go func() {
		log.Println(http.ListenAndServe("localhost:6060", nil))
	}()

	var reqA, doneA int64
	var reqB, doneB int64
	for {
		for i := 0; i < 10; i++ {
			go func() {
				atomic.AddInt64(&reqA, 1)
				resp, err := http.Get("https://www.qiniu.com")
				if err == nil {
					io.Copy(ioutil.Discard, resp.Body)
					resp.Body.Close()
				}
				atomic.AddInt64(&doneA, 1)
			}()
		}
		for i := 0; i < 10; i++ {
			go func() {
				atomic.AddInt64(&reqB, 1)
				req, _ := http.NewRequest("GET", *url, nil)
				req.Header.Set("X-Qiniu", strings.Repeat("helloworld", 1024))
				resp, err := http.DefaultClient.Do(req)
				if err == nil {
					io.Copy(ioutil.Discard, resp.Body)
					resp.Body.Close()
				} else {
					log.Println(err)
				}
				atomic.AddInt64(&doneB, 1)
			}()
		}

		log.Printf("reqA:%d doneA:%d, reqB:%d doneB:%d\n", atomic.LoadInt64(&reqA), atomic.LoadInt64(&doneA), atomic.LoadInt64(&reqB), atomic.LoadInt64(&doneB))

		time.Sleep(1e9)
	}
}

What did you expect to see?

doneA continue increase, doneB stop increase

What did you see instead?

as the output shown, from "2019/07/09 19:19:20", both doneA and doneB stop increase, all requests blocked:

go run http.go 
2019/07/09 19:19:14 reqA:0 doneA:0, reqB:0 doneB:0
2019/07/09 19:19:15 reqA:10 doneA:10, reqB:10 doneB:10
2019/07/09 19:19:16 reqA:20 doneA:20, reqB:20 doneB:20
2019/07/09 19:19:17 reqA:30 doneA:30, reqB:30 doneB:30
2019/07/09 19:19:18 reqA:40 doneA:40, reqB:40 doneB:40
2019/07/09 19:19:19 reqA:50 doneA:50, reqB:50 doneB:50
2019/07/09 19:19:20 reqA:60 doneA:60, reqB:60 doneB:50
2019/07/09 19:19:21 reqA:70 doneA:70, reqB:70 doneB:50
2019/07/09 19:19:22 reqA:80 doneA:80, reqB:80 doneB:50
2019/07/09 19:19:23 reqA:90 doneA:90, reqB:90 doneB:50
2019/07/09 19:19:24 reqA:100 doneA:100, reqB:100 doneB:50
2019/07/09 19:19:25 reqA:110 doneA:100, reqB:110 doneB:50
2019/07/09 19:19:26 reqA:120 doneA:100, reqB:120 doneB:50
2019/07/09 19:19:27 reqA:130 doneA:100, reqB:130 doneB:50
2019/07/09 19:19:28 reqA:140 doneA:100, reqB:140 doneB:50
2019/07/09 19:19:29 reqA:150 doneA:100, reqB:150 doneB:50
2019/07/09 19:19:30 reqA:160 doneA:100, reqB:160 doneB:50
2019/07/09 19:19:31 reqA:170 doneA:100, reqB:170 doneB:50
2019/07/09 19:19:32 reqA:180 doneA:100, reqB:180 doneB:50
2019/07/09 19:19:33 reqA:190 doneA:100, reqB:190 doneB:50
2019/07/09 19:19:34 reqA:200 doneA:100, reqB:200 doneB:50
2019/07/09 19:19:35 reqA:210 doneA:100, reqB:210 doneB:50
2019/07/09 19:19:36 reqA:223 doneA:100, reqB:220 doneB:50
2019/07/09 19:19:37 reqA:230 doneA:100, reqB:230 doneB:50

goroutine stack is shown below:

goroutine 743 [IO wait]:
internal/poll.runtime_pollWait(0x7fc8a5794618, 0x77, 0xffffffffffffffff)
	/home/zhang/.gvm/gos/go1.12.6/src/runtime/netpoll.go:182 +0x56
internal/poll.(*pollDesc).wait(0xc000101898, 0x77, 0x400, 0x491, 0xffffffffffffffff)
	/home/zhang/.gvm/gos/go1.12.6/src/internal/poll/fd_poll_runtime.go:87 +0x9b
internal/poll.(*pollDesc).waitWrite(...)
	/home/zhang/.gvm/gos/go1.12.6/src/internal/poll/fd_poll_runtime.go:96
internal/poll.(*FD).Write(0xc000101880, 0xc0002c6000, 0x1d37, 0x1d37, 0x0, 0x0, 0x0)
	/home/zhang/.gvm/gos/go1.12.6/src/internal/poll/fd_unix.go:276 +0x26e
net.(*netFD).Write(0xc000101880, 0xc0002c6000, 0x1d37, 0x1d37, 0x0, 0x1d32, 0x20202020203e612f)
	/home/zhang/.gvm/gos/go1.12.6/src/net/fd_unix.go:220 +0x4f
net.(*conn).Write(0xc000010020, 0xc0002c6000, 0x1d37, 0x1d37, 0x0, 0x0, 0x0)
	/home/zhang/.gvm/gos/go1.12.6/src/net/net.go:189 +0x69
crypto/tls.(*Conn).write(0xc00003f180, 0xc0002c6000, 0x1d37, 0x1d37, 0xc0005f4000, 0x1d1a, 0x2000)
	/home/zhang/.gvm/gos/go1.12.6/src/crypto/tls/conn.go:896 +0x160
crypto/tls.(*Conn).writeRecordLocked(0xc00003f180, 0x826117, 0xc0005f4000, 0x1d1a, 0x2000, 0xc000189620, 0xc000503340, 0xc000503340)
	/home/zhang/.gvm/gos/go1.12.6/src/crypto/tls/conn.go:945 +0x2a1
crypto/tls.(*Conn).Write(0xc00003f180, 0xc0005f4000, 0x1d1a, 0x2000, 0x0, 0x0, 0x0)
	/home/zhang/.gvm/gos/go1.12.6/src/crypto/tls/conn.go:1114 +0x184
net/http.http2stickyErrWriter.Write(0x7fc8a56b4160, 0xc00003f180, 0xc00013f358, 0xc0005f4000, 0x1d1a, 0x2000, 0x616264656566223d, 0x65746e6f632d6b63, 0x656d616e2022746e)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/h2_bundle.go:6792 +0x68
bufio.(*Writer).Write(0xc000112680, 0xc0005f4000, 0x1d1a, 0x2000, 0x60, 0x8, 0xc0003693d0)
	/home/zhang/.gvm/gos/go1.12.6/src/bufio/bufio.go:622 +0x145
net/http.(*http2Framer).endWrite(0xc0004f62a0, 0xc000078200, 0x0)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/h2_bundle.go:1605 +0xab
net/http.(*http2Framer).WriteHeaders(0xc0004f62a0, 0xc3, 0xc0002f0000, 0x1d11, 0x1d8b, 0x101, 0x0, 0x69d081, 0x7e7ac0)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/h2_bundle.go:2339 +0x29d
net/http.(*http2ClientConn).writeHeaders(0xc00013f200, 0x1000000c3, 0x100000, 0xc0002f1d11, 0x0, 0x7a, 0x0, 0x0)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/h2_bundle.go:7613 +0x18f
net/http.(*http2ClientConn).roundTrip(0xc00013f200, 0xc0000ff800, 0x0, 0x0, 0x0, 0x0)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/h2_bundle.go:7446 +0x4ca
net/http.(*http2Transport).RoundTripOpt(0xc00009e660, 0xc0000ff800, 0xc000192800, 0x6a8f66, 0x0, 0xc0000ac7e0)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/h2_bundle.go:6867 +0x159
net/http.(*http2Transport).RoundTrip(...)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/h2_bundle.go:6829
net/http.http2noDialH2RoundTripper.RoundTrip(0xc00009e660, 0xc0000ff800, 0x818401, 0x5, 0xc0000ac868)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/h2_bundle.go:8959 +0x3f
net/http.(*Transport).roundTrip(0xb2eba0, 0xc0000ff800, 0x10, 0xc000416420, 0xc000192b80)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/transport.go:430 +0xe90
net/http.(*Transport).RoundTrip(0xb2eba0, 0xc0000ff800, 0xb2eba0, 0x0, 0x0)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/roundtrip.go:17 +0x35
net/http.send(0xc0000ff800, 0x88cfe0, 0xb2eba0, 0x0, 0x0, 0x0, 0xc0001302f0, 0xc000192da8, 0x1, 0x0)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/client.go:250 +0x461
net/http.(*Client).send(0xb37180, 0xc0000ff800, 0x0, 0x0, 0x0, 0xc0001302f0, 0x0, 0x1, 0x2800)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/client.go:174 +0xfb
net/http.(*Client).do(0xb37180, 0xc0000ff800, 0x0, 0x0, 0x0)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/client.go:641 +0x279
net/http.(*Client).Do(...)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/client.go:509
main.main.func3(0xc0000982c8, 0xc0000982d0)
	/home/zhang/Workspace/go/src/test_block/http.go:45 +0x1a9
created by main.main
	/home/zhang/Workspace/go/src/test_block/http.go:41 +0x18c

goroutine 788 [IO wait]:
internal/poll.runtime_pollWait(0x7fc8a57943a8, 0x72, 0xffffffffffffffff)
	/home/zhang/.gvm/gos/go1.12.6/src/runtime/netpoll.go:182 +0x56
internal/poll.(*pollDesc).wait(0xc000134b18, 0x72, 0x0, 0x1, 0xffffffffffffffff)
	/home/zhang/.gvm/gos/go1.12.6/src/internal/poll/fd_poll_runtime.go:87 +0x9b
internal/poll.(*pollDesc).waitRead(...)
	/home/zhang/.gvm/gos/go1.12.6/src/internal/poll/fd_poll_runtime.go:92
internal/poll.(*FD).Read(0xc000134b00, 0xc00010f391, 0x1, 0x1, 0x0, 0x0, 0x0)
	/home/zhang/.gvm/gos/go1.12.6/src/internal/poll/fd_unix.go:169 +0x19b
net.(*netFD).Read(0xc000134b00, 0xc00010f391, 0x1, 0x1, 0x24, 0x0, 0x0)
	/home/zhang/.gvm/gos/go1.12.6/src/net/fd_unix.go:202 +0x4f
net.(*conn).Read(0xc0000102d8, 0xc00010f391, 0x1, 0x1, 0x0, 0x0, 0x0)
	/home/zhang/.gvm/gos/go1.12.6/src/net/net.go:177 +0x69
net/http.(*connReader).backgroundRead(0xc00010f380)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/server.go:677 +0x58
created by net/http.(*connReader).startBackgroundRead
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/server.go:673 +0xca

and lots of goroutines are semacquire

goroutine 759 [semacquire]:
sync.runtime_SemacquireMutex(0xc0000879bc, 0xc00064a600)
	/home/zhang/.gvm/gos/go1.12.6/src/runtime/sema.go:71 +0x3d
sync.(*Mutex).Lock(0xc0000879b8)
	/home/zhang/.gvm/gos/go1.12.6/src/sync/mutex.go:134 +0x109
net/http.(*http2clientConnPool).getClientConn(0xc0000879b0, 0xc0003e9100, 0xc00052c720, 0x11, 0xc00009e600, 0xc00064a700, 0x6e0aa0, 0xc00009e660)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/h2_bundle.go:760 +0x66
net/http.http2noDialClientConnPool.GetClientConn(0xc0000879b0, 0xc0003e9100, 0xc00052c720, 0x11, 0xc00052c720, 0x11, 0xc00032d130)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/h2_bundle.go:954 +0x4e
net/http.(*http2Transport).RoundTripOpt(0xc00009e660, 0xc0003e9100, 0xc00064a800, 0x6a8f66, 0x0, 0xc0000ac7e0)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/h2_bundle.go:6861 +0x105
net/http.(*http2Transport).RoundTrip(...)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/h2_bundle.go:6829
net/http.http2noDialH2RoundTripper.RoundTrip(0xc00009e660, 0xc0003e9100, 0x81187b, 0x5, 0xc0000ac868)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/h2_bundle.go:8959 +0x3f
net/http.(*Transport).roundTrip(0xb2eba0, 0xc0003e9100, 0xc00012f0e0, 0xc00012f0f0, 0xc00064ab70)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/transport.go:430 +0xe90
net/http.(*Transport).RoundTrip(0xb2eba0, 0xc0003e9100, 0xb2eba0, 0x0, 0x0)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/roundtrip.go:17 +0x35
net/http.send(0xc0003e9100, 0x88cfe0, 0xb2eba0, 0x0, 0x0, 0x0, 0xc00010c338, 0xc00064ad98, 0x1, 0x0)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/client.go:250 +0x461
net/http.(*Client).send(0xb37180, 0xc0003e9100, 0x0, 0x0, 0x0, 0xc00010c338, 0x0, 0x1, 0xc00006ce00)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/client.go:174 +0xfb
net/http.(*Client).do(0xb37180, 0xc0003e9100, 0x0, 0x0, 0x0)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/client.go:641 +0x279
net/http.(*Client).Do(...)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/client.go:509
net/http.(*Client).Get(0xb37180, 0x81187b, 0x15, 0x0, 0x0, 0x1)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/client.go:398 +0x9e
net/http.Get(...)
	/home/zhang/.gvm/gos/go1.12.6/src/net/http/client.go:370
main.main.func2(0xc0000982b8, 0xc0000982c0)
	/home/zhang/Workspace/go/src/test_block/http.go:32 +0x56
created by main.main
	/home/zhang/Workspace/go/src/test_block/http.go:30 +0x145

this issue has attracted a lot of attention such as 288242355932388, however unfortunately has not been solved until now.

Activity

gzcaimufu

gzcaimufu commented on Jul 10, 2019

@gzcaimufu

@agnivade We have the same problem, please have a look?

kerberosYao

kerberosYao commented on Jul 10, 2019

@kerberosYao
Author

after testing, the problem still exists in go1.12.6, do you have any good ideas? @carter2000

av86743

av86743 commented on Jul 10, 2019

@av86743

You could use this CL as a starting point if you wish to resolve your issue.

agnivade

agnivade commented on Jul 10, 2019

@agnivade
Contributor
added
NeedsInvestigationSomeone must examine and confirm this is a valid issue and not a duplicate of an existing one.
on Jul 10, 2019
fraenkel

fraenkel commented on Jul 10, 2019

@fraenkel
Contributor

This is a dupe of #32388.
With my CL noted above, it hangs exactly where I said it would in the CL. The write lock starts blocking any forward progress.

goroutine 749 [IO wait]:
internal/poll.runtime_pollWait(0x7f7d64421008, 0x77, 0xadf640)
	/snap/go/4098/src/runtime/netpoll.go:182 +0x56
internal/poll.(*pollDesc).wait(0xc000120b18, 0x77, 0x1a00, 0x1a0d, 0xffffffffffffffff)
	/snap/go/4098/src/internal/poll/fd_poll_runtime.go:87 +0xe5
internal/poll.(*pollDesc).waitWrite(...)
	/snap/go/4098/src/internal/poll/fd_poll_runtime.go:96
internal/poll.(*FD).Write(0xc000120b00, 0xc0006ec000, 0x1d37, 0x1d37, 0x0, 0x0, 0x0)
	/snap/go/4098/src/internal/poll/fd_unix.go:276 +0x312
net.(*netFD).Write(0xc000120b00, 0xc0006ec000, 0x1d37, 0x1d37, 0x0, 0xd, 0x10)
	/snap/go/4098/src/net/fd_unix.go:220 +0x66
net.(*conn).Write(0xc000248068, 0xc0006ec000, 0x1d37, 0x1d37, 0x9e81c0, 0x7, 0xae73e0)
	/snap/go/4098/src/net/net.go:189 +0xa2
crypto/tls.(*Conn).write(0xc00018ca80, 0xc0006ec000, 0x1d37, 0x1d37, 0xc0006ce000, 0x1d1a, 0x2000)
	/snap/go/4098/src/crypto/tls/conn.go:896 +0x244
crypto/tls.(*Conn).writeRecordLocked(0xc00018ca80, 0x17, 0xc0006ce000, 0x1d1a, 0x2000, 0xa, 0xa3d5a3, 0x12)
	/snap/go/4098/src/crypto/tls/conn.go:945 +0x56e
crypto/tls.(*Conn).Write(0xc00018ca80, 0xc0006ce000, 0x1d1a, 0x2000, 0x0, 0x0, 0x0)
	/snap/go/4098/src/crypto/tls/conn.go:1114 +0x22d
golang.org/x/net/http2.stickyErrWriter.Write(0x7f7d5c6f9008, 0xc00018ca80, 0xc00018fad8, 0xc0006ce000, 0x1d1a, 0x2000, 0x9438d4, 0xc00018fa88, 0xc00063a220)
	/home/fraenkel/go/src/golang.org/x/net/http2/transport.go:372 +0x86
bufio.(*Writer).Write(0xc000204140, 0xc0006ce000, 0x1d1a, 0x2000, 0x1d11, 0x46de70, 0xc0006be000)
	/snap/go/4098/src/bufio/bufio.go:622 +0x184
golang.org/x/net/http2.(*Framer).endWrite(0xc00066c000, 0x0, 0x2e6)
	/home/fraenkel/go/src/golang.org/x/net/http2/frame.go:366 +0x1cf
golang.org/x/net/http2.(*Framer).WriteHeaders(0xc00066c000, 0xc5, 0xc0006be000, 0x1d11, 0x1d8b, 0x101, 0x0, 0x7ffc7eab54a2, 0x30)
	/home/fraenkel/go/src/golang.org/x/net/http2/frame.go:1097 +0x55d
golang.org/x/net/http2.(*ClientConn).writeHeaders(0xc00018f980, 0x1000000c5, 0x4000, 0xc0006bfd11, 0x0, 0x7a, 0x0, 0x0)
	/home/fraenkel/go/src/golang.org/x/net/http2/transport.go:1198 +0x21f
golang.org/x/net/http2.(*ClientConn).roundTrip(0xc00018f980, 0xc0001b0200, 0x0, 0x0, 0x0, 0x0)
	/home/fraenkel/go/src/golang.org/x/net/http2/transport.go:1032 +0x70f
golang.org/x/net/http2.(*Transport).RoundTripOpt(0xc000094240, 0xc0001b0200, 0xc0002a2500, 0x82824f, 0x0, 0x0)
	/home/fraenkel/go/src/golang.org/x/net/http2/transport.go:447 +0x270
golang.org/x/net/http2.(*Transport).RoundTrip(...)
	/home/fraenkel/go/src/golang.org/x/net/http2/transport.go:408
golang.org/x/net/http2.noDialH2RoundTripper.RoundTrip(0xc000094240, 0xc0001b0200, 0x7ffc7eab549a, 0x5, 0xc00017e1a8)
	/home/fraenkel/go/src/golang.org/x/net/http2/transport.go:2550 +0x4d
net/http.(*Transport).roundTrip(0xc00017e000, 0xc0001b0200, 0x434910, 0xc0002a2980, 0x434910)
	/snap/go/4098/src/net/http/transport.go:430 +0x14da
fraenkel

fraenkel commented on Jul 10, 2019

@fraenkel
Contributor

I will look further as to why the read side isn't breaking the write side which I believe should have happened.

fraenkel

fraenkel commented on Jul 11, 2019

@fraenkel
Contributor

The only solution to break out of this situation is to start setting decent timeouts/deadlines. Using the default transport, you are saying to have infinite timeouts. When the network is broken, the polling just retries the outstanding reads and writes because there is nothing to say that the connection is permanently broken.

kerberosYao

kerberosYao commented on Jul 11, 2019

@kerberosYao
Author

I agree that setting write deadline to terminate tcp connection. But only the Set[Read|Write]Deadline(time.Time) methods exposed by net.Conn with are effective.The customized http.Client and transport which have set timeouts is invalid.Now the question is whether http.Client or transport has a method equivalent to SetDeadline exposed by net.Conn.

fraenkel

fraenkel commented on Jul 11, 2019

@fraenkel
Contributor

There is Client.Timeout, or setting the request context using WithDeadline/WithTimeout which is for the entire request. Setting deadlines on the connection is the wrong approach given the issue is with http2 and you are sharing a connection.

gzcaimufu

gzcaimufu commented on Jul 11, 2019

@gzcaimufu

@fraenkel I think setting write or read deadline on connection is not bad. Only by doing that calller can return error early, but not waiting for broken connection for a long time.

salmon7

salmon7 commented on Jul 11, 2019

@salmon7

According the goroutine stack, http1.x shares some common logic with http2, is it possible to not block http1.x request(https) when calling WriteHeaders() ?

fraenkel

fraenkel commented on Jul 11, 2019

@fraenkel
Contributor

@salmon7 My best advice right now is to use separate transports. Its not like you are getting connection sharing when going to two different sites.

fraenkel

fraenkel commented on Jul 11, 2019

@fraenkel
Contributor

@gzcaimufu Adjust the keepalive on the connection to detect this sooner.

fraenkel

fraenkel commented on Jul 11, 2019

@fraenkel
Contributor

@ianlancetaylor This testcase highlights an issue with epoll that I do not know how to easily resolve other than layering timeouts. If I run the testcase although I used url=https://www.google.com, and then kill my network connection, the read and write sides go into a constant EGAIN loop. If I re-enable my network, I never see the connection re-established. I don't see any easily solutions.

4 remaining items

joshprzybyszewski-wf

joshprzybyszewski-wf commented on Jun 9, 2020

@joshprzybyszewski-wf

For those who follow the example of #33006 (comment), please note that while following that example, we started getting errors like connect: cannot assign requested address. We believe it's because of this:

client := &http.Client{
	Transport: &http.Transport{
		Dial: (&net.Dialer{
		}).Dial,
	},
	Timeout: 5 * time.Second,
}

while looking at #16012 (comment)

We think that using MaxIdleConnsPerHost = 100 will resolve, based on the http.DefaultTransport:

transport := &http.Transport{
	Proxy: http.ProxyFromEnvironment,
	DialContext: (&net.Dialer{
		Timeout:   30 * time.Second,
		KeepAlive: 30 * time.Second,
		DualStack: true,
	}).DialContext,
	ForceAttemptHTTP2:     true,
	MaxIdleConns:          100,
	IdleConnTimeout:       90 * time.Second,
	TLSHandshakeTimeout:   10 * time.Second,
	ExpectContinueTimeout: 1 * time.Second,
}
client := &http.Client{
	Transport: transport,
	Timeout:   5 * time.Second,
}
yuanmwang-wf

yuanmwang-wf commented on Jun 25, 2020

@yuanmwang-wf

@fraenkel

The only solution to break out of this situation is to start setting decent timeouts/deadlines. Using the default transport, you are saying to have infinite timeouts. When the network is broken, the polling just retries the outstanding reads and writes because there is nothing to say that the connection is permanently broken.

Recently we ran into what seems to be this problem with our go services and in our testing we noticed setting timeouts on the http client still causes all connections (even http/1 connections from the same http client) to be blocked after an http/2 connection goes bad.

We were able to reproduce this by setting up a testing app, only one http client is created in the app and we've set the timeout to be 10 seconds on the http client. The app makes both an http/2 connection and an http/1 connection every 5 seconds, we then set up an iptables rule to block packets on the ephemeral port of the http/2 connection, we saw the app reports timeout on the http/2 connection as expected, the specific error was context deadline exceeded (Client.Timeout exceeded while awaiting headers). However, we soon noticed the http/1 connections were blocked too. For roughly 15 minutes our app stopped making both http/1 and http/2 requests and we saw a flood of all buffered requests after 15 minutes because presumably OS killed the underlying tcp connections.

carter2000

carter2000 commented on Dec 8, 2020

@carter2000

Thanks all.The stuck problem seems like to have been solved after i define an http.Client with a sensible timeout and specify a custom net.Transport and net.Dialer.The modified code is below.

package main

import (
"flag"
"io"
"io/ioutil"
"log"
"net"
"net/http"
_ "net/http/pprof"
"strings"
"sync/atomic"
"time"
)

var url = flag.String("url", "https://10.210.180.138:9908", "url")

func main() {

flag.Parse()

go func() {
log.Println(http.ListenAndServe("localhost:6060", nil))
}()

var reqA, doneA int64
var reqB, doneB int64
client := &http.Client{
Transport: &http.Transport{
Dial: (&net.Dialer{
}).Dial,
},
Timeout: 5 * time.Second,
}
for {
for i := 0; i < 10; i++ {
go func() {
atomic.AddInt64(&reqA, 1)
resp, err := http.Get("https://www.qiniu.com")
if err == nil {
io.Copy(ioutil.Discard, resp.Body)
resp.Body.Close()
}
atomic.AddInt64(&doneA, 1)
}()
}
for i := 0; i < 10; i++ {
go func() {
atomic.AddInt64(&reqB, 1)
req, _ := http.NewRequest("GET", *url, nil)
req.Header.Set("X-Qiniu", strings.Repeat("helloworld", 1024))
resp, err := client.Do(req)
if err == nil {
io.Copy(ioutil.Discard, resp.Body)
resp.Body.Close()
} else {
log.Println(err)
}
atomic.AddInt64(&doneB, 1)
}()
}

  log.Printf("reqA:%d doneA:%d, reqB:%d doneB:%d\n", atomic.LoadInt64(&reqA), atomic.LoadInt64(&doneA), atomic.LoadInt64(&reqB), atomic.LoadInt64(&doneB))

  time.Sleep(1e9)

}
}

In this case, A and B use different transports, this just circumvents the problem, but does not really solve it . The problem should still exist.

neild

neild commented on Oct 13, 2021

@neild
Contributor

I believe the problem described in the original report is the same as #32388, which should have been fixed by CL 349594.

locked and limited conversation to collaborators on Oct 13, 2022
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Metadata

Metadata

Assignees

No one assigned

    Labels

    FrozenDueToAgeNeedsInvestigationSomeone must examine and confirm this is a valid issue and not a duplicate of an existing one.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

      Development

      No branches or pull requests

        Participants

        @neild@fraenkel@carter2000@av86743@agnivade

        Issue actions

          net/http: a "bad" https connection which uses Header.Set could potentially block all https requests more than 15 mins in production environments · Issue #33006 · golang/go