Skip to content

Commit 992f423

Browse files
Albin Kerouantonakerouanton
Albin Kerouanton
authored andcommitted
Properly stop logger during (re)connect failure
PR fluent#77 introduced a new parameter named ForceStopAsyncSend. It can be used to tell the logger to not try to send all the log messages in its buffer before closing. Without this parameter, the logger hangs out whenever it has logs to write and the target Fluentd server is down. But this new parameter is not enough: the connection is currently lazily initialized when the logger receive its first message. This blocks the select reading messages from the queue (until the connection is ready). Moreover, the connection dialing uses an exponential back-off retry. Because of that, the logger won't look for messages on `stopRunning` channel (the channel introduced by PR fluent#77), either because it's blocked by the Sleep used for the retry or because the connection dialing is waiting until dialing timeout (eg. TCP timeout). To fix these edge cases, the time.Sleep() used for back-off retry has been transformed into a time.After(). Moreover, the dialer.Dial() call used to connect to Fluentd has been replaced with dialer.DialContext() and a cancellable context is now used to stop the call to that method. The associated cancel function is stored in Fluent and got called by Close() when ForceStopAsyncSend option is enabled. Finally, the Fluent.run() method has been adapted to wait for both new messages on f.pending and stop signal sent to f.stopRunning channel. Previously, both channels were selected independently, in a procedural fashion. This fix is motivated by the issue described in: moby/moby#40063. Signed-off-by: Albin Kerouanton <[email protected]>
1 parent 0e453b2 commit 992f423

File tree

2 files changed

+157
-100
lines changed

2 files changed

+157
-100
lines changed

fluent/fluent.go

Lines changed: 138 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
package fluent
22

33
import (
4+
"context"
45
"encoding/json"
56
"errors"
67
"fmt"
78
"math"
9+
"math/rand"
810
"net"
911
"os"
1012
"reflect"
@@ -15,7 +17,6 @@ import (
1517
"bytes"
1618
"encoding/base64"
1719
"encoding/binary"
18-
"math/rand"
1920

2021
"github.com/tinylib/msgp/msgp"
2122
)
@@ -37,6 +38,13 @@ const (
3738
defaultSubSecondPrecision = false
3839
)
3940

41+
// randomGenerator is used by getUniqueId to generate ack hashes.
42+
// Unfortunately, using a seeded random number generator isn't enough to ensure
43+
// ack hashes are deterministically generated during tests, thus we need to
44+
// change randomGenerator value to ensure the hashes are stable during tests
45+
// such that we can expect how the logger behaves with RequestAck option enabled.
46+
var randomGenerator = rand.Uint64
47+
4048
type Config struct {
4149
FluentPort int `json:"fluent_port"`
4250
FluentHost string `json:"fluent_host"`
@@ -85,15 +93,22 @@ type msgToSend struct {
8593
type Fluent struct {
8694
Config
8795

88-
dialer dialer
89-
stopRunning chan bool
90-
pending chan *msgToSend
91-
wg sync.WaitGroup
96+
dialer dialer
97+
// stopRunning is used in async mode to signal to run() it should abort.
98+
stopRunning chan struct{}
99+
// cancelDialings is used by Close() to stop any in-progress dialing.
100+
cancelDialings context.CancelFunc
101+
pending chan *msgToSend
102+
wg sync.WaitGroup
92103

93104
muconn sync.Mutex
94105
conn net.Conn
95106
}
96107

108+
type dialer interface {
109+
DialContext(ctx context.Context, network, address string) (net.Conn, error)
110+
}
111+
97112
// New creates a new Logger.
98113
func New(config Config) (*Fluent, error) {
99114
if config.Timeout == 0 {
@@ -104,10 +119,6 @@ func New(config Config) (*Fluent, error) {
104119
})
105120
}
106121

107-
type dialer interface {
108-
Dial(string, string) (net.Conn, error)
109-
}
110-
111122
func newWithDialer(config Config, d dialer) (f *Fluent, err error) {
112123
if config.FluentNetwork == "" {
113124
config.FluentNetwork = defaultNetwork
@@ -142,20 +153,24 @@ func newWithDialer(config Config, d dialer) (f *Fluent, err error) {
142153
}
143154

144155
if config.Async {
156+
ctx, cancel := context.WithCancel(context.Background())
157+
145158
f = &Fluent{
146-
Config: config,
147-
dialer: d,
148-
pending: make(chan *msgToSend, config.BufferLimit),
149-
stopRunning: make(chan bool, 1),
159+
Config: config,
160+
dialer: d,
161+
stopRunning: make(chan struct{}),
162+
cancelDialings: cancel,
163+
pending: make(chan *msgToSend, config.BufferLimit),
150164
}
165+
151166
f.wg.Add(1)
152-
go f.run()
167+
go f.run(ctx)
153168
} else {
154169
f = &Fluent{
155170
Config: config,
156171
dialer: d,
157172
}
158-
err = f.connect()
173+
err = f.connect(context.Background())
159174
}
160175
return
161176
}
@@ -252,7 +267,7 @@ func (f *Fluent) postRawData(msg *msgToSend) error {
252267
return f.appendBuffer(msg)
253268
}
254269
// Synchronous write
255-
return f.write(msg)
270+
return f.write(context.Background(), msg)
256271
}
257272

258273
// For sending forward protocol adopted JSON
@@ -286,7 +301,7 @@ func getUniqueID(timeUnix int64) (string, error) {
286301
enc.Close()
287302
return "", err
288303
}
289-
if err := binary.Write(enc, binary.LittleEndian, rand.Uint64()); err != nil {
304+
if err := binary.Write(enc, binary.LittleEndian, randomGenerator()); err != nil {
290305
enc.Close()
291306
return "", err
292307
}
@@ -326,13 +341,18 @@ func (f *Fluent) EncodeData(tag string, tm time.Time, message interface{}) (msg
326341
func (f *Fluent) Close() (err error) {
327342
if f.Config.Async {
328343
if f.Config.ForceStopAsyncSend {
329-
f.stopRunning <- true
330344
close(f.stopRunning)
345+
f.cancelDialings()
331346
}
347+
332348
close(f.pending)
333349
f.wg.Wait()
334350
}
335-
f.close(f.conn)
351+
352+
f.muconn.Lock()
353+
f.close()
354+
f.muconn.Unlock()
355+
336356
return
337357
}
338358

@@ -346,58 +366,91 @@ func (f *Fluent) appendBuffer(msg *msgToSend) error {
346366
return nil
347367
}
348368

349-
// close closes the connection.
350-
func (f *Fluent) close(c net.Conn) {
351-
f.muconn.Lock()
352-
if f.conn != nil && f.conn == c {
369+
// close closes the connection. Callers should take care of locking muconn first.
370+
func (f *Fluent) close() {
371+
if f.conn != nil {
353372
f.conn.Close()
354373
f.conn = nil
355374
}
356-
f.muconn.Unlock()
357375
}
358376

359377
// connect establishes a new connection using the specified transport.
360-
func (f *Fluent) connect() (err error) {
378+
func (f *Fluent) connect(ctx context.Context) (err error) {
361379
switch f.Config.FluentNetwork {
362380
case "tcp":
363-
f.conn, err = f.dialer.Dial(
381+
f.conn, err = f.dialer.DialContext(ctx,
364382
f.Config.FluentNetwork,
365383
f.Config.FluentHost+":"+strconv.Itoa(f.Config.FluentPort))
366384
case "unix":
367-
f.conn, err = f.dialer.Dial(
385+
f.conn, err = f.dialer.DialContext(ctx,
368386
f.Config.FluentNetwork,
369387
f.Config.FluentSocketPath)
370388
default:
371389
err = NewErrUnknownNetwork(f.Config.FluentNetwork)
372390
}
391+
373392
return err
374393
}
375394

376-
func (f *Fluent) run() {
377-
drainEvents := false
378-
var emitEventDrainMsg sync.Once
395+
var errIsClosing = errors.New("fluent logger is closing")
396+
397+
func (f *Fluent) connectOrRetry(ctx context.Context) error {
398+
// Use a Time channel instead of time.Sleep() to avoid blocking this
399+
// goroutine during possibly way too much time (because of the exponential
400+
// back-off retry).
401+
waiter := time.After(time.Duration(0))
402+
for i := 0; i < f.Config.MaxRetry; i++ {
403+
select {
404+
case <-waiter:
405+
err := f.connect(ctx)
406+
if err == nil {
407+
return nil
408+
}
409+
410+
if _, ok := err.(*ErrUnknownNetwork); ok {
411+
return err
412+
}
413+
if err == context.Canceled {
414+
return errIsClosing
415+
}
416+
417+
waitTime := f.Config.RetryWait * e(defaultReconnectWaitIncreRate, float64(i-1))
418+
if waitTime > f.Config.MaxRetryWait {
419+
waitTime = f.Config.MaxRetryWait
420+
}
421+
422+
waiter = time.After(time.Duration(waitTime) * time.Millisecond)
423+
424+
fmt.Fprintf(os.Stderr, "[%s] An error happened during connect: %s. Retrying to connect in %dms.", time.Now().Format(time.RFC3339), err, waitTime)
425+
case <-ctx.Done():
426+
return errIsClosing
427+
}
428+
}
429+
430+
return fmt.Errorf("could not connect to fluentd after %d retries", f.Config.MaxRetry)
431+
}
432+
433+
// run is the goroutine used to unqueue and write logs in async mode. That
434+
// goroutine is meant to run during the whole life of the Fluent logger.
435+
func (f *Fluent) run(ctx context.Context) {
379436
for {
380437
select {
381438
case entry, ok := <-f.pending:
439+
// f.stopRunning is closed before f.pending only when ForceStopAsyncSend
440+
// is enabled. Otherwise, f.pending is closed when Close() is called.
382441
if !ok {
383442
f.wg.Done()
384443
return
385444
}
386-
if drainEvents {
387-
emitEventDrainMsg.Do(func() { fmt.Fprintf(os.Stderr, "[%s] Discarding queued events...\n", time.Now().Format(time.RFC3339)) })
388-
continue
389-
}
390-
err := f.write(entry)
391-
if err != nil {
445+
446+
if err := f.write(ctx, entry); err != nil && err != errIsClosing {
392447
fmt.Fprintf(os.Stderr, "[%s] Unable to send logs to fluentd, reconnecting...\n", time.Now().Format(time.RFC3339))
393448
}
394-
}
395-
select {
396-
case stopRunning, ok := <-f.stopRunning:
397-
if stopRunning || !ok {
398-
drainEvents = true
399-
}
400-
default:
449+
case <-f.stopRunning:
450+
fmt.Fprintf(os.Stderr, "[%s] Discarding queued events...\n", time.Now().Format(time.RFC3339))
451+
452+
f.wg.Done()
453+
return
401454
}
402455
}
403456
}
@@ -406,63 +459,59 @@ func e(x, y float64) int {
406459
return int(math.Pow(x, y))
407460
}
408461

409-
func (f *Fluent) write(msg *msgToSend) error {
410-
var c net.Conn
411-
for i := 0; i < f.Config.MaxRetry; i++ {
412-
c = f.conn
413-
// Connect if needed
414-
if c == nil {
415-
f.muconn.Lock()
416-
if f.conn == nil {
417-
err := f.connect()
418-
if err != nil {
419-
f.muconn.Unlock()
420-
421-
if _, ok := err.(*ErrUnknownNetwork); ok {
422-
// do not retry on unknown network error
423-
break
424-
}
425-
waitTime := f.Config.RetryWait * e(defaultReconnectWaitIncreRate, float64(i-1))
426-
if waitTime > f.Config.MaxRetryWait {
427-
waitTime = f.Config.MaxRetryWait
428-
}
429-
time.Sleep(time.Duration(waitTime) * time.Millisecond)
430-
continue
431-
}
462+
func (f *Fluent) write(ctx context.Context, msg *msgToSend) error {
463+
writer := func() (bool, error) {
464+
// This function is used to ensure muconn is properly locked and unlocked
465+
// between each retry. This gives the importunity to other goroutines to
466+
// lock it (e.g. to close the connection).
467+
f.muconn.Lock()
468+
defer f.muconn.Unlock()
469+
470+
if f.conn == nil {
471+
if err := f.connectOrRetry(ctx); err != nil {
472+
return false, err
432473
}
433-
c = f.conn
434-
f.muconn.Unlock()
435474
}
436475

437-
// We're connected, write msg
438476
t := f.Config.WriteTimeout
439477
if time.Duration(0) < t {
440-
c.SetWriteDeadline(time.Now().Add(t))
478+
f.conn.SetWriteDeadline(time.Now().Add(t))
441479
} else {
442-
c.SetWriteDeadline(time.Time{})
480+
f.conn.SetWriteDeadline(time.Time{})
443481
}
444-
_, err := c.Write(msg.data)
482+
483+
_, err := f.conn.Write(msg.data)
445484
if err != nil {
446-
f.close(c)
447-
} else {
448-
// Acknowledgment check
449-
if msg.ack != "" {
450-
resp := &AckResp{}
451-
if f.Config.MarshalAsJSON {
452-
dec := json.NewDecoder(c)
453-
err = dec.Decode(resp)
454-
} else {
455-
r := msgp.NewReader(c)
456-
err = resp.DecodeMsg(r)
457-
}
458-
if err != nil || resp.Ack != msg.ack {
459-
f.close(c)
460-
continue
461-
}
485+
f.close()
486+
return true, err
487+
}
488+
489+
// Acknowledgment check
490+
if msg.ack != "" {
491+
resp := &AckResp{}
492+
if f.Config.MarshalAsJSON {
493+
dec := json.NewDecoder(f.conn)
494+
err = dec.Decode(resp)
495+
} else {
496+
r := msgp.NewReader(f.conn)
497+
err = resp.DecodeMsg(r)
462498
}
499+
500+
if err != nil || resp.Ack != msg.ack {
501+
fmt.Fprintf(os.Stderr, "fluent#write: message ack (%s) doesn't match expected one (%s). Closing connection...", resp.Ack, msg.ack)
502+
f.close()
503+
return true, err
504+
}
505+
}
506+
507+
return false, nil
508+
}
509+
510+
for i := 0; i < f.Config.MaxRetry; i++ {
511+
if retry, err := writer(); !retry {
463512
return err
464513
}
465514
}
466515

467-
return fmt.Errorf("fluent#write: failed to reconnect, max retry: %v", f.Config.MaxRetry)
516+
return fmt.Errorf("fluent#write: failed to write after %d attempts", f.Config.MaxRetry)
468517
}

0 commit comments

Comments
 (0)