Skip to content

Commit 3908b01

Browse files
committed
Rate limit certain warning messages
There are a few warning messages that can easily flood the log, making it more difficult to figure out what causes the problem. Those are - too long WAL write - waited for ... bytes of vinyl memory quota for too long - get/select(...) => ... took too long - readahead limit is reached - net_msg_max limit is reached Actually, it's pointless to print each and every of them, because all messages of the same kind are similar and don't convey any additional information. So this patch limits the rate at which those messages may be printed. To achieve that, it introduces say_ratelimited() helper, which works exactly like say() except it does nothing if too many messages of the same kind have already been printed in the last few seconds. The implementation is trivial - say_ratelimited() defines a static ratelimit state variable at its call site (it's a macro) and checks it before logging anything. If the ratelimit state says that an event may be emitted, it will log the message, otherwise it will skip it and eventually print the total number of skipped messages instead. The rate limit is set to 10 messages per 5 seconds for each kind of a warning message enumerated above. Here's how it looks in the log: 2018-12-11 18:07:21.830 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.831 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.831 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.831 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.831 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached 2018-12-11 18:07:26.851 [30404] iproto iproto.cc:524 W> 9635 messages suppressed Closes #2218
1 parent 3dd7090 commit 3908b01

File tree

6 files changed

+59
-15
lines changed

6 files changed

+59
-15
lines changed

src/box/iproto.cc

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -539,8 +539,9 @@ iproto_connection_is_idle(struct iproto_connection *con)
539539
static inline void
540540
iproto_connection_stop_readahead_limit(struct iproto_connection *con)
541541
{
542-
say_warn("stopping input on connection %s, readahead limit is reached",
543-
sio_socketname(con->input.fd));
542+
say_warn_ratelimited("stopping input on connection %s, "
543+
"readahead limit is reached",
544+
sio_socketname(con->input.fd));
544545
assert(rlist_empty(&con->in_stop_list));
545546
ev_io_stop(con->loop, &con->input);
546547
}
@@ -550,8 +551,9 @@ iproto_connection_stop_msg_max_limit(struct iproto_connection *con)
550551
{
551552
assert(rlist_empty(&con->in_stop_list));
552553

553-
say_warn("stopping input on connection %s, net_msg_max limit is reached",
554-
sio_socketname(con->input.fd));
554+
say_warn_ratelimited("stopping input on connection %s, "
555+
"net_msg_max limit is reached",
556+
sio_socketname(con->input.fd));
555557
ev_io_stop(con->loop, &con->input);
556558
/*
557559
* Important to add to tail and fetch from head to ensure

src/box/txn.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -296,8 +296,9 @@ txn_write_to_wal(struct txn *txn)
296296
diag_set(ClientError, ER_WAL_IO);
297297
diag_log();
298298
} else if (stop - start > too_long_threshold) {
299-
say_warn("too long WAL write: %d rows at LSN %lld: %.3f sec",
300-
txn->n_rows, res - txn->n_rows + 1, stop - start);
299+
say_warn_ratelimited("too long WAL write: %d rows at "
300+
"LSN %lld: %.3f sec", txn->n_rows,
301+
res - txn->n_rows + 1, stop - start);
301302
}
302303
/*
303304
* Use vclock_sum() from WAL writer as transaction signature.

src/box/vy_point_lookup.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -287,9 +287,10 @@ vy_point_lookup(struct vy_lsm *lsm, struct vy_tx *tx,
287287
latency_collect(&lsm->stat.latency, latency);
288288

289289
if (latency > lsm->env->too_long_threshold) {
290-
say_warn("%s: get(%s) => %s took too long: %.3f sec",
291-
vy_lsm_name(lsm), tuple_str(key),
292-
vy_stmt_str(*ret), latency);
290+
say_warn_ratelimited("%s: get(%s) => %s "
291+
"took too long: %.3f sec",
292+
vy_lsm_name(lsm), tuple_str(key),
293+
vy_stmt_str(*ret), latency);
293294
}
294295
return 0;
295296
}

src/box/vy_quota.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -241,8 +241,9 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout)
241241

242242
double wait_time = ev_monotonic_now(loop()) - wait_start;
243243
if (wait_time > q->too_long_threshold) {
244-
say_warn("waited for %zu bytes of vinyl memory quota "
245-
"for too long: %.3f sec", size, wait_time);
244+
say_warn_ratelimited("waited for %zu bytes of vinyl memory "
245+
"quota for too long: %.3f sec", size,
246+
wait_time);
246247
}
247248

248249
vy_quota_do_use(q, size);

src/box/vy_read_iterator.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -887,10 +887,11 @@ vy_read_iterator_next(struct vy_read_iterator *itr, struct tuple **result)
887887
latency_collect(&lsm->stat.latency, latency);
888888

889889
if (latency > lsm->env->too_long_threshold) {
890-
say_warn("%s: select(%s, %s) => %s took too long: %.3f sec",
891-
vy_lsm_name(lsm), tuple_str(itr->key),
892-
iterator_type_strs[itr->iterator_type],
893-
vy_stmt_str(stmt), latency);
890+
say_warn_ratelimited("%s: select(%s, %s) => %s "
891+
"took too long: %.3f sec",
892+
vy_lsm_name(lsm), tuple_str(itr->key),
893+
iterator_type_strs[itr->iterator_type],
894+
vy_stmt_str(stmt), latency);
894895
}
895896

896897
*result = stmt;

src/say.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
#include <tarantool_ev.h>
4040
#include "small/rlist.h"
4141
#include "fiber_cond.h"
42+
#include "ratelimit.h"
4243

4344
#if defined(__cplusplus)
4445
extern "C" {
@@ -327,6 +328,43 @@ CFORMAT(printf, 5, 0) extern sayfunc_t _say;
327328
#define panic(...) panic_status(EXIT_FAILURE, __VA_ARGS__)
328329
#define panic_syserror(...) ({ say(S_FATAL, strerror(errno), __VA_ARGS__); exit(EXIT_FAILURE); })
329330

331+
enum {
332+
/* 10 messages per 5 seconds. */
333+
SAY_RATELIMIT_INTERVAL = 5,
334+
SAY_RATELIMIT_BURST = 10,
335+
};
336+
337+
/**
338+
* Wrapper around ratelimit_check() that prints a warning if some
339+
* messages are suppressed. It uses ev_monotonic_now() as a time
340+
* source.
341+
*/
342+
#define say_ratelimit_check(rl) ({ \
343+
int suppressed = 0; \
344+
bool ret = ratelimit_check((rl), ev_monotonic_now(loop()), \
345+
&suppressed); \
346+
if (suppressed > 0) \
347+
say_warn("%d messages suppressed", suppressed); \
348+
ret; \
349+
})
350+
351+
/**
352+
* Same as say(), but rate limited. If this function is called more
353+
* often than SAY_RATELIMIT_BURST times per SAY_RATELIMIT_INTERVAL
354+
* seconds, extra messages are suppressed and a warning is printed
355+
* to the log.
356+
*/
357+
#define say_ratelimited(level, error, format, ...) ({ \
358+
static struct ratelimit rl = \
359+
RATELIMIT_INITIALIZER(SAY_RATELIMIT_INTERVAL, \
360+
SAY_RATELIMIT_BURST); \
361+
if (say_ratelimit_check(&rl)) \
362+
say(level, error, format, ##__VA_ARGS__); \
363+
})
364+
365+
#define say_warn_ratelimited(format, ...) \
366+
say_ratelimited(S_WARN, NULL, format, ##__VA_ARGS__)
367+
330368
/**
331369
* Format and print a message to Tarantool log file.
332370
*

0 commit comments

Comments
 (0)