From 3c805535c940d9f7f27f9026901b2006ee50598d Mon Sep 17 00:00:00 2001 From: Georgy Moiseev Date: Tue, 17 May 2022 16:49:45 +0300 Subject: [PATCH 1/3] doc: fix quantile tolerated error description --- crud/cfg.lua | 2 +- crud/stats/init.lua | 2 +- crud/stats/metrics_registry.lua | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crud/cfg.lua b/crud/cfg.lua index d716f2bf..560659ff 100644 --- a/crud/cfg.lua +++ b/crud/cfg.lua @@ -103,7 +103,7 @@ end -- @number[opt=1e-3] opts.stats_quantile_tolerated_error -- See tarantool/metrics summary API for details: -- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary --- If quantile value is -Inf, try to decrease quantile tolerance. +-- If quantile value is -Inf, try to decrease quantile tolerated error. -- See https://github.com/tarantool/metrics/issues/189 for issue details. -- Decreasing the value increases computational load. -- diff --git a/crud/stats/init.lua b/crud/stats/init.lua index bb2e6d08..28fa6f2f 100644 --- a/crud/stats/init.lua +++ b/crud/stats/init.lua @@ -87,7 +87,7 @@ end -- @number[opt=1e-3] opts.quantile_tolerated_error -- See tarantool/metrics summary API for details: -- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary --- If quantile value is -Inf, try to decrease quantile tolerance. +-- If quantile value is -Inf, try to decrease quantile tolerated error. -- See https://github.com/tarantool/metrics/issues/189 for issue details. -- -- @treturn boolean Returns `true`. diff --git a/crud/stats/metrics_registry.lua b/crud/stats/metrics_registry.lua index cee8559f..1e3c7ab4 100644 --- a/crud/stats/metrics_registry.lua +++ b/crud/stats/metrics_registry.lua @@ -84,7 +84,7 @@ end -- @number[opt=1e-3] opts.quantile_tolerated_error -- See metrics summary API for details: -- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary --- If quantile value is -Inf, try to decrease quantile tolerance. +-- If quantile value is -Inf, try to decrease quantile tolerated error. -- See https://github.com/tarantool/metrics/issues/189 for issue details. -- -- @treturn boolean Returns `true`. From 712c50b000d68afb5302d4247fe88b99d62a2b13 Mon Sep 17 00:00:00 2001 From: Georgy Moiseev Date: Tue, 17 May 2022 16:56:35 +0300 Subject: [PATCH 2/3] stats: make quantile age params configurable The main motivation of making these parameters configurable is to be able to write time-adequate test cases for #286 issue, but they may be useful in getting rid of #286 effect at all or in quantile time window calibration. After this patch, statistics summary quantile aging params `age_bucket_count` and `max_age_time` [1] could be configured: crud.cfg{ stats_quantile_age_bucket_count = 3, stats_quantile_max_age_time = 30, } Only type validation is conducted in crud.cfg, every other validation is performed by metrics itself. 1. https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary Part of #286 --- CHANGELOG.md | 1 + README.md | 12 +++- crud/cfg.lua | 41 ++++++++++- crud/stats/init.lua | 48 ++++++++++++- crud/stats/local_registry.lua | 8 +++ crud/stats/metrics_registry.lua | 27 ++++++-- test/integration/cfg_test.lua | 46 +++++++++++++ test/integration/stats_test.lua | 5 +- test/unit/stats_test.lua | 118 +++++++++++++++++++++++++++++++- 9 files changed, 293 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0cef8a29..7517df36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +* Make metrics quantile collector age params configurable (#286). ### Changed diff --git a/README.md b/README.md index 01413a53..7f15552d 100644 --- a/README.md +++ b/README.md @@ -797,7 +797,17 @@ crud.cfg{stats_quantile_tolerated_error = 1e-4} ``` See [tarantool/metrics#189](https://github.com/tarantool/metrics/issues/189) for details about the issue. - +You can also configure quantile `age_bucket_count` (default: 2) and +`max_age_time` (in seconds, default: 60): +```lua +crud.cfg{ + stats_quantile_age_bucket_count = 3, + stats_quantile_max_age_time = 30, +} +``` +See [`metrics` summary API](https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary) +for details. These parameters can be used to smooth time window move +or reduce the amount on `-nan` gaps for low request frequency applications. `select` section additionally contains `details` collectors. ```lua diff --git a/crud/cfg.lua b/crud/cfg.lua index 560659ff..af541caa 100644 --- a/crud/cfg.lua +++ b/crud/cfg.lua @@ -29,6 +29,14 @@ local function set_defaults_if_empty(cfg) cfg.stats_quantile_tolerated_error = stats.DEFAULT_QUANTILE_TOLERATED_ERROR end + if cfg.stats_quantile_age_buckets_count == nil then + cfg.stats_quantile_age_buckets_count = stats.DEFAULT_QUANTILE_AGE_BUCKET_COUNT + end + + if cfg.stats_quantile_max_age_time == nil then + cfg.stats_quantile_max_age_time = stats.DEFAULT_QUANTILE_MAX_AGE_TIME + end + return cfg end @@ -38,7 +46,9 @@ local function configure_stats(cfg, opts) if (opts.stats == nil) and (opts.stats_driver == nil) and (opts.stats_quantiles == nil) - and (opts.stats_quantile_tolerated_error == nil) then + and (opts.stats_quantile_tolerated_error == nil) + and (opts.stats_quantile_age_buckets_count == nil) + and (opts.stats_quantile_max_age_time == nil) then return end @@ -58,11 +68,21 @@ local function configure_stats(cfg, opts) opts.stats_quantile_tolerated_error = cfg.stats_quantile_tolerated_error end + if opts.stats_quantile_age_buckets_count == nil then + opts.stats_quantile_age_buckets_count = cfg.stats_quantile_age_buckets_count + end + + if opts.stats_quantile_max_age_time == nil then + opts.stats_quantile_max_age_time = cfg.stats_quantile_max_age_time + end + if opts.stats == true then stats.enable{ driver = opts.stats_driver, quantiles = opts.stats_quantiles, quantile_tolerated_error = opts.stats_quantile_tolerated_error, + quantile_age_buckets_count = opts.stats_quantile_age_buckets_count, + quantile_max_age_time = opts.stats_quantile_max_age_time, } else stats.disable() @@ -72,6 +92,8 @@ local function configure_stats(cfg, opts) rawset(cfg, 'stats_driver', opts.stats_driver) rawset(cfg, 'stats_quantiles', opts.stats_quantiles) rawset(cfg, 'stats_quantile_tolerated_error', opts.stats_quantile_tolerated_error) + rawset(cfg, 'stats_quantile_age_buckets_count', opts.stats_quantile_age_buckets_count) + rawset(cfg, 'stats_quantile_max_age_time', opts.stats_quantile_max_age_time) end --- Configure CRUD module. @@ -107,6 +129,21 @@ end -- See https://github.com/tarantool/metrics/issues/189 for issue details. -- Decreasing the value increases computational load. -- +-- @number[opt=2] opts.stats_quantile_age_buckets_count +-- Count of summary quantile buckets. +-- See tarantool/metrics summary API for details: +-- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary +-- Increasing the value smoothes time window move, +-- but consumes additional memory and CPU. +-- +-- @number[opt=60] opts.stats_quantile_max_age_time +-- Duration of each bucket’s lifetime in seconds. +-- See tarantool/metrics summary API for details: +-- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary +-- Smaller bucket lifetime results in smaller time window for quantiles, +-- but more CPU is spent on bucket rotation. If your application has low request +-- frequency, increase the value to reduce the amount of `-nan` gaps in quantile values. +-- -- @return Configuration table. -- local function __call(self, opts) @@ -115,6 +152,8 @@ local function __call(self, opts) stats_driver = '?string', stats_quantiles = '?boolean', stats_quantile_tolerated_error = '?number', + stats_quantile_age_buckets_count = '?number', + stats_quantile_max_age_time = '?number', }) opts = table.deepcopy(opts) or {} diff --git a/crud/stats/init.lua b/crud/stats/init.lua index 28fa6f2f..fefb32be 100644 --- a/crud/stats/init.lua +++ b/crud/stats/init.lua @@ -90,6 +90,21 @@ end -- If quantile value is -Inf, try to decrease quantile tolerated error. -- See https://github.com/tarantool/metrics/issues/189 for issue details. -- +-- @number[opt=2] opts.quantile_age_buckets_count +-- Count of summary quantile buckets. +-- See tarantool/metrics summary API for details: +-- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary +-- Increasing the value smoothes time window move, +-- but consumes additional memory and CPU. +-- +-- @number[opt=60] opts.quantile_max_age_time +-- Duration of each bucket’s lifetime in seconds. +-- See tarantool/metrics summary API for details: +-- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary +-- Smaller bucket lifetime results in smaller time window for quantiles, +-- but more CPU is spent on bucket rotation. If your application has low request +-- frequency, increase the value to reduce the amount of `-nan` gaps in quantile values. +-- -- @treturn boolean Returns `true`. -- function stats.enable(opts) @@ -97,6 +112,8 @@ function stats.enable(opts) driver = '?string', quantiles = '?boolean', quantile_tolerated_error = '?number', + quantile_age_buckets_count = '?number', + quantile_max_age_time = '?number', }) StatsError:assert( @@ -122,10 +139,20 @@ function stats.enable(opts) opts.quantile_tolerated_error = stats.DEFAULT_QUANTILE_TOLERATED_ERROR end + if opts.quantile_age_buckets_count == nil then + opts.quantile_age_buckets_count = stats.DEFAULT_QUANTILE_AGE_BUCKET_COUNT + end + + if opts.quantile_max_age_time == nil then + opts.quantile_max_age_time = stats.DEFAULT_QUANTILE_MAX_AGE_TIME + end + -- Do not reinit if called with same options. if internal.driver == opts.driver and internal.quantiles == opts.quantiles - and internal.quantile_tolerated_error == opts.quantile_tolerated_error then + and internal.quantile_tolerated_error == opts.quantile_tolerated_error + and internal.quantile_age_buckets_count == opts.quantile_age_buckets_count + and internal.quantile_max_age_time == opts.quantile_max_age_time then return true end @@ -136,11 +163,15 @@ function stats.enable(opts) internal:get_registry().init{ quantiles = opts.quantiles, - quantile_tolerated_error = opts.quantile_tolerated_error + quantile_tolerated_error = opts.quantile_tolerated_error, + quantile_age_buckets_count = opts.quantile_age_buckets_count, + quantile_max_age_time = opts.quantile_max_age_time, } internal.quantiles = opts.quantiles internal.quantile_tolerated_error = opts.quantile_tolerated_error + internal.quantile_age_buckets_count = opts.quantile_age_buckets_count + internal.quantile_max_age_time = opts.quantile_max_age_time return true end @@ -162,7 +193,9 @@ function stats.reset() internal:get_registry().destroy() internal:get_registry().init{ quantiles = internal.quantiles, - quantile_tolerated_error = internal.quantile_tolerated_error + quantile_tolerated_error = internal.quantile_tolerated_error, + quantile_age_buckets_count = internal.quantile_age_buckets_count, + quantile_max_age_time = internal.quantile_max_age_time, } return true @@ -184,6 +217,9 @@ function stats.disable() internal:get_registry().destroy() internal.driver = nil internal.quantiles = nil + internal.quantile_tolerated_error = nil + internal.quantile_age_buckets_count = nil + internal.quantile_max_age_time = nil return true end @@ -495,4 +531,10 @@ stats.internal = internal --- Default metrics quantile precision. stats.DEFAULT_QUANTILE_TOLERATED_ERROR = 1e-3 +--- Default metrics quantile bucket count. +stats.DEFAULT_QUANTILE_AGE_BUCKET_COUNT = 2 + +--- Default metrics quantile bucket lifetime. +stats.DEFAULT_QUANTILE_MAX_AGE_TIME = 60 + return stats diff --git a/crud/stats/local_registry.lua b/crud/stats/local_registry.lua index 3c2df64b..1ad7c1b5 100644 --- a/crud/stats/local_registry.lua +++ b/crud/stats/local_registry.lua @@ -28,12 +28,20 @@ local StatsLocalError = errors.new_class('StatsLocalError', {capture_stack = fal -- @number opts.quantile_tolerated_error -- Quantiles is not supported for local, so the value is ignored. -- +-- @number opts.quantile_age_buckets_count +-- Quantiles is not supported for local, so the value is ignored. +-- +-- @number opts.quantile_max_age_time +-- Quantiles is not supported for local, so the value is ignored. +-- -- @treturn boolean Returns `true`. -- function registry.init(opts) dev_checks({ quantiles = 'boolean', quantile_tolerated_error = 'number', + quantile_age_buckets_count = 'number', + quantile_max_age_time = 'number', }) StatsLocalError:assert(opts.quantiles == false, diff --git a/crud/stats/metrics_registry.lua b/crud/stats/metrics_registry.lua index 1e3c7ab4..f49524f0 100644 --- a/crud/stats/metrics_registry.lua +++ b/crud/stats/metrics_registry.lua @@ -31,11 +31,6 @@ local metric_name = { local LATENCY_QUANTILE = 0.99 -local DEFAULT_AGE_PARAMS = { - age_buckets_count = 2, - max_age_time = 60, -} - --- Check if application supports metrics rock for registry -- -- `metrics >= 0.10.0` is required. @@ -87,19 +82,39 @@ end -- If quantile value is -Inf, try to decrease quantile tolerated error. -- See https://github.com/tarantool/metrics/issues/189 for issue details. -- +-- @number[opt=2] opts.quantile_age_buckets_count +-- Count of summary quantile buckets. +-- See tarantool/metrics summary API for details: +-- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary +-- Increasing the value smoothes time window move, +-- but consumes additional memory and CPU. +-- +-- @number[opt=60] opts.quantile_max_age_time +-- Duration of each bucket’s lifetime in seconds. +-- See tarantool/metrics summary API for details: +-- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary +-- Smaller bucket lifetime results in smaller time window for quantiles, +-- but more CPU is spent on bucket rotation. If your application has low request +-- frequency, increase the value to reduce the amount of `-nan` gaps in quantile values. +-- -- @treturn boolean Returns `true`. -- function registry.init(opts) dev_checks({ quantiles = 'boolean', quantile_tolerated_error = 'number', + quantile_age_buckets_count = 'number', + quantile_max_age_time = 'number', }) local quantile_params = nil local age_params = nil if opts.quantiles == true then quantile_params = {[LATENCY_QUANTILE] = opts.quantile_tolerated_error} - age_params = DEFAULT_AGE_PARAMS + age_params = { + age_buckets_count = opts.quantile_age_buckets_count, + max_age_time = opts.quantile_max_age_time, + } end internal.registry = {} diff --git a/test/integration/cfg_test.lua b/test/integration/cfg_test.lua index cafa0f22..c77dffbc 100644 --- a/test/integration/cfg_test.lua +++ b/test/integration/cfg_test.lua @@ -27,6 +27,8 @@ group.test_defaults = function(g) stats_driver = stats.get_default_driver(), stats_quantiles = false, stats_quantile_tolerated_error = 1e-3, + stats_quantile_age_buckets_count = 2, + stats_quantile_max_age_time = 60, }) end @@ -100,3 +102,47 @@ group.test_gh_284_preset_stats_quantile_tolerated_error_is_preserved = function( t.assert_equals(cfg.stats_quantile_tolerated_error, 1e-4, 'Preset stats_quantile_tolerated_error presents') end + +group.test_gh_284_preset_stats_quantile_age_buckets_count_is_preserved = function(g) + -- Arrange some cfg values so test case will not depend on defaults. + local cfg = g.cluster:server('router'):eval( + "return require('crud').cfg(...)", + {{ stats = false }}) + t.assert_equals(cfg.stats, false) + + -- Set stats_age_buckets_count. + local cfg = g.cluster:server('router'):eval( + "return require('crud').cfg(...)", + {{ stats_quantile_age_buckets_count = 3 }}) + t.assert_equals(cfg.stats_quantile_age_buckets_count, 3) + + -- Set another cfg parameter, assert preset stats_quantile_age_buckets_count presents. + local cfg = g.cluster:server('router'):eval( + "return require('crud').cfg(...)", + {{ stats = true }}) + t.assert_equals(cfg.stats, true) + t.assert_equals(cfg.stats_quantile_age_buckets_count, 3, + 'Preset stats_quantile_age_buckets_count presents') +end + +group.test_gh_284_preset_stats_quantile_max_age_time_is_preserved = function(g) + -- Arrange some cfg values so test case will not depend on defaults. + local cfg = g.cluster:server('router'):eval( + "return require('crud').cfg(...)", + {{ stats = false }}) + t.assert_equals(cfg.stats, false) + + -- Set stats_age_buckets_count. + local cfg = g.cluster:server('router'):eval( + "return require('crud').cfg(...)", + {{ stats_quantile_max_age_time = 30 }}) + t.assert_equals(cfg.stats_quantile_max_age_time, 30) + + -- Set another cfg parameter, assert preset stats_quantile_max_age_time presents. + local cfg = g.cluster:server('router'):eval( + "return require('crud').cfg(...)", + {{ stats = true }}) + t.assert_equals(cfg.stats, true) + t.assert_equals(cfg.stats_quantile_max_age_time, 30, + 'Preset stats_quantile_max_age_time presents') +end diff --git a/test/integration/stats_test.lua b/test/integration/stats_test.lua index f712c69d..35e0d316 100644 --- a/test/integration/stats_test.lua +++ b/test/integration/stats_test.lua @@ -55,7 +55,10 @@ local function enable_stats(g, params) require('crud').cfg{ stats = true, stats_driver = params.driver, - stats_quantiles = params.quantiles + stats_quantiles = params.quantiles, + stats_quantile_tolerated_error = 1e-3, + stats_quantile_age_buckets_count = 3, + stats_quantile_max_age_time = 60, } ]], { params }) end diff --git a/test/unit/stats_test.lua b/test/unit/stats_test.lua index 495687d0..e7f8b5a0 100644 --- a/test/unit/stats_test.lua +++ b/test/unit/stats_test.lua @@ -661,7 +661,9 @@ end group_driver.test_default_quantile_tolerated_error = function(g) enable_stats(g) - local quantile_tolerated_error = g.router:eval(" return stats_module.internal.quantile_tolerated_error ") + local quantile_tolerated_error = g.router:eval([[ + return stats_module.internal.quantile_tolerated_error + ]]) t.assert_equals(quantile_tolerated_error, 1e-3) end @@ -692,6 +694,120 @@ group_driver.test_custom_quantile_tolerated_error = function(g) end +group_driver.test_default_quantile_age_buckets_count = function(g) + enable_stats(g) + + local quantile_age_buckets_count = g.router:eval([[ + return stats_module.internal.quantile_age_buckets_count + ]]) + t.assert_equals(quantile_age_buckets_count, 2) +end + + +group_driver.before_test( + 'test_custom_quantile_age_buckets_count', + function(g) + t.skip_if(g.is_metrics_supported == false, 'Metrics registry is unsupported') + end +) + +group_driver.test_custom_quantile_age_buckets_count = function(g) + g.router:call('crud.cfg', {{ + stats = true, + stats_driver = 'metrics', + stats_quantiles = true, + stats_quantile_age_buckets_count = 3, + }}) + + local resp = g.router:eval([[ + local metrics = require('metrics') + + local summary = metrics.summary('tnt_crud_stats') + return summary.age_buckets_count + ]]) + + t.assert_equals(resp, 3) +end + + +group_driver.before_test( + 'test_invalid_quantile_age_buckets_count', + function(g) + t.skip_if(g.is_metrics_supported == false, 'Metrics registry is unsupported') + end +) + +group_driver.test_invalid_quantile_age_buckets_count = function(g) + local cfg_before = g.router:call('crud.cfg') + + t.assert_error(g.router.call, g.router, 'crud.cfg', {{ + stats = true, + stats_driver = 'metrics', + stats_quantiles = true, + stats_quantile_age_buckets_count = -0.2, + }}) + + local cfg_after = g.router:call('crud.cfg') + t.assert_equals(cfg_after, cfg_before) +end + + +group_driver.test_default_quantile_max_age_time = function(g) + enable_stats(g) + + local quantile_max_age_time = g.router:eval(" return stats_module.internal.quantile_max_age_time ") + t.assert_equals(quantile_max_age_time, 60) +end + + +group_driver.before_test( + 'test_custom_quantile_max_age_time', + function(g) + t.skip_if(g.is_metrics_supported == false, 'Metrics registry is unsupported') + end +) + +group_driver.test_custom_quantile_max_age_time = function(g) + g.router:call('crud.cfg', {{ + stats = true, + stats_driver = 'metrics', + stats_quantiles = true, + stats_quantile_max_age_time = 10, + }}) + + local resp = g.router:eval([[ + local metrics = require('metrics') + + local summary = metrics.summary('tnt_crud_stats') + return summary.max_age_time + ]]) + + t.assert_equals(resp, 10) +end + + +group_driver.before_test( + 'test_invalid_quantile_max_age_time', + function(g) + t.skip_if(g.is_metrics_supported == false, 'Metrics registry is unsupported') + end +) + +group_driver.test_invalid_quantile_max_age_time = function(g) + local cfg_before = g.router:call('crud.cfg') + + t.assert_error(g.router.call, g.router, 'crud.cfg', {{ + stats = true, + stats_driver = 'metrics', + stats_quantiles = true, + stats_quantile_max_age_time = -0.2, + }}) + + local cfg_after = g.router:call('crud.cfg') + t.assert_equals(cfg_after, cfg_before) +end + + group_driver.before_test( 'test_stats_reenable_with_different_driver_reset_stats', function(g) From c66d33064a9a7a27eb1c2e5eca7e472db3b91154 Mon Sep 17 00:00:00 2001 From: Georgy Moiseev Date: Tue, 17 May 2022 17:32:35 +0300 Subject: [PATCH 3/3] stats: separate quantile and average fields Add separate quantile (`latency_quantile_recent`) and average ( `latency_average`) fields to `crud.stats()` output. `latency` field and tarantool/metrics output remains unchanged. Before this patch, `latency` displayed `latency_quantile_recent` or `latency_average` and there wasn't any was to see pre-computed average if quantiles are enabled. But it may be useful if quantile is `nan`. Quantiles may display `-nan` if there were no observations for a several ages. Such behavior is expected [1] and valid: for example, Grafana should ignore such values and they will be displayed as `No data` for a window when there wasn't any requests. 1. https://github.com/tarantool/metrics/issues/303 Closes #286 --- CHANGELOG.md | 2 ++ README.md | 31 ++++++++++++----- crud/stats/local_registry.lua | 3 +- crud/stats/metrics_registry.lua | 59 ++++++++++++--------------------- crud/stats/registry_utils.lua | 11 +++++- test/integration/stats_test.lua | 18 ++++++++++ test/unit/stats_test.lua | 38 +++++++++++++++++++-- 7 files changed, 112 insertions(+), 50 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7517df36..d9afea83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added * Make metrics quantile collector age params configurable (#286). +* Add separate `latency_average` and `latency_quantile_recent` + fields to `crud.stats()` output (#286). ### Changed diff --git a/README.md b/README.md index 7f15552d..966cf4fa 100644 --- a/README.md +++ b/README.md @@ -721,11 +721,15 @@ crud.stats() my_space: insert: ok: - latency: 0.002 + latency: 0.0015 + latency_average: 0.002 + latency_quantile_recent: 0.0015 count: 19800 time: 39.6 error: - latency: 0.000001 + latency: 0.0000008 + latency_average: 0.000001 + latency_quantile_recent: 0.0000008 count: 4 time: 0.000004 ... @@ -733,11 +737,15 @@ crud.stats('my_space') --- - insert: ok: - latency: 0.002 + latency: 0.0015 + latency_average: 0.002 + latency_quantile_recent: 0.0015 count: 19800 time: 39.6 error: - latency: 0.000001 + latency: 0.0000008 + latency_average: 0.000001 + latency_quantile_recent: 0.0000008 count: 4 time: 0.000004 ... @@ -759,10 +767,17 @@ and `borders` (for `min` and `max` calls). Each operation section consists of different collectors for success calls and error (both error throw and `nil, err`) returns. `count` is the total requests count since instance start -or stats restart. `latency` is the 0.99 quantile of request execution -time if `metrics` driver used and quantiles enabled, -otherwise `latency` is the total average. -`time` is the total time of requests execution. +or stats restart. `time` is the total time of requests execution. +`latency_average` is `time` / `count`. +`latency_quantile_recent` is the 0.99 quantile of request execution +time for a recent period (see +[`metrics` summary API](https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary)). +It is computed only if `metrics` driver is used and quantiles are +enabled. `latency_quantile_recent` value may be `-nan` if there +wasn't any observations for several ages, see +[tarantool/metrics#303](https://github.com/tarantool/metrics/issues/303). +`latency` is a `latency_quantile_recent` if `metrics` driver is used +and quantiles are enabled, otherwise it's `latency_average`. In [`metrics`](https://www.tarantool.io/en/doc/latest/book/monitoring/) registry statistics are stored as `tnt_crud_stats` metrics diff --git a/crud/stats/local_registry.lua b/crud/stats/local_registry.lua index 1ad7c1b5..10320bc1 100644 --- a/crud/stats/local_registry.lua +++ b/crud/stats/local_registry.lua @@ -121,7 +121,8 @@ function registry.observe(latency, space_name, op, status) collectors.count = collectors.count + 1 collectors.time = collectors.time + latency - collectors.latency = collectors.time / collectors.count + collectors.latency_average = collectors.time / collectors.count + collectors.latency = collectors.latency_average return true end diff --git a/crud/stats/metrics_registry.lua b/crud/stats/metrics_registry.lua index f49524f0..315a649d 100644 --- a/crud/stats/metrics_registry.lua +++ b/crud/stats/metrics_registry.lua @@ -162,50 +162,35 @@ function registry.destroy() return true end ---- Compute `latency` field of an observation. +--- Compute `latency_average` and set `latency` fields of each observation. -- --- If it is a `{ time = ..., count = ... }` observation, --- compute latency as overall average and store it --- inside observation object. +-- `latency` is `latency_average` if quantiles disabled +-- and `latency_quantile` otherwise. -- --- @function compute_obs_latency --- @local --- --- @tab obs --- Objects from `registry_utils` --- `stats.spaces[name][op][status]`. --- If something like `details` collector --- passed, do nothing. --- -local function compute_obs_latency(obs) - if obs.count == nil or obs.time == nil then - return - end - - if obs.count == 0 then - obs.latency = 0 - else - obs.latency = obs.time / obs.count - end -end - ---- Compute `latency` field of each observation. --- --- If quantiles disabled, we need to compute --- latency as overall average from `time` and --- `count` values. --- --- @function compute_latencies +-- @function compute_aggregates -- @local -- -- @tab stats -- Object from registry_utils stats. -- -local function compute_latencies(stats) +local function compute_aggregates(stats) for _, space_stats in pairs(stats.spaces) do for _, op_stats in pairs(space_stats) do for _, obs in pairs(op_stats) do - compute_obs_latency(obs) + -- There are no count in `details`. + if obs.count ~= nil then + if obs.count == 0 then + obs.latency_average = 0 + else + obs.latency_average = obs.time / obs.count + end + + if obs.latency_quantile_recent ~= nil then + obs.latency = obs.latency_quantile_recent + else + obs.latency = obs.latency_average + end + end end end end @@ -250,7 +235,7 @@ function registry.get(space_name) -- metric_name.stats presents only if quantiles enabled. if obs.metric_name == metric_name.stats then if obs.label_pairs.quantile == LATENCY_QUANTILE then - space_stats[op][status].latency = obs.value + space_stats[op][status].latency_quantile_recent = obs.value end elseif obs.metric_name == metric_name.stats_sum then space_stats[op][status].time = obs.value @@ -261,9 +246,7 @@ function registry.get(space_name) :: stats_continue :: end - if not internal.opts.quantiles then - compute_latencies(stats) - end + compute_aggregates(stats) -- Fill select/pairs detail statistics values. for stat_name, metric_name in pairs(metric_name.details) do diff --git a/crud/stats/registry_utils.lua b/crud/stats/registry_utils.lua index 95654461..1227d91e 100644 --- a/crud/stats/registry_utils.lua +++ b/crud/stats/registry_utils.lua @@ -16,7 +16,8 @@ local registry_utils = {} -- Use `require('crud.stats').op` to pick one. -- -- @treturn table Returns collectors for success and error requests. --- Collectors store 'count', 'latency' and 'time' values. Also +-- Collectors store 'count', 'latency', 'latency_average', +-- 'latency_quantile_recent' and 'time' values. Also -- returns additional collectors for select operation. -- function registry_utils.build_collectors(op) @@ -26,11 +27,19 @@ function registry_utils.build_collectors(op) ok = { count = 0, latency = 0, + latency_average = 0, + -- latency_quantile_recent presents only if driver + -- is 'metrics' and quantiles enabled. + latency_quantile_recent = nil, time = 0, }, error = { count = 0, latency = 0, + latency_average = 0, + -- latency_quantile_recent presents only if driver + -- is 'metrics' and quantiles enabled. + latency_quantile_recent = nil, time = 0, }, } diff --git a/test/integration/stats_test.lua b/test/integration/stats_test.lua index 35e0d316..2ddd7c4b 100644 --- a/test/integration/stats_test.lua +++ b/test/integration/stats_test.lua @@ -529,6 +529,24 @@ for name, case in pairs(simple_operation_cases) do t.assert_le(changed_after.latency, ok_latency_max, 'Changed latency has appropriate value') + local ok_average_max = math.max(changed_before.latency_average, after_finish - before_start) + + t.assert_gt(changed_after.latency_average, 0, + 'Changed average has appropriate value') + t.assert_le(changed_after.latency_average, ok_average_max, + 'Changed average has appropriate value') + + if g.params.quantiles == true then + local ok_quantile_max = math.max( + changed_before.latency_quantile_recent or 0, + after_finish - before_start) + + t.assert_gt(changed_after.latency_quantile_recent, 0, + 'Changed quantile has appropriate value') + t.assert_le(changed_after.latency_quantile_recent, ok_quantile_max, + 'Changed quantile has appropriate value') + end + local time_diff = changed_after.time - changed_before.time t.assert_gt(time_diff, 0, 'Total time increase has appropriate value') diff --git a/test/unit/stats_test.lua b/test/unit/stats_test.lua index e7f8b5a0..b8899a92 100644 --- a/test/unit/stats_test.lua +++ b/test/unit/stats_test.lua @@ -170,9 +170,25 @@ for name, case in pairs(observe_cases) do t.assert_equals(changed.count, run_count, 'Count incremented by count of runs') local sleep_time = helpers.simple_functions_params().sleep_time + t.assert_ge(changed.latency, sleep_time, 'Latency has appropriate value') t.assert_le(changed.latency, time_diffs[#time_diffs], 'Latency has appropriate value') + t.assert_ge(changed.latency_average, sleep_time, 'Average has appropriate value') + t.assert_le(changed.latency_average, time_diffs[#time_diffs], 'Average has appropriate value') + + if g.params.quantiles == true then + t.assert_ge( + changed.latency_quantile_recent, + sleep_time, + 'Quantile has appropriate value') + + t.assert_le( + changed.latency_quantile_recent, + time_diffs[#time_diffs], + 'Quantile has appropriate value') + end + t.assert_ge(changed.time, sleep_time * run_count, 'Total time increase has appropriate value') t.assert_le(changed.time, total_time, 'Total time increase has appropriate value') @@ -187,6 +203,7 @@ for name, case in pairs(observe_cases) do { count = 0, latency = 0, + latency_average = 0, time = 0 }, 'Other status collectors initialized after observations' @@ -387,15 +404,31 @@ for name, case in pairs(pairs_cases) do t.assert_equals(changed.count, 1, 'Count incremented by 1') - t.assert_ge(changed.latency, + t.assert_ge( + changed.latency, params.sleep_time * (case.build_sleep_multiplier + case.iterations_expected), 'Latency has appropriate value') t.assert_le(changed.latency, time_diff, 'Latency has appropriate value') + t.assert_ge( + changed.latency_average, + params.sleep_time * (case.build_sleep_multiplier + case.iterations_expected), + 'Average has appropriate value') + t.assert_le(changed.latency_average, time_diff, 'Average has appropriate value') + + if g.params.quantiles == true then + t.assert_ge( + changed.latency_quantile_recent, + params.sleep_time * (case.build_sleep_multiplier + case.iterations_expected), + 'Quantile has appropriate value') + + t.assert_le(changed.latency_quantile_recent, time_diff, 'Quantile has appropriate value') + end + t.assert_ge(changed.time, params.sleep_time * (case.build_sleep_multiplier + case.iterations_expected), 'Total time has appropriate value') - t.assert_le(changed.time, time_diff, 'Total time has appropriate value') + t.assert_le(changed.time, time_diff, 'Total time has appropriate value') -- Other collectors (unchanged_coll: 'error' or 'ok') -- have been initialized and have default values. @@ -407,6 +440,7 @@ for name, case in pairs(pairs_cases) do { count = 0, latency = 0, + latency_average = 0, time = 0 }, 'Other status collectors initialized after observations'