tarantool · Totktonada · May 20, 2022 · May 17, 2022 · May 17, 2022 · May 17, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ## [Unreleased]
 
 ### Added
+* Make metrics quantile collector age params configurable (#286).
+* Add separate `latency_average` and `latency_quantile_recent`
+  fields to `crud.stats()` output (#286).
 
 ### Changed
 

diff --git a/README.md b/README.md
@@ -721,23 +721,31 @@ crud.stats()
     my_space:
       insert:
         ok:
-          latency: 0.002
+          latency: 0.0015
+          latency_average: 0.002
+          latency_quantile_recent: 0.0015
           count: 19800
           time: 39.6
         error:
-          latency: 0.000001
+          latency: 0.0000008
+          latency_average: 0.000001
+          latency_quantile_recent: 0.0000008
           count: 4
           time: 0.000004
 ...
 crud.stats('my_space')
 ---
 - insert:
     ok:
-      latency: 0.002
+      latency: 0.0015
+      latency_average: 0.002
+      latency_quantile_recent: 0.0015
       count: 19800
       time: 39.6
     error:
-      latency: 0.000001
+      latency: 0.0000008
+      latency_average: 0.000001
+      latency_quantile_recent: 0.0000008
       count: 4
       time: 0.000004
 ...
@@ -759,10 +767,17 @@ and `borders` (for `min` and `max` calls).
 Each operation section consists of different collectors
 for success calls and error (both error throw and `nil, err`)
 returns. `count` is the total requests count since instance start
-or stats restart. `latency` is the 0.99 quantile of request execution
-time if `metrics` driver used and quantiles enabled,
-otherwise `latency` is the total average.
-`time` is the total time of requests execution.
+or stats restart. `time` is the total time of request execution.
+`latency_average` is `time` / `count`.
+`latency_quantile_recent` is the 0.99 quantile of request execution
+time for a recent period (see
+[`metrics` summary API](https://www.tarantool.io/en/doc/latest/book/monitoring/api_reference/#summary)).
+It is computed only if `metrics` driver is used and quantiles are
+enabled. `latency_quantile_recent` value may be `-nan` if there
+weren't any observations for several ages, see
+[tarantool/metrics#303](https://github.com/tarantool/metrics/issues/303).
+`latency` is `latency_quantile_recent` if `metrics` driver is used
+and quantiles are enabled, otherwise it's `latency_average`.
 
 In [`metrics`](https://www.tarantool.io/en/doc/latest/book/monitoring/)
 registry statistics are stored as `tnt_crud_stats` metrics
@@ -797,7 +812,17 @@ crud.cfg{stats_quantile_tolerated_error = 1e-4}
 ```
 See [tarantool/metrics#189](https://github.com/tarantool/metrics/issues/189) for
 details about the issue.
-
+You can also configure quantile `age_buckets_count` (default: 2) and
+`max_age_time` (in seconds, default: 60):
+```lua
+crud.cfg{
+    stats_quantile_age_buckets_count = 3,
+    stats_quantile_max_age_time = 30,
+}
+```
+See [`metrics` summary API](https://www.tarantool.io/en/doc/latest/book/monitoring/api_reference/#summary)
+for details. These parameters can be used to smooth the time window movement
+or reduce the number of `-nan` gaps for low request frequency applications.
 
 `select` section additionally contains `details` collectors.
 ```lua

diff --git a/crud/cfg.lua b/crud/cfg.lua
@@ -29,6 +29,14 @@ local function set_defaults_if_empty(cfg)
         cfg.stats_quantile_tolerated_error = stats.DEFAULT_QUANTILE_TOLERATED_ERROR
     end
 
+    if cfg.stats_quantile_age_buckets_count == nil then
+        cfg.stats_quantile_age_buckets_count = stats.DEFAULT_QUANTILE_AGE_BUCKET_COUNT
+    end
+
+    if cfg.stats_quantile_max_age_time == nil then
+        cfg.stats_quantile_max_age_time = stats.DEFAULT_QUANTILE_MAX_AGE_TIME
+    end
+
     return cfg
 end
 
@@ -38,7 +46,9 @@ local function configure_stats(cfg, opts)
     if  (opts.stats == nil)
     and (opts.stats_driver == nil)
     and (opts.stats_quantiles == nil)
-    and (opts.stats_quantile_tolerated_error == nil) then
+    and (opts.stats_quantile_tolerated_error == nil)
+    and (opts.stats_quantile_age_buckets_count == nil)
+    and (opts.stats_quantile_max_age_time == nil) then
         return
     end
 
@@ -58,11 +68,21 @@ local function configure_stats(cfg, opts)
         opts.stats_quantile_tolerated_error = cfg.stats_quantile_tolerated_error
     end
 
+    if opts.stats_quantile_age_buckets_count == nil then
+        opts.stats_quantile_age_buckets_count = cfg.stats_quantile_age_buckets_count
+    end
+
+    if opts.stats_quantile_max_age_time == nil then
+        opts.stats_quantile_max_age_time = cfg.stats_quantile_max_age_time
+    end
+
     if opts.stats == true then
         stats.enable{
             driver = opts.stats_driver,
             quantiles = opts.stats_quantiles,
             quantile_tolerated_error = opts.stats_quantile_tolerated_error,
+            quantile_age_buckets_count = opts.stats_quantile_age_buckets_count,
+            quantile_max_age_time = opts.stats_quantile_max_age_time,
         }
     else
         stats.disable()
@@ -72,6 +92,8 @@ local function configure_stats(cfg, opts)
     rawset(cfg, 'stats_driver', opts.stats_driver)
     rawset(cfg, 'stats_quantiles', opts.stats_quantiles)
     rawset(cfg, 'stats_quantile_tolerated_error', opts.stats_quantile_tolerated_error)
+    rawset(cfg, 'stats_quantile_age_buckets_count', opts.stats_quantile_age_buckets_count)
+    rawset(cfg, 'stats_quantile_max_age_time', opts.stats_quantile_max_age_time)
 end
 
 --- Configure CRUD module.
@@ -103,10 +125,25 @@ end
 -- @number[opt=1e-3] opts.stats_quantile_tolerated_error
 --  See tarantool/metrics summary API for details:
 --  https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
---  If quantile value is -Inf, try to decrease quantile tolerance.
+--  If quantile value is -Inf, try to decrease quantile tolerated error.
 --  See https://github.com/tarantool/metrics/issues/189 for issue details.
 --  Decreasing the value increases computational load.
 --
+-- @number[opt=2] opts.stats_quantile_age_buckets_count
+--  Count of summary quantile buckets.
+--  See tarantool/metrics summary API for details:
+--  https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
+--  Increasing the value smooths the time window movement,
+--  but consumes additional memory and CPU.
+--
+-- @number[opt=60] opts.stats_quantile_max_age_time
+--  Duration of each bucket's lifetime in seconds.
+--  See tarantool/metrics summary API for details:
+--  https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
+--  Smaller bucket lifetime results in smaller time window for quantiles,
+--  but more CPU is spent on bucket rotation. If your application has low request
+--  frequency, increase the value to reduce the number of `-nan` gaps in quantile values.
+--
 -- @return Configuration table.
 --
 local function __call(self, opts)
@@ -115,6 +152,8 @@ local function __call(self, opts)
         stats_driver = '?string',
         stats_quantiles = '?boolean',
         stats_quantile_tolerated_error = '?number',
+        stats_quantile_age_buckets_count = '?number',
+        stats_quantile_max_age_time = '?number',
     })
 
     opts = table.deepcopy(opts) or {}

diff --git a/crud/stats/init.lua b/crud/stats/init.lua
@@ -87,16 +87,33 @@ end
 -- @number[opt=1e-3] opts.quantile_tolerated_error
 --  See tarantool/metrics summary API for details:
 --  https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
---  If quantile value is -Inf, try to decrease quantile tolerance.
+--  If quantile value is -Inf, try to decrease quantile tolerated error.
 --  See https://github.com/tarantool/metrics/issues/189 for issue details.
 --
+-- @number[opt=2] opts.quantile_age_buckets_count
+--  Count of summary quantile buckets.
+--  See tarantool/metrics summary API for details:
+--  https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
+--  Increasing the value smooths the time window movement,
+--  but consumes additional memory and CPU.
+--
+-- @number[opt=60] opts.quantile_max_age_time
+--  Duration of each bucket's lifetime in seconds.
+--  See tarantool/metrics summary API for details:
+--  https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
+--  Smaller bucket lifetime results in smaller time window for quantiles,
+--  but more CPU is spent on bucket rotation. If your application has low request
+--  frequency, increase the value to reduce the number of `-nan` gaps in quantile values.
+--
 -- @treturn boolean Returns `true`.
 --
 function stats.enable(opts)
     checks({
         driver = '?string',
         quantiles = '?boolean',
         quantile_tolerated_error = '?number',
+        quantile_age_buckets_count = '?number',
+        quantile_max_age_time = '?number',
     })
 
     StatsError:assert(
@@ -122,10 +139,20 @@ function stats.enable(opts)
         opts.quantile_tolerated_error = stats.DEFAULT_QUANTILE_TOLERATED_ERROR
     end
 
+    if opts.quantile_age_buckets_count == nil then
+        opts.quantile_age_buckets_count = stats.DEFAULT_QUANTILE_AGE_BUCKET_COUNT
+    end
+
+    if opts.quantile_max_age_time == nil then
+        opts.quantile_max_age_time = stats.DEFAULT_QUANTILE_MAX_AGE_TIME
+    end
+
     -- Do not reinit if called with same options.
     if internal.driver == opts.driver
     and internal.quantiles == opts.quantiles
-    and internal.quantile_tolerated_error == opts.quantile_tolerated_error then
+    and internal.quantile_tolerated_error == opts.quantile_tolerated_error
+    and internal.quantile_age_buckets_count == opts.quantile_age_buckets_count
+    and internal.quantile_max_age_time == opts.quantile_max_age_time then
         return true
     end
 
@@ -136,11 +163,15 @@ function stats.enable(opts)
 
     internal:get_registry().init{
         quantiles = opts.quantiles,
-        quantile_tolerated_error = opts.quantile_tolerated_error
+        quantile_tolerated_error = opts.quantile_tolerated_error,
+        quantile_age_buckets_count = opts.quantile_age_buckets_count,
+        quantile_max_age_time = opts.quantile_max_age_time,
     }
 
     internal.quantiles = opts.quantiles
     internal.quantile_tolerated_error = opts.quantile_tolerated_error
+    internal.quantile_age_buckets_count = opts.quantile_age_buckets_count
+    internal.quantile_max_age_time = opts.quantile_max_age_time
 
     return true
 end
@@ -162,7 +193,9 @@ function stats.reset()
     internal:get_registry().destroy()
     internal:get_registry().init{
         quantiles = internal.quantiles,
-        quantile_tolerated_error = internal.quantile_tolerated_error
+        quantile_tolerated_error = internal.quantile_tolerated_error,
+        quantile_age_buckets_count = internal.quantile_age_buckets_count,
+        quantile_max_age_time = internal.quantile_max_age_time,
     }
 
     return true
@@ -184,6 +217,9 @@ function stats.disable()
     internal:get_registry().destroy()
     internal.driver = nil
     internal.quantiles = nil
+    internal.quantile_tolerated_error = nil
+    internal.quantile_age_buckets_count = nil
+    internal.quantile_max_age_time = nil
 
     return true
 end
@@ -495,4 +531,10 @@ stats.internal = internal
 --- Default metrics quantile precision.
 stats.DEFAULT_QUANTILE_TOLERATED_ERROR = 1e-3
 
+--- Default metrics quantile bucket count.
+stats.DEFAULT_QUANTILE_AGE_BUCKET_COUNT = 2
+
+--- Default metrics quantile bucket lifetime.
+stats.DEFAULT_QUANTILE_MAX_AGE_TIME = 60
+
 return stats
diff --git a/crud/stats/local_registry.lua b/crud/stats/local_registry.lua
@@ -28,12 +28,20 @@ local StatsLocalError = errors.new_class('StatsLocalError', {capture_stack = fal
 -- @number opts.quantile_tolerated_error
 --  Quantiles is not supported for local, so the value is ignored.
 --
+-- @number opts.quantile_age_buckets_count
+--  Quantiles is not supported for local, so the value is ignored.
+--
+-- @number opts.quantile_max_age_time
+--  Quantiles is not supported for local, so the value is ignored.
+--
 -- @treturn boolean Returns `true`.
 --
 function registry.init(opts)
     dev_checks({
         quantiles = 'boolean',
         quantile_tolerated_error = 'number',
+        quantile_age_buckets_count = 'number',
+        quantile_max_age_time = 'number',
     })
 
     StatsLocalError:assert(opts.quantiles == false,
@@ -113,7 +121,8 @@ function registry.observe(latency, space_name, op, status)
 
     collectors.count = collectors.count + 1
     collectors.time = collectors.time + latency
-    collectors.latency = collectors.time / collectors.count
+    collectors.latency_average = collectors.time / collectors.count
+    collectors.latency = collectors.latency_average
 
     return true
 end