From 7e76fbdccadb167995ff79579aa679dbb35b3b28 Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Thu, 24 Jun 2021 14:10:17 -0500 Subject: [PATCH 01/14] Add a new config and metric for reporting ruler query execution wall time. Signed-off-by: Tyler Reid --- CHANGELOG.md | 1 + docs/configuration/config-file-reference.md | 4 ++++ pkg/ruler/compat.go | 19 +++++++++++++++++-- pkg/ruler/compat_test.go | 3 ++- pkg/ruler/ruler.go | 4 ++++ pkg/ruler/ruler_test.go | 1 + 6 files changed, 29 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 73f08261fb6..5b03773487d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ * Ensure that a ring store is configured using `-alertmanager.sharding-ring.store`, and set the flags relevant to the chosen store type. * Enable the feature using `-alertmanager.sharding-enabled`. * Note the prior addition of a new configuration option `-alertmanager.persist-interval`. This sets the interval between persisting the current alertmanager state (notification log and silences) to object storage. See the [configuration file reference](https://cortexmetrics.io/docs/configuration/configuration-file/#alertmanager_config) for more information. +* [FEATURE] Ruler: Add new `-ruler.enable-query-stats` which when enabled will report the `cortex_ruler_query_seconds_total` metric that tracks the sum of the wall time of executing queries in the ruler. #XXXX * [ENHANCEMENT] Alertmanager: Cleanup persisted state objects from remote storage when a tenant configuration is deleted. #4167 * [ENHANCEMENT] Storage: Added the ability to disable Open Census within GCS client (e.g `-gcs.enable-opencensus=false`). #4219 * [ENHANCEMENT] Etcd: Added username and password to etcd config. #4205 diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 801dc78f46e..8b826a29eb3 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -1616,6 +1616,10 @@ ring: # processing will ignore them instead. Subject to sharding. # CLI flag: -ruler.disabled-tenants [disabled_tenants: | default = ""] + +# Report the wall time for ruler queries to complete as a metric. +# CLI flag: -ruler.enable-query-stats +[enable_query_stats: | default = false] ``` ### `ruler_storage_config` diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index b0cfa3fe019..dd9e2670a00 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -143,10 +143,17 @@ func engineQueryFunc(engine *promql.Engine, q storage.Queryable, overrides Rules } } -func metricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter) rules.QueryFunc { +func metricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter, queryTime *prometheus.CounterVec, userID string) rules.QueryFunc { return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) { queries.Inc() + var startTime time.Time + // If we've been passed a counter vec we want to record the wall time spent executing this request. + if queryTime != nil { + startTime = time.Now() + defer queryTime.WithLabelValues(userID).Add(float64(time.Since(startTime))) + } + result, err := qf(ctx, qs, t) // We rely on TranslateToPromqlApiError to do its job here... it returns nil, if err is nil. @@ -199,12 +206,20 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi Name: "cortex_ruler_queries_failed_total", Help: "Number of failed queries by ruler.", }) + var rulerQuerySeconds *prometheus.CounterVec + if cfg.RulerEnableQueryStats { + rulerQuerySeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_ruler_query_seconds_total", + Help: "Total amount of wall clock time spend processing queries by the ruler.", + }, []string{"user"}) + } return func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager { + return rules.NewManager(&rules.ManagerOptions{ Appendable: NewPusherAppendable(p, userID, overrides, totalWrites, failedWrites), Queryable: q, - QueryFunc: metricsQueryFunc(engineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries), + QueryFunc: metricsQueryFunc(engineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries, rulerQuerySeconds, userID), Context: user.InjectOrgID(ctx, userID), ExternalURL: cfg.ExternalURL.URL, NotifyFunc: SendAlerts(notifier, cfg.ExternalURL.URL.String()), diff --git a/pkg/ruler/compat_test.go b/pkg/ruler/compat_test.go index 82fa045bcdd..e34cafb2ba3 100644 --- a/pkg/ruler/compat_test.go +++ b/pkg/ruler/compat_test.go @@ -226,12 +226,13 @@ func TestMetricsQueryFuncErrors(t *testing.T) { t.Run(name, func(t *testing.T) { queries := prometheus.NewCounter(prometheus.CounterOpts{}) failures := prometheus.NewCounter(prometheus.CounterOpts{}) + queryTime := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user"}) mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { return promql.Vector{}, tc.returnedError } - qf := metricsQueryFunc(mockFunc, queries, failures) + qf := metricsQueryFunc(mockFunc, queries, failures, queryTime, "user") _, err := qf(context.Background(), "test", time.Now()) require.Equal(t, tc.returnedError, err) diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 25061a8d353..0b112bdccba 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -115,6 +115,8 @@ type Config struct { DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"` RingCheckPeriod time.Duration `yaml:"-"` + + RulerEnableQueryStats bool `yaml:"enable_query_stats"` } // Validate config and returns error on failure @@ -173,6 +175,8 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.Var(&cfg.EnabledTenants, "ruler.enabled-tenants", "Comma separated list of tenants whose rules this ruler can evaluate. If specified, only these tenants will be handled by ruler, otherwise this ruler can process rules from all tenants. Subject to sharding.") f.Var(&cfg.DisabledTenants, "ruler.disabled-tenants", "Comma separated list of tenants whose rules this ruler cannot evaluate. If specified, a ruler that would normally pick the specified tenant(s) for processing will ignore them instead. Subject to sharding.") + f.BoolVar(&cfg.RulerEnableQueryStats, "ruler.enable-query-stats", false, "Report the wall time for ruler queries to complete as a metric.") + cfg.RingCheckPeriod = 5 * time.Second } diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index a6e89918b6f..1ef856d5a7c 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -60,6 +60,7 @@ func defaultRulerConfig(store rulestore.RuleStore) (Config, func()) { cfg.Ring.ListenPort = 0 cfg.Ring.InstanceAddr = "localhost" cfg.Ring.InstanceID = "localhost" + cfg.RulerEnableQueryStats = false // Create a cleanup function that will be called at the end of the test cleanup := func() { From 32033a8b4b2390b959dd397fa7ecd3730905803f Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Thu, 24 Jun 2021 15:00:08 -0500 Subject: [PATCH 02/14] Spacing and PR number fixup Signed-off-by: Tyler Reid --- CHANGELOG.md | 2 +- pkg/ruler/compat.go | 2 +- pkg/ruler/compat_test.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cbaa1cc6561..eede6e7aa8f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,7 +22,7 @@ * Ensure that a ring store is configured using `-alertmanager.sharding-ring.store`, and set the flags relevant to the chosen store type. * Enable the feature using `-alertmanager.sharding-enabled`. * Note the prior addition of a new configuration option `-alertmanager.persist-interval`. This sets the interval between persisting the current alertmanager state (notification log and silences) to object storage. See the [configuration file reference](https://cortexmetrics.io/docs/configuration/configuration-file/#alertmanager_config) for more information. -* [FEATURE] Ruler: Add new `-ruler.enable-query-stats` which when enabled will report the `cortex_ruler_query_seconds_total` metric that tracks the sum of the wall time of executing queries in the ruler. #XXXX +* [FEATURE] Ruler: Add new `-ruler.enable-query-stats` which when enabled will report the `cortex_ruler_query_seconds_total` metric that tracks the sum of the wall time of executing queries in the ruler. #4317 * [ENHANCEMENT] Alertmanager: Cleanup persisted state objects from remote storage when a tenant configuration is deleted. #4167 * [ENHANCEMENT] Storage: Added the ability to disable Open Census within GCS client (e.g `-gcs.enable-opencensus=false`). #4219 * [ENHANCEMENT] Etcd: Added username and password to etcd config. #4205 diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index 9696abd4065..040059b3bfb 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -230,4 +230,4 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi ResendDelay: cfg.ResendDelay, }) } -} \ No newline at end of file +} diff --git a/pkg/ruler/compat_test.go b/pkg/ruler/compat_test.go index 42a35196e04..d2b55c491b1 100644 --- a/pkg/ruler/compat_test.go +++ b/pkg/ruler/compat_test.go @@ -241,4 +241,4 @@ func TestMetricsQueryFuncErrors(t *testing.T) { require.Equal(t, tc.expectedFailedQueries, int(testutil.ToFloat64(failures))) }) } -} \ No newline at end of file +} From e4144d9ee27ebadf28d8de9e128a8bb37b114827 Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Thu, 24 Jun 2021 16:42:36 -0500 Subject: [PATCH 03/14] Wrap the defer in a function to make it defer after the return rather than after the if block. Add a unit test to validate we're tracking time correctly. Signed-off-by: Tyler Reid --- pkg/ruler/compat.go | 2 +- pkg/ruler/compat_test.go | 21 +++++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index 040059b3bfb..51a5444e7a1 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -151,7 +151,7 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun // If we've been passed a counter vec we want to record the wall time spent executing this request. if queryTime != nil { startTime = time.Now() - defer queryTime.WithLabelValues(userID).Add(float64(time.Since(startTime))) + defer func() { queryTime.WithLabelValues(userID).Add(float64(time.Since(startTime))) }() } result, err := qf(ctx, qs, t) diff --git a/pkg/ruler/compat_test.go b/pkg/ruler/compat_test.go index d2b55c491b1..5a3d87323f3 100644 --- a/pkg/ruler/compat_test.go +++ b/pkg/ruler/compat_test.go @@ -231,8 +231,7 @@ func TestMetricsQueryFuncErrors(t *testing.T) { mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { return promql.Vector{}, tc.returnedError } - - qf := MetricsQueryFunc(mockFunc, queries, failures, queryTime, "user") + qf := MetricsQueryFunc(mockFunc, queries, failures, queryTime, "userID") _, err := qf(context.Background(), "test", time.Now()) require.Equal(t, tc.returnedError, err) @@ -242,3 +241,21 @@ func TestMetricsQueryFuncErrors(t *testing.T) { }) } } + +func TestMetricsQueryFuncMetrics(t *testing.T) { + queries := prometheus.NewCounter(prometheus.CounterOpts{}) + failures := prometheus.NewCounter(prometheus.CounterOpts{}) + queryTime := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user"}) + + mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { + time.Sleep(1 * time.Millisecond) + return promql.Vector{}, nil + } + qf := MetricsQueryFunc(mockFunc, queries, failures, queryTime, "userID") + + _, _ = qf(context.Background(), "test", time.Now()) + + require.Equal(t, 1, int(testutil.ToFloat64(queries))) + require.Equal(t, 0, int(testutil.ToFloat64(failures))) + require.LessOrEqual(t, float64(1*time.Millisecond), testutil.ToFloat64(queryTime.WithLabelValues("userID"))) +} From 77fa0f01a5bad2e9cd42985bfc533181c9236f3d Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Thu, 24 Jun 2021 16:54:51 -0500 Subject: [PATCH 04/14] Use seconds for our duration rather than nanoseconds Signed-off-by: Tyler Reid --- CHANGELOG.md | 2 +- pkg/ruler/compat.go | 2 +- pkg/ruler/compat_test.go | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eede6e7aa8f..67529eb4414 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,7 +22,7 @@ * Ensure that a ring store is configured using `-alertmanager.sharding-ring.store`, and set the flags relevant to the chosen store type. * Enable the feature using `-alertmanager.sharding-enabled`. * Note the prior addition of a new configuration option `-alertmanager.persist-interval`. This sets the interval between persisting the current alertmanager state (notification log and silences) to object storage. See the [configuration file reference](https://cortexmetrics.io/docs/configuration/configuration-file/#alertmanager_config) for more information. -* [FEATURE] Ruler: Add new `-ruler.enable-query-stats` which when enabled will report the `cortex_ruler_query_seconds_total` metric that tracks the sum of the wall time of executing queries in the ruler. #4317 +* [FEATURE] Ruler: Add new `-ruler.enable-query-stats` which when enabled will report the `cortex_ruler_query_seconds_total` metric that tracks the sum of the wall time of executing queries in the ruler in seconds. #4317 * [ENHANCEMENT] Alertmanager: Cleanup persisted state objects from remote storage when a tenant configuration is deleted. #4167 * [ENHANCEMENT] Storage: Added the ability to disable Open Census within GCS client (e.g `-gcs.enable-opencensus=false`). #4219 * [ENHANCEMENT] Etcd: Added username and password to etcd config. #4205 diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index 51a5444e7a1..c497b6fffbe 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -151,7 +151,7 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun // If we've been passed a counter vec we want to record the wall time spent executing this request. if queryTime != nil { startTime = time.Now() - defer func() { queryTime.WithLabelValues(userID).Add(float64(time.Since(startTime))) }() + defer func() { queryTime.WithLabelValues(userID).Add(time.Since(startTime).Seconds()) }() } result, err := qf(ctx, qs, t) diff --git a/pkg/ruler/compat_test.go b/pkg/ruler/compat_test.go index 5a3d87323f3..30a443d0474 100644 --- a/pkg/ruler/compat_test.go +++ b/pkg/ruler/compat_test.go @@ -248,7 +248,7 @@ func TestMetricsQueryFuncMetrics(t *testing.T) { queryTime := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user"}) mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { - time.Sleep(1 * time.Millisecond) + time.Sleep(1 * time.Second) return promql.Vector{}, nil } qf := MetricsQueryFunc(mockFunc, queries, failures, queryTime, "userID") @@ -257,5 +257,5 @@ func TestMetricsQueryFuncMetrics(t *testing.T) { require.Equal(t, 1, int(testutil.ToFloat64(queries))) require.Equal(t, 0, int(testutil.ToFloat64(failures))) - require.LessOrEqual(t, float64(1*time.Millisecond), testutil.ToFloat64(queryTime.WithLabelValues("userID"))) + require.LessOrEqual(t, float64(1), testutil.ToFloat64(queryTime.WithLabelValues("userID"))) } From 201130759dbca1d6d88a2dcf5801c492dd17afff Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Tue, 29 Jun 2021 09:40:44 -0500 Subject: [PATCH 05/14] Review comment fixes Signed-off-by: Tyler Reid --- CHANGELOG.md | 2 +- pkg/ruler/compat.go | 9 ++++----- pkg/ruler/ruler.go | 4 ++-- pkg/ruler/ruler_test.go | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d7deb7c5e98..5b4c4f8129d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## master / unreleased +* [FEATURE] Ruler: Add new `-ruler.query-stats-enabled` which when enabled will report the `cortex_ruler_query_seconds_total` metric that tracks the sum of the wall time of executing queries in the ruler in seconds. #4317 ## 1.10.0-rc.0 / 2021-06-28 @@ -26,7 +27,6 @@ * Ensure that a ring store is configured using `-alertmanager.sharding-ring.store`, and set the flags relevant to the chosen store type. * Enable the feature using `-alertmanager.sharding-enabled`. * Note the prior addition of a new configuration option `-alertmanager.persist-interval`. This sets the interval between persisting the current alertmanager state (notification log and silences) to object storage. See the [configuration file reference](https://cortexmetrics.io/docs/configuration/configuration-file/#alertmanager_config) for more information. -* [FEATURE] Ruler: Add new `-ruler.enable-query-stats` which when enabled will report the `cortex_ruler_query_seconds_total` metric that tracks the sum of the wall time of executing queries in the ruler in seconds. #4317 * [ENHANCEMENT] Alertmanager: Cleanup persisted state objects from remote storage when a tenant configuration is deleted. #4167 * [ENHANCEMENT] Storage: Added the ability to disable Open Census within GCS client (e.g `-gcs.enable-opencensus=false`). #4219 * [ENHANCEMENT] Etcd: Added username and password to etcd config. #4205 diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index c497b6fffbe..2656dcf357a 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -147,11 +147,10 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) { queries.Inc() - var startTime time.Time // If we've been passed a counter vec we want to record the wall time spent executing this request. if queryTime != nil { - startTime = time.Now() - defer func() { queryTime.WithLabelValues(userID).Add(time.Since(startTime).Seconds()) }() + timer := prometheus.NewTimer(nil) + defer func() { queryTime.WithLabelValues(userID).Add(timer.ObserveDuration().Seconds()) }() } result, err := qf(ctx, qs, t) @@ -207,10 +206,10 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi Help: "Number of failed queries by ruler.", }) var rulerQuerySeconds *prometheus.CounterVec - if cfg.RulerEnableQueryStats { + if cfg.EnableQueryStats { rulerQuerySeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ Name: "cortex_ruler_query_seconds_total", - Help: "Total amount of wall clock time spend processing queries by the ruler.", + Help: "Total amount of wall clock time spent processing queries by the ruler.", }, []string{"user"}) } diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 0b112bdccba..c8aa0ccaf5f 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -116,7 +116,7 @@ type Config struct { RingCheckPeriod time.Duration `yaml:"-"` - RulerEnableQueryStats bool `yaml:"enable_query_stats"` + EnableQueryStats bool `yaml:"query_stats_enabled"` } // Validate config and returns error on failure @@ -175,7 +175,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.Var(&cfg.EnabledTenants, "ruler.enabled-tenants", "Comma separated list of tenants whose rules this ruler can evaluate. If specified, only these tenants will be handled by ruler, otherwise this ruler can process rules from all tenants. Subject to sharding.") f.Var(&cfg.DisabledTenants, "ruler.disabled-tenants", "Comma separated list of tenants whose rules this ruler cannot evaluate. If specified, a ruler that would normally pick the specified tenant(s) for processing will ignore them instead. Subject to sharding.") - f.BoolVar(&cfg.RulerEnableQueryStats, "ruler.enable-query-stats", false, "Report the wall time for ruler queries to complete as a metric.") + f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report the wall time for ruler queries to complete as a metric.") cfg.RingCheckPeriod = 5 * time.Second } diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index 1ef856d5a7c..503474afd8c 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -60,7 +60,7 @@ func defaultRulerConfig(store rulestore.RuleStore) (Config, func()) { cfg.Ring.ListenPort = 0 cfg.Ring.InstanceAddr = "localhost" cfg.Ring.InstanceID = "localhost" - cfg.RulerEnableQueryStats = false + cfg.EnableQueryStats = false // Create a cleanup function that will be called at the end of the test cleanup := func() { From 7db3cfa373e8113f879611b8df343ec276c5818d Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Tue, 29 Jun 2021 10:32:22 -0500 Subject: [PATCH 06/14] Update config flag in the config docs Signed-off-by: Tyler Reid --- docs/configuration/config-file-reference.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 8b826a29eb3..5871a24f9de 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -1618,8 +1618,8 @@ ring: [disabled_tenants: | default = ""] # Report the wall time for ruler queries to complete as a metric. -# CLI flag: -ruler.enable-query-stats -[enable_query_stats: | default = false] +# CLI flag: -ruler.query-stats-enabled +[query_stats_enabled: | default = false] ``` ### `ruler_storage_config` From bb7773328974f51654bdd63f44a88a55531d4fe4 Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Wed, 30 Jun 2021 15:15:24 -0500 Subject: [PATCH 07/14] Pass counter rather than counter vector for metrics query function Signed-off-by: Tyler Reid --- pkg/ruler/compat.go | 11 +++++++---- pkg/ruler/compat_test.go | 4 ++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index 2656dcf357a..4a17ea65c5d 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -143,18 +143,17 @@ func EngineQueryFunc(engine *promql.Engine, q storage.Queryable, overrides Rules } } -func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter, queryTime *prometheus.CounterVec, userID string) rules.QueryFunc { +func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter, queryTime prometheus.Counter) rules.QueryFunc { return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) { queries.Inc() // If we've been passed a counter vec we want to record the wall time spent executing this request. if queryTime != nil { timer := prometheus.NewTimer(nil) - defer func() { queryTime.WithLabelValues(userID).Add(timer.ObserveDuration().Seconds()) }() + defer func() { queryTime.Add(timer.ObserveDuration().Seconds()) }() } result, err := qf(ctx, qs, t) - // We rely on TranslateToPromqlApiError to do its job here... it returns nil, if err is nil. // It returns promql.ErrStorage, if error should be reported back as 500. // Other errors it returns are either for canceled or timed-out queriers (we're not reporting those as failures), @@ -214,11 +213,15 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi } return func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager { + var queryTime prometheus.Counter = nil + if rulerQuerySeconds != nil { + queryTime = rulerQuerySeconds.WithLabelValues(userID) + } return rules.NewManager(&rules.ManagerOptions{ Appendable: NewPusherAppendable(p, userID, overrides, totalWrites, failedWrites), Queryable: q, - QueryFunc: MetricsQueryFunc(EngineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries, rulerQuerySeconds, userID), + QueryFunc: MetricsQueryFunc(EngineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries, queryTime), Context: user.InjectOrgID(ctx, userID), ExternalURL: cfg.ExternalURL.URL, NotifyFunc: SendAlerts(notifier, cfg.ExternalURL.URL.String()), diff --git a/pkg/ruler/compat_test.go b/pkg/ruler/compat_test.go index 30a443d0474..1409f6b8ab0 100644 --- a/pkg/ruler/compat_test.go +++ b/pkg/ruler/compat_test.go @@ -231,7 +231,7 @@ func TestMetricsQueryFuncErrors(t *testing.T) { mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { return promql.Vector{}, tc.returnedError } - qf := MetricsQueryFunc(mockFunc, queries, failures, queryTime, "userID") + qf := MetricsQueryFunc(mockFunc, queries, failures, queryTime.WithLabelValues("userID")) _, err := qf(context.Background(), "test", time.Now()) require.Equal(t, tc.returnedError, err) @@ -251,7 +251,7 @@ func TestMetricsQueryFuncMetrics(t *testing.T) { time.Sleep(1 * time.Second) return promql.Vector{}, nil } - qf := MetricsQueryFunc(mockFunc, queries, failures, queryTime, "userID") + qf := MetricsQueryFunc(mockFunc, queries, failures, queryTime.WithLabelValues("userID")) _, _ = qf(context.Background(), "test", time.Now()) From 259a408c56dd5c9f5458e254e77c62fb3bf8b36c Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Thu, 1 Jul 2021 09:10:02 -0500 Subject: [PATCH 08/14] Fix comment in MetricsQueryFunction Signed-off-by: Tyler Reid --- pkg/ruler/compat.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index 4a17ea65c5d..198c4016431 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -147,7 +147,7 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) { queries.Inc() - // If we've been passed a counter vec we want to record the wall time spent executing this request. + // If we've been passed a counter we want to record the wall time spent executing this request. if queryTime != nil { timer := prometheus.NewTimer(nil) defer func() { queryTime.Add(timer.ObserveDuration().Seconds()) }() From e5894e53e1ac9dd277a43afe6bda833e39671a8a Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Thu, 1 Jul 2021 14:49:44 -0500 Subject: [PATCH 09/14] Move query metric and log to separate function. Add log message for ruler query time. Signed-off-by: Tyler Reid --- pkg/ruler/compat.go | 36 +++++++++++++++++++++++++++--------- pkg/ruler/compat_test.go | 6 +++--- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index 198c4016431..09397f93162 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -6,6 +6,7 @@ import ( "time" "github.com/go-kit/kit/log" + "github.com/go-kit/kit/log/level" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" "github.com/prometheus/prometheus/notifier" @@ -20,6 +21,7 @@ import ( "github.com/cortexproject/cortex/pkg/cortexpb" "github.com/cortexproject/cortex/pkg/querier" + util_log "github.com/cortexproject/cortex/pkg/util/log" ) // Pusher is an ingester server that accepts pushes. @@ -143,16 +145,9 @@ func EngineQueryFunc(engine *promql.Engine, q storage.Queryable, overrides Rules } } -func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter, queryTime prometheus.Counter) rules.QueryFunc { +func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter) rules.QueryFunc { return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) { queries.Inc() - - // If we've been passed a counter we want to record the wall time spent executing this request. - if queryTime != nil { - timer := prometheus.NewTimer(nil) - defer func() { queryTime.Add(timer.ObserveDuration().Seconds()) }() - } - result, err := qf(ctx, qs, t) // We rely on TranslateToPromqlApiError to do its job here... it returns nil, if err is nil. // It returns promql.ErrStorage, if error should be reported back as 500. @@ -168,6 +163,29 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun } } +func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, queryTime prometheus.Counter, logger log.Logger) rules.QueryFunc { + return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) { + // If we've been passed a counter we want to record the wall time spent executing this request. + if queryTime != nil { + timer := prometheus.NewTimer(nil) + defer func() { + querySeconds := timer.ObserveDuration().Seconds() + queryTime.Add(querySeconds) + + // Log ruler query stats. + logMessage := append([]interface{}{ + "msg", "ruler query stats", + "cortex_ruler_query_seconds_total", querySeconds, + }, qs) + level.Info(util_log.WithContext(ctx, logger)).Log(logMessage...) + }() + } + + result, err := qf(ctx, qs, t) + return result, err + } +} + // This interface mimicks rules.Manager API. Interface is used to simplify tests. type RulesManager interface { // Starts rules manager. Blocks until Stop is called. @@ -221,7 +239,7 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi return rules.NewManager(&rules.ManagerOptions{ Appendable: NewPusherAppendable(p, userID, overrides, totalWrites, failedWrites), Queryable: q, - QueryFunc: MetricsQueryFunc(EngineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries, queryTime), + QueryFunc: RecordAndReportRuleQueryMetrics(MetricsQueryFunc(EngineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries), queryTime, logger), Context: user.InjectOrgID(ctx, userID), ExternalURL: cfg.ExternalURL.URL, NotifyFunc: SendAlerts(notifier, cfg.ExternalURL.URL.String()), diff --git a/pkg/ruler/compat_test.go b/pkg/ruler/compat_test.go index 1409f6b8ab0..6aba21cfa0b 100644 --- a/pkg/ruler/compat_test.go +++ b/pkg/ruler/compat_test.go @@ -8,6 +8,7 @@ import ( "testing" "time" + "github.com/go-kit/kit/log" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/common/model" @@ -226,12 +227,11 @@ func TestMetricsQueryFuncErrors(t *testing.T) { t.Run(name, func(t *testing.T) { queries := prometheus.NewCounter(prometheus.CounterOpts{}) failures := prometheus.NewCounter(prometheus.CounterOpts{}) - queryTime := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user"}) mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { return promql.Vector{}, tc.returnedError } - qf := MetricsQueryFunc(mockFunc, queries, failures, queryTime.WithLabelValues("userID")) + qf := MetricsQueryFunc(mockFunc, queries, failures) _, err := qf(context.Background(), "test", time.Now()) require.Equal(t, tc.returnedError, err) @@ -251,7 +251,7 @@ func TestMetricsQueryFuncMetrics(t *testing.T) { time.Sleep(1 * time.Second) return promql.Vector{}, nil } - qf := MetricsQueryFunc(mockFunc, queries, failures, queryTime.WithLabelValues("userID")) + qf := RecordAndReportRuleQueryMetrics(MetricsQueryFunc(mockFunc, queries, failures), queryTime.WithLabelValues("userID"), log.NewNopLogger()) _, _ = qf(context.Background(), "test", time.Now()) From 33bb72147e1cca0ade41fffce050985238be427e Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Thu, 1 Jul 2021 15:04:12 -0500 Subject: [PATCH 10/14] Update config file and change log to show this a per user metric Signed-off-by: Tyler Reid --- CHANGELOG.md | 2 +- docs/configuration/config-file-reference.md | 3 ++- pkg/ruler/ruler.go | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b4c4f8129d..762f5574d4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ # Changelog ## master / unreleased -* [FEATURE] Ruler: Add new `-ruler.query-stats-enabled` which when enabled will report the `cortex_ruler_query_seconds_total` metric that tracks the sum of the wall time of executing queries in the ruler in seconds. #4317 +* [FEATURE] Ruler: Add new `-ruler.query-stats-enabled` which when enabled will report the `cortex_ruler_query_seconds_total` as a per-user metric that tracks the sum of the wall time of executing queries in the ruler in seconds. #4317 ## 1.10.0-rc.0 / 2021-06-28 diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 5871a24f9de..eeb4f773ea6 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -1617,7 +1617,8 @@ ring: # CLI flag: -ruler.disabled-tenants [disabled_tenants: | default = ""] -# Report the wall time for ruler queries to complete as a metric. +# Report the wall time for ruler queries to complete as a per user metric and as +# an info level log message. # CLI flag: -ruler.query-stats-enabled [query_stats_enabled: | default = false] ``` diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index c8aa0ccaf5f..57b51ed103c 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -175,7 +175,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.Var(&cfg.EnabledTenants, "ruler.enabled-tenants", "Comma separated list of tenants whose rules this ruler can evaluate. If specified, only these tenants will be handled by ruler, otherwise this ruler can process rules from all tenants. Subject to sharding.") f.Var(&cfg.DisabledTenants, "ruler.disabled-tenants", "Comma separated list of tenants whose rules this ruler cannot evaluate. If specified, a ruler that would normally pick the specified tenant(s) for processing will ignore them instead. Subject to sharding.") - f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report the wall time for ruler queries to complete as a metric.") + f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report the wall time for ruler queries to complete as a per user metric and as an info level log message.") cfg.RingCheckPeriod = 5 * time.Second } From db2966c229f29b09354948b9317352645993c214 Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Tue, 6 Jul 2021 18:12:22 -0500 Subject: [PATCH 11/14] code review fixes Signed-off-by: Tyler Reid --- pkg/ruler/compat.go | 30 ++++++++++++++++-------------- pkg/ruler/compat_test.go | 4 ++-- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index 09397f93162..fc46be93520 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -164,22 +164,24 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun } func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, queryTime prometheus.Counter, logger log.Logger) rules.QueryFunc { + if queryTime == nil { + return qf + } + return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) { // If we've been passed a counter we want to record the wall time spent executing this request. - if queryTime != nil { - timer := prometheus.NewTimer(nil) - defer func() { - querySeconds := timer.ObserveDuration().Seconds() - queryTime.Add(querySeconds) - - // Log ruler query stats. - logMessage := append([]interface{}{ - "msg", "ruler query stats", - "cortex_ruler_query_seconds_total", querySeconds, - }, qs) - level.Info(util_log.WithContext(ctx, logger)).Log(logMessage...) - }() - } + timer := prometheus.NewTimer(nil) + defer func() { + querySeconds := timer.ObserveDuration().Seconds() + queryTime.Add(querySeconds) + + // Log ruler query stats. + logMessage := append([]interface{}{ + "msg", "ruler query stats", + "cortex_ruler_query_seconds_total", querySeconds, + }, qs) + level.Info(util_log.WithContext(ctx, logger)).Log(logMessage...) + }() result, err := qf(ctx, qs, t) return result, err diff --git a/pkg/ruler/compat_test.go b/pkg/ruler/compat_test.go index 6aba21cfa0b..0d667012d3d 100644 --- a/pkg/ruler/compat_test.go +++ b/pkg/ruler/compat_test.go @@ -242,7 +242,7 @@ func TestMetricsQueryFuncErrors(t *testing.T) { } } -func TestMetricsQueryFuncMetrics(t *testing.T) { +func TestRecordAndReportRuleQueryMetrics(t *testing.T) { queries := prometheus.NewCounter(prometheus.CounterOpts{}) failures := prometheus.NewCounter(prometheus.CounterOpts{}) queryTime := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user"}) @@ -257,5 +257,5 @@ func TestMetricsQueryFuncMetrics(t *testing.T) { require.Equal(t, 1, int(testutil.ToFloat64(queries))) require.Equal(t, 0, int(testutil.ToFloat64(failures))) - require.LessOrEqual(t, float64(1), testutil.ToFloat64(queryTime.WithLabelValues("userID"))) + require.GreaterOrEqual(t, testutil.ToFloat64(queryTime.WithLabelValues("userID")), float64(1)) } From 591548964b40b4f00eb75303583d69857dd07c8f Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Wed, 7 Jul 2021 08:37:12 -0500 Subject: [PATCH 12/14] update log message for ruler query metrics Signed-off-by: Tyler Reid --- pkg/ruler/compat.go | 6 ++++-- pkg/ruler/compat_test.go | 7 +------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index fc46be93520..d1a0e51b6dd 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -177,9 +177,11 @@ func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, queryTime prometheus.Co // Log ruler query stats. logMessage := append([]interface{}{ - "msg", "ruler query stats", + "msg", "query stats", + "component", "ruler", "cortex_ruler_query_seconds_total", querySeconds, - }, qs) + "query", qs, + }) level.Info(util_log.WithContext(ctx, logger)).Log(logMessage...) }() diff --git a/pkg/ruler/compat_test.go b/pkg/ruler/compat_test.go index 0d667012d3d..dfcb251803a 100644 --- a/pkg/ruler/compat_test.go +++ b/pkg/ruler/compat_test.go @@ -243,19 +243,14 @@ func TestMetricsQueryFuncErrors(t *testing.T) { } func TestRecordAndReportRuleQueryMetrics(t *testing.T) { - queries := prometheus.NewCounter(prometheus.CounterOpts{}) - failures := prometheus.NewCounter(prometheus.CounterOpts{}) queryTime := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user"}) mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { time.Sleep(1 * time.Second) return promql.Vector{}, nil } - qf := RecordAndReportRuleQueryMetrics(MetricsQueryFunc(mockFunc, queries, failures), queryTime.WithLabelValues("userID"), log.NewNopLogger()) - + qf := RecordAndReportRuleQueryMetrics(mockFunc, queryTime.WithLabelValues("userID"), log.NewNopLogger()) _, _ = qf(context.Background(), "test", time.Now()) - require.Equal(t, 1, int(testutil.ToFloat64(queries))) - require.Equal(t, 0, int(testutil.ToFloat64(failures))) require.GreaterOrEqual(t, testutil.ToFloat64(queryTime.WithLabelValues("userID")), float64(1)) } From e487e220b1156d9883b4499625c62181fbfb7603 Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Wed, 7 Jul 2021 08:51:45 -0500 Subject: [PATCH 13/14] Remove append and just use the array for key values in the log messag Signed-off-by: Tyler Reid --- pkg/ruler/compat.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index d1a0e51b6dd..c95cfcd9446 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -176,12 +176,12 @@ func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, queryTime prometheus.Co queryTime.Add(querySeconds) // Log ruler query stats. - logMessage := append([]interface{}{ + logMessage := []interface{}{ "msg", "query stats", "component", "ruler", "cortex_ruler_query_seconds_total", querySeconds, "query", qs, - }) + } level.Info(util_log.WithContext(ctx, logger)).Log(logMessage...) }() From 97b4e4cb06005939b7df8e1587d6584f8baa2f84 Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Thu, 8 Jul 2021 08:21:08 -0500 Subject: [PATCH 14/14] Add query-frontend component to front end log message Signed-off-by: Tyler Reid --- pkg/frontend/transport/handler.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/frontend/transport/handler.go b/pkg/frontend/transport/handler.go index 8fcf8e9630e..435a0227481 100644 --- a/pkg/frontend/transport/handler.go +++ b/pkg/frontend/transport/handler.go @@ -173,6 +173,7 @@ func (f *Handler) reportQueryStats(r *http.Request, queryString url.Values, quer // Log stats. logMessage := append([]interface{}{ "msg", "query stats", + "component", "query-frontend", "method", r.Method, "path", r.URL.Path, "response_time", queryResponseTime,