From 2a685344c9acba5d6f0cb3511fb64ccdee3f0b07 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 24 Jun 2020 21:05:57 +0100 Subject: [PATCH 1/4] Instrument the Ruler Notifier The ruler notifier is not instrumented at the moment. This commit wraps around the ruler registry with the userID as a label and `cortex_` as a prefix and passes it to the notifier options. Signed-off-by: gotjosh --- pkg/ruler/ruler.go | 3 +++ pkg/ruler/ruler_test.go | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 6bc0fb3ee60..341be3a36ea 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -330,8 +330,11 @@ func (r *Ruler) getOrCreateNotifier(userID string) (*notifier.Manager, error) { return n.notifier, nil } + reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, r.registry) + reg = prometheus.WrapRegistererWithPrefix("cortex_", reg) n = newRulerNotifier(&notifier.Options{ QueueCapacity: r.cfg.NotificationQueueCapacity, + Registerer: reg, Do: func(ctx context.Context, client *http.Client, req *http.Request) (*http.Response, error) { // Note: The passed-in context comes from the Prometheus notifier // and does *not* contain the userID. So it needs to be added to the context diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index 27e5f637e4a..9ef62260e25 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -6,6 +6,7 @@ import ( "net/http" "net/http/httptest" "os" + "strings" "sync" "testing" "time" @@ -13,6 +14,7 @@ import ( "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" "github.com/prometheus/client_golang/prometheus" + promTU "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/prometheus/notifier" "github.com/prometheus/prometheus/pkg/labels" "github.com/prometheus/prometheus/promql" @@ -136,6 +138,13 @@ func TestNotifierSendsUserIDHeader(t *testing.T) { }) wg.Wait() + + // Ensure we have metrics in the notifier. 
+ assert.NoError(t, promTU.GatherAndCompare(r.registry.(*prometheus.Registry), strings.NewReader(` + # HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager. + # TYPE cortex_prometheus_notifications_dropped_total counter + cortex_prometheus_notifications_dropped_total{user="1"} 0 + `), "cortex_prometheus_notifications_dropped_total")) } func TestRuler_Rules(t *testing.T) { From 538c5b0a3a0f7514f6d6d75035519f785b3aefd2 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 24 Jun 2020 21:11:19 +0100 Subject: [PATCH 2/4] Add Changelog entry Signed-off-by: gotjosh --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 32da536d097..2fa608bf23d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,14 @@ ## master / unreleased +* [ENHANCEMENT Ruler: Added the following metrics: #2786 + * `cortex_prometheus_notifications_latency_seconds` + * `cortex_prometheus_notifications_errors_total` + * `cortex_prometheus_notifications_sent_total` + * `cortex_prometheus_notifications_dropped_total` + * `cortex_prometheus_notifications_queue_length` + * `cortex_prometheus_notifications_queue_capacity` + * `cortex_prometheus_notifications_prometheus_notifications_alertmanagers_discovered` * [FEATURE] Introduced `ruler.for-outage-tolerance`, Max time to tolerate outage for restoring "for" state of alert. #2783 * [FEATURE] Introduced `ruler.for-grace-period`, Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period. #2783 * [FEATURE] Introduced `ruler.for-resend-delay`, Minimum amount of time to wait before resending an alert to Alertmanager. 
#2783 From 04147c5c36f00fa42524ef2cf7337e4709b2a05f Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 25 Jun 2020 10:45:21 +0100 Subject: [PATCH 3/4] Rename prom_testutil Signed-off-by: gotjosh --- pkg/ruler/ruler_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index 9ef62260e25..d69b58be7de 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -14,7 +14,7 @@ import ( "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" "github.com/prometheus/client_golang/prometheus" - promTU "github.com/prometheus/client_golang/prometheus/testutil" + prom_testutil "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/prometheus/notifier" "github.com/prometheus/prometheus/pkg/labels" "github.com/prometheus/prometheus/promql" @@ -140,7 +140,7 @@ func TestNotifierSendsUserIDHeader(t *testing.T) { wg.Wait() // Ensure we have metrics in the notifier. - assert.NoError(t, promTU.GatherAndCompare(r.registry.(*prometheus.Registry), strings.NewReader(` + assert.NoError(t, prom_testutil.GatherAndCompare(r.registry.(*prometheus.Registry), strings.NewReader(` # HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager. 
# TYPE cortex_prometheus_notifications_dropped_total counter cortex_prometheus_notifications_dropped_total{user="1"} 0 From 0e6b71c5004f0d056d0a866c2d6dc9534222198b Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 25 Jun 2020 10:58:38 +0100 Subject: [PATCH 4/4] Fix the changelog Signed-off-by: gotjosh --- CHANGELOG.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fa608bf23d..e4bd20aa3d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,14 +2,6 @@ ## master / unreleased -* [ENHANCEMENT Ruler: Added the following metrics: #2786 - * `cortex_prometheus_notifications_latency_seconds` - * `cortex_prometheus_notifications_errors_total` - * `cortex_prometheus_notifications_sent_total` - * `cortex_prometheus_notifications_dropped_total` - * `cortex_prometheus_notifications_queue_length` - * `cortex_prometheus_notifications_queue_capacity` - * `cortex_prometheus_notifications_prometheus_notifications_alertmanagers_discovered` * [FEATURE] Introduced `ruler.for-outage-tolerance`, Max time to tolerate outage for restoring "for" state of alert. #2783 * [FEATURE] Introduced `ruler.for-grace-period`, Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period. #2783 * [FEATURE] Introduced `ruler.for-resend-delay`, Minimum amount of time to wait before resending an alert to Alertmanager. #2783 @@ -68,6 +60,14 @@ * [FEATURE] TLS config options added for GRPC clients in Querier (Query-frontend client & Ingester client), Ruler, Store Gateway, as well as HTTP client in Config store client. #2502 * [FEATURE] The flag `-frontend.max-cache-freshness` is now supported within the limits overrides, to specify per-tenant max cache freshness values. The corresponding YAML config parameter has been changed from `results_cache.max_freshness` to `limits_config.max_cache_freshness`. 
The legacy YAML config parameter (`results_cache.max_freshness`) will continue to be supported till Cortex release `v1.4.0`. #2609 * [FEATURE] Experimental gRPC Store: Added support to 3rd parties index and chunk stores using gRPC client/server plugin mechanism. #2220 +* [ENHANCEMENT] Ruler: Added the following metrics: #2786 + * `cortex_prometheus_notifications_latency_seconds` + * `cortex_prometheus_notifications_errors_total` + * `cortex_prometheus_notifications_sent_total` + * `cortex_prometheus_notifications_dropped_total` + * `cortex_prometheus_notifications_queue_length` + * `cortex_prometheus_notifications_queue_capacity` + * `cortex_prometheus_notifications_alertmanagers_discovered` * [ENHANCEMENT] Propagate GOPROXY value when building `build-image`. This is to help the builders building the code in a Network where default Go proxy is not accessible (e.g. when behind some corporate VPN). #2741 * [ENHANCEMENT] Querier: Added metric `cortex_querier_request_duration_seconds` for all requests to the querier. #2708 * [ENHANCEMENT] Experimental TSDB: added the following metrics to the ingester: #2580 #2583 #2589 #2654