diff --git a/CHANGELOG.md b/CHANGELOG.md index 734a2241218..97a35d8cb49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ * [FEATURE] Introduced `ruler.for-grace-period`, Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period. #2783 * [FEATURE] Introduced `ruler.for-resend-delay`, Minimum amount of time to wait before resending an alert to Alertmanager. #2783 * [ENHANCEMENT] Experimental: Querier can now optionally query secondary store. This is specified by using `-querier.second-store-engine` option, with values `chunks` or `tsdb`. Standard configuration options for this store are used. Additionally, this querying can be configured to happen only for queries that need data older than `-querier.use-second-store-before-time`. Default value of zero will always query secondary store. #2747 +* [ENHANCEMENT] Ruler: Added the following metrics: #2786 + * `cortex_prometheus_notifications_latency_seconds` + * `cortex_prometheus_notifications_errors_total` + * `cortex_prometheus_notifications_sent_total` + * `cortex_prometheus_notifications_dropped_total` + * `cortex_prometheus_notifications_queue_length` + * `cortex_prometheus_notifications_queue_capacity` + * `cortex_prometheus_notifications_alertmanagers_discovered` * [BUGFIX] Fixed a bug in the index intersect code causing storage to return more chunks/series than required. #2796 * [BUGFIX] Fixed the number of reported keys in the background cache queue. #2764 * [BUGFIX] Fix race in processing of headers in sharded queries. 
#2762 diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 6bc0fb3ee60..341be3a36ea 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -330,8 +330,11 @@ func (r *Ruler) getOrCreateNotifier(userID string) (*notifier.Manager, error) { return n.notifier, nil } + reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, r.registry) + reg = prometheus.WrapRegistererWithPrefix("cortex_", reg) n = newRulerNotifier(&notifier.Options{ QueueCapacity: r.cfg.NotificationQueueCapacity, + Registerer: reg, Do: func(ctx context.Context, client *http.Client, req *http.Request) (*http.Response, error) { // Note: The passed-in context comes from the Prometheus notifier // and does *not* contain the userID. So it needs to be added to the context diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index 27e5f637e4a..d69b58be7de 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -6,6 +6,7 @@ import ( "net/http" "net/http/httptest" "os" + "strings" "sync" "testing" "time" @@ -13,6 +14,7 @@ import ( "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" "github.com/prometheus/client_golang/prometheus" + prom_testutil "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/prometheus/notifier" "github.com/prometheus/prometheus/pkg/labels" "github.com/prometheus/prometheus/promql" @@ -136,6 +138,13 @@ func TestNotifierSendsUserIDHeader(t *testing.T) { }) wg.Wait() + + // Ensure we have metrics in the notifier. + assert.NoError(t, prom_testutil.GatherAndCompare(r.registry.(*prometheus.Registry), strings.NewReader(` + # HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager. + # TYPE cortex_prometheus_notifications_dropped_total counter + cortex_prometheus_notifications_dropped_total{user="1"} 0 + `), "cortex_prometheus_notifications_dropped_total")) } func TestRuler_Rules(t *testing.T) {