From fc4126f67243f5ffaf20628c96e18a7afdbaffdb Mon Sep 17 00:00:00 2001 From: Nick Pillitteri Date: Thu, 3 Jun 2021 13:36:15 -0400 Subject: [PATCH 1/3] Enable active series metrics in the ingester by default This change calculates and exports the `cortex_ingester_active_series` metric by default. Up to this point, the metric was disabled by default since calculating it consumes some amount of memory. The original PR (#3153) estimated for 1M active series at least 40MB and up to another 200MB depending on our luck reusing labels from the ref cache. We (Grafana) have been running with this setting enabled on our ingesters for some time and the resource usage doesn't appear to be significant. This feature appears to add between 1.2 - 1.6% in memory usage when enabled: ~140MB out of a total of ~10GB of memory used per ingester. The ingesters I measured this on * Have multiple tenants running production workloads * Have about 1.3M active series each * Have about a 10GB working set (as measured by `kubectl top` and exported k8s metrics) Based on this and the utility of the metric itself, I'd like to enable it by default. Screenshots of the pprof heap output attached. Signed-off-by: Nick Pillitteri --- CHANGELOG.md | 1 + docs/configuration/config-file-reference.md | 2 +- pkg/ingester/ingester.go | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0fd2370f7ef..5bc7ddf91fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - `-alertmanager.receivers-firewall.block.cidr-networks` renamed to `-alertmanager.receivers-firewall-block-cidr-networks` - `-alertmanager.receivers-firewall.block.private-addresses` renamed to `-alertmanager.receivers-firewall-block-private-addresses` * [CHANGE] Change default value of `-server.grpc.keepalive.min-time-between-pings` to `10s` and `-server.grpc.keepalive.ping-without-stream-allowed` to `true`. 
#4168 +* [CHANGE] Ingester: Change default value of `-ingester.active-series-metrics-enabled` to `true`. * [FEATURE] Querier: Added new `-querier.max-fetched-series-per-query` flag. When Cortex is running with blocks storage, the max series per query limit is enforced in the querier and applies to unique series received from ingesters and store-gateway (long-term storage). #4179 * [FEATURE] Querier/Ruler: Added new `-querier.max-fetched-chunk-bytes-per-query` flag. When Cortex is running with blocks storage, the max chunk bytes limit is enforced in the querier and ruler and limits the size of all aggregated chunks returned from ingesters and storage as bytes for a query. #4216 * [FEATURE] Alertmanager: Added rate-limits to notifiers. Rate limits used by all integrations can be configured using `-alertmanager.notification-rate-limit`, while per-integration rate limits can be specified via `-alertmanager.notification-rate-limit-per-integration` parameter. Both shared and per-integration limits can be overwritten using overrides mechanism. These limits are applied on individual (per-tenant) alertmanagers. Rate-limited notifications are failed notifications. It is possible to monitor rate-limited notifications via new `cortex_alertmanager_notification_rate_limited_total` metric. #4135 #4163 diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 790fae634fe..b2cdd7c02c4 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -776,7 +776,7 @@ lifecycler: # Enable tracking of active series and export them as metrics. # CLI flag: -ingester.active-series-metrics-enabled -[active_series_metrics_enabled: <boolean> | default = false] +[active_series_metrics_enabled: <boolean> | default = true] # How often to update active series metrics. 
# CLI flag: -ingester.active-series-metrics-update-period diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 7f387b04abb..8ac907a8d6f 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -122,7 +122,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.DurationVar(&cfg.MetadataRetainPeriod, "ingester.metadata-retain-period", 10*time.Minute, "Period at which metadata we have not seen will remain in memory before being deleted.") f.DurationVar(&cfg.RateUpdatePeriod, "ingester.rate-update-period", 15*time.Second, "Period with which to update the per-user ingestion rates.") - f.BoolVar(&cfg.ActiveSeriesMetricsEnabled, "ingester.active-series-metrics-enabled", false, "Enable tracking of active series and export them as metrics.") + f.BoolVar(&cfg.ActiveSeriesMetricsEnabled, "ingester.active-series-metrics-enabled", true, "Enable tracking of active series and export them as metrics.") f.DurationVar(&cfg.ActiveSeriesMetricsUpdatePeriod, "ingester.active-series-metrics-update-period", 1*time.Minute, "How often to update active series metrics.") f.DurationVar(&cfg.ActiveSeriesMetricsIdleTimeout, "ingester.active-series-metrics-idle-timeout", 10*time.Minute, "After what time a series is considered to be inactive.") f.BoolVar(&cfg.StreamChunksWhenUsingBlocks, "ingester.stream-chunks-when-using-blocks", false, "Stream chunks when using blocks. This is experimental feature and not yet tested. 
Once ready, it will be made default and this config option removed.") From 9c3831540fbae675a569b6a28ba36268e86994bd Mon Sep 17 00:00:00 2001 From: Nick Pillitteri Date: Thu, 3 Jun 2021 13:55:14 -0400 Subject: [PATCH 2/3] Changelog Signed-off-by: Nick Pillitteri --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bc7ddf91fe..721ef0a7ea7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ - `-alertmanager.receivers-firewall.block.cidr-networks` renamed to `-alertmanager.receivers-firewall-block-cidr-networks` - `-alertmanager.receivers-firewall.block.private-addresses` renamed to `-alertmanager.receivers-firewall-block-private-addresses` * [CHANGE] Change default value of `-server.grpc.keepalive.min-time-between-pings` to `10s` and `-server.grpc.keepalive.ping-without-stream-allowed` to `true`. #4168 -* [CHANGE] Ingester: Change default value of `-ingester.active-series-metrics-enabled` to `true`. +* [CHANGE] Ingester: Change default value of `-ingester.active-series-metrics-enabled` to `true`. #4257 * [FEATURE] Querier: Added new `-querier.max-fetched-series-per-query` flag. When Cortex is running with blocks storage, the max series per query limit is enforced in the querier and applies to unique series received from ingesters and store-gateway (long-term storage). #4179 * [FEATURE] Querier/Ruler: Added new `-querier.max-fetched-chunk-bytes-per-query` flag. When Cortex is running with blocks storage, the max chunk bytes limit is enforced in the querier and ruler and limits the size of all aggregated chunks returned from ingesters and storage as bytes for a query. #4216 * [FEATURE] Alertmanager: Added rate-limits to notifiers. Rate limits used by all integrations can be configured using `-alertmanager.notification-rate-limit`, while per-integration rate limits can be specified via `-alertmanager.notification-rate-limit-per-integration` parameter. 
Both shared and per-integration limits can be overwritten using overrides mechanism. These limits are applied on individual (per-tenant) alertmanagers. Rate-limited notifications are failed notifications. It is possible to monitor rate-limited notifications via new `cortex_alertmanager_notification_rate_limited_total` metric. #4135 #4163 From c6afabcffc2f3e4a1ac488de65590ac43c7a15f6 Mon Sep 17 00:00:00 2001 From: Nick Pillitteri Date: Fri, 4 Jun 2021 09:15:33 -0400 Subject: [PATCH 3/3] More details in CHANGELOG entry Signed-off-by: Nick Pillitteri --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 721ef0a7ea7..685a2c05a3c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ - `-alertmanager.receivers-firewall.block.cidr-networks` renamed to `-alertmanager.receivers-firewall-block-cidr-networks` - `-alertmanager.receivers-firewall.block.private-addresses` renamed to `-alertmanager.receivers-firewall-block-private-addresses` * [CHANGE] Change default value of `-server.grpc.keepalive.min-time-between-pings` to `10s` and `-server.grpc.keepalive.ping-without-stream-allowed` to `true`. #4168 -* [CHANGE] Ingester: Change default value of `-ingester.active-series-metrics-enabled` to `true`. #4257 +* [CHANGE] Ingester: Change default value of `-ingester.active-series-metrics-enabled` to `true`. This incurs a small increase in memory usage, between 1.2% and 1.6% as measured on ingesters with 1.3M active series. #4257 * [FEATURE] Querier: Added new `-querier.max-fetched-series-per-query` flag. When Cortex is running with blocks storage, the max series per query limit is enforced in the querier and applies to unique series received from ingesters and store-gateway (long-term storage). #4179 * [FEATURE] Querier/Ruler: Added new `-querier.max-fetched-chunk-bytes-per-query` flag. 
When Cortex is running with blocks storage, the max chunk bytes limit is enforced in the querier and ruler and limits the size of all aggregated chunks returned from ingesters and storage as bytes for a query. #4216 * [FEATURE] Alertmanager: Added rate-limits to notifiers. Rate limits used by all integrations can be configured using `-alertmanager.notification-rate-limit`, while per-integration rate limits can be specified via `-alertmanager.notification-rate-limit-per-integration` parameter. Both shared and per-integration limits can be overwritten using overrides mechanism. These limits are applied on individual (per-tenant) alertmanagers. Rate-limited notifications are failed notifications. It is possible to monitor rate-limited notifications via new `cortex_alertmanager_notification_rate_limited_total` metric. #4135 #4163