From 5a64f813d359ced8999c8ab9d98ba8611c774116 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Wed, 1 Feb 2023 13:06:55 +0100 Subject: [PATCH 1/3] Support enabled_tenants and disabled_tenants in alertmanager Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + docs/configuration/config-file-reference.md | 12 ++++++ pkg/alertmanager/multitenant.go | 23 +++++++++++ pkg/alertmanager/multitenant_test.go | 46 +++++++++++++++++++++ 4 files changed, 82 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 98f44cd4512..b4ec404a69c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ * [ENHANCEMENT] Ingester: The metadata APIs should honour `querier.query-ingesters-within` when `querier.query-store-for-labels-enabled` is true. #5027 * [ENHANCEMENT] Query Frontend: Skip instant query roundtripper if sharding is not applicable. #5062 * [ENHANCEMENT] Push reduce one hash operation of Labels. #4945 #5114 +* [ENHANCEMENT] Alertmanager: Added `-alertmanager.enabled-tenants` and `-alertmanager.disabled-tenants` to explicitly enable or disable alertmanager for specific tenants. * [FEATURE] Querier/Query Frontend: support Prometheus /api/v1/status/buildinfo API. #4978 * [FEATURE] Ingester: Add active series to all_user_stats page. #4972 * [FEATURE] Ingester: Added `-blocks-storage.tsdb.head-chunks-write-queue-size` allowing to configure the size of the in-memory queue used before flushing chunks to the disk . #5000 diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 18e56571ba0..259e80a530c 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -1886,6 +1886,18 @@ alertmanager_client: # result in potentially fewer lost silences, and fewer duplicate notifications. # CLI flag: -alertmanager.persist-interval [persist_interval: | default = 15m] + +# Comma separated list of tenants whose alerts this alertmanager can process. If +# specified, only these tenants will be handled by alertmanager, otherwise this +# alertmanager can process alerts from all tenants. +# CLI flag: -alertmanager.enabled-tenants +[enabled_tenants: | default = ""] + +# Comma separated list of tenants whose alerts this alertmanager cannot process. +# If specified, a alertmanager that would normally pick the specified tenant(s) +# for processing will ignore them instead. +# CLI flag: -alertmanager.disabled-tenants +[disabled_tenants: | default = ""] ``` ### `alertmanager_storage_config` diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 4a5c2b7ab55..2050a48a5f3 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -85,6 +85,9 @@ type MultitenantAlertmanagerConfig struct { // For the state persister. Persister PersisterConfig `yaml:",inline"` + + EnabledTenants flagext.StringSliceCSV `yaml:"enabled_tenants"` + DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"` } type ClusterConfig struct { @@ -116,6 +119,8 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&cfg.EnableAPI, "experimental.alertmanager.enable-api", false, "Enable the experimental alertmanager config api.") f.BoolVar(&cfg.ShardingEnabled, "alertmanager.sharding-enabled", false, "Shard tenants across multiple alertmanager instances.") + f.Var(&cfg.EnabledTenants, "alertmanager.enabled-tenants", "Comma separated list of tenants whose alerts this alertmanager can process. If specified, only these tenants will be handled by alertmanager, otherwise this alertmanager can process alerts from all tenants.") + f.Var(&cfg.DisabledTenants, "alertmanager.disabled-tenants", "Comma separated list of tenants whose alerts this alertmanager cannot process. If specified, a alertmanager that would normally pick the specified tenant(s) for processing will ignore them instead.") cfg.AlertmanagerClient.RegisterFlagsWithPrefix("alertmanager.alertmanager-client", f) cfg.Persister.RegisterFlagsWithPrefix("alertmanager", f) @@ -269,6 +274,8 @@ type MultitenantAlertmanager struct { limits Limits + allowedTenants *util.AllowedTenants + registry prometheus.Registerer ringCheckErrors prometheus.Counter tenantsOwned prometheus.Gauge @@ -359,6 +366,7 @@ func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackC logger: log.With(logger, "component", "MultiTenantAlertmanager"), registry: registerer, limits: limits, + allowedTenants: util.NewAllowedTenants(cfg.EnabledTenants, cfg.DisabledTenants), ringCheckErrors: promauto.With(registerer).NewCounter(prometheus.CounterOpts{ Name: "cortex_alertmanager_ring_check_errors_total", Help: "Number of errors that have occurred when checking the ring for ownership.", @@ -418,6 +426,13 @@ func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackC } } + if len(cfg.EnabledTenants) > 0 { + level.Info(am.logger).Log("msg", "alertmanager using enabled users", "enabled", strings.Join(cfg.EnabledTenants, ", ")) + } + if len(cfg.DisabledTenants) > 0 { + level.Info(am.logger).Log("msg", "alertmanager using disabled users", "disabled", strings.Join(cfg.DisabledTenants, ", ")) + } + if registerer != nil { registerer.MustRegister(am.alertmanagerMetrics) } @@ -735,6 +750,10 @@ func (am *MultitenantAlertmanager) loadAlertmanagerConfigs(ctx context.Context) // Filter out users not owned by this shard. for _, userID := range allUserIDs { + if !am.allowedTenants.IsAllowed(userID) { + level.Debug(am.logger).Log("msg", "ignoring alertmanager for user, not allowed", "user", userID) + continue + } if am.isUserOwned(userID) { ownedUserIDs = append(ownedUserIDs, userID) } @@ -1197,6 +1216,10 @@ func (am *MultitenantAlertmanager) deleteUnusedRemoteUserState(ctx context.Conte } for _, userID := range usersWithState { + if !am.allowedTenants.IsAllowed(userID) { + level.Debug(am.logger).Log("msg", "not deleting remote state for user, not allowed", "user", userID) + continue + } if _, ok := users[userID]; ok { continue } diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index 8c953d054f6..bd52526068f 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -1110,6 +1110,8 @@ func TestMultitenantAlertmanager_PerTenantSharding(t *testing.T) { configs int expectedTenants int withSharding bool + enabledTenants []string + disabledTenants []string }{ { name: "sharding disabled, 1 instance", @@ -1117,6 +1119,20 @@ func TestMultitenantAlertmanager_PerTenantSharding(t *testing.T) { configs: 10, expectedTenants: 10, }, + { + name: "sharding disabled, 1 instance, single user allowed", + instances: 1, + configs: 10, + expectedTenants: 1, + enabledTenants: []string{"u-1"}, + }, + { + name: "sharding disabled, 1 instance, single user disabled", + instances: 1, + configs: 10, + expectedTenants: 9, + disabledTenants: []string{"u-2"}, + }, { name: "sharding disabled, 2 instances", instances: 2, @@ -1131,6 +1147,24 @@ func TestMultitenantAlertmanager_PerTenantSharding(t *testing.T) { configs: 10, expectedTenants: 10, // same as no sharding and 1 instance }, + { + name: "sharding enabled, 1 instance, enabled tenants, single user allowed", + withSharding: true, + instances: 1, + replicationFactor: 1, + configs: 10, + expectedTenants: 1, + enabledTenants: []string{"u-3"}, + }, + { + name: "sharding enabled, 1 instance, enabled tenants, single user disabled", + withSharding: true, + instances: 1, + replicationFactor: 1, + configs: 10, + expectedTenants: 9, + disabledTenants: []string{"u-4"}, + }, { name: "sharding enabled, 2 instances, RF = 1", withSharding: true, @@ -1155,6 +1189,15 @@ func TestMultitenantAlertmanager_PerTenantSharding(t *testing.T) { configs: 10, expectedTenants: 30, // configs * replication factor }, + { + name: "sharding enabled, 5 instances, RF = 3, two users disabled", + withSharding: true, + instances: 5, + replicationFactor: 3, + configs: 10, + expectedTenants: 24, // (configs - disabled-tenants) * replication factor + disabledTenants: []string{"u-1", "u-2"}, + }, } for _, tt := range tc { @@ -1192,6 +1235,9 @@ func TestMultitenantAlertmanager_PerTenantSharding(t *testing.T) { amConfig.PollInterval = time.Hour amConfig.ShardingRing.RingCheckPeriod = time.Hour + amConfig.EnabledTenants = tt.enabledTenants + amConfig.DisabledTenants = tt.disabledTenants + if tt.withSharding { amConfig.ShardingEnabled = true } From bfa1be3e216b9f03856428d9bc807d8579544833 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Wed, 1 Feb 2023 13:43:31 +0100 Subject: [PATCH 2/3] Add PR number Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b4ec404a69c..cce1bae0c13 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ * [ENHANCEMENT] Ingester: The metadata APIs should honour `querier.query-ingesters-within` when `querier.query-store-for-labels-enabled` is true. #5027 * [ENHANCEMENT] Query Frontend: Skip instant query roundtripper if sharding is not applicable. #5062 * [ENHANCEMENT] Push reduce one hash operation of Labels. #4945 #5114 -* [ENHANCEMENT] Alertmanager: Added `-alertmanager.enabled-tenants` and `-alertmanager.disabled-tenants` to explicitly enable or disable alertmanager for specific tenants. +* [ENHANCEMENT] Alertmanager: Added `-alertmanager.enabled-tenants` and `-alertmanager.disabled-tenants` to explicitly enable or disable alertmanager for specific tenants. #5116 * [FEATURE] Querier/Query Frontend: support Prometheus /api/v1/status/buildinfo API. #4978 * [FEATURE] Ingester: Add active series to all_user_stats page. #4972 * [FEATURE] Ingester: Added `-blocks-storage.tsdb.head-chunks-write-queue-size` allowing to configure the size of the in-memory queue used before flushing chunks to the disk . #5000 From 93a48e95d941f39740dd77fb2bf9226497337521 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Wed, 1 Feb 2023 22:35:26 +0100 Subject: [PATCH 3/3] Disable alertmanager UI and API for tenants disabled Signed-off-by: Friedrich Gonzalez --- pkg/alertmanager/distributor.go | 7 ++++++- pkg/alertmanager/distributor_test.go | 18 +++++++++++++++++- pkg/alertmanager/multitenant.go | 6 +++++- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/pkg/alertmanager/distributor.go b/pkg/alertmanager/distributor.go index df16f9702cc..9f815edb48e 100644 --- a/pkg/alertmanager/distributor.go +++ b/pkg/alertmanager/distributor.go @@ -118,7 +118,7 @@ func (d *Distributor) isUnaryReadPath(p string) bool { // In case of reads, it proxies the request to one of the alertmanagers. // DistributeRequest assumes that the caller has verified IsPathSupported returns // true for the route. -func (d *Distributor) DistributeRequest(w http.ResponseWriter, r *http.Request) { +func (d *Distributor) DistributeRequest(w http.ResponseWriter, r *http.Request, allowedTenants *util.AllowedTenants) { d.requestsInFlight.Add(1) defer d.requestsInFlight.Done() @@ -128,6 +128,11 @@ func (d *Distributor) DistributeRequest(w http.ResponseWriter, r *http.Request) return } + if !allowedTenants.IsAllowed(userID) { + http.Error(w, "Tenant is not allowed", http.StatusUnauthorized) + return + } + logger := util_log.WithContext(r.Context(), d.logger) if r.Method == http.MethodPost { diff --git a/pkg/alertmanager/distributor_test.go b/pkg/alertmanager/distributor_test.go index 98245eaa53a..40ef486cfaa 100644 --- a/pkg/alertmanager/distributor_test.go +++ b/pkg/alertmanager/distributor_test.go @@ -27,6 +27,7 @@ import ( "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/ring/kv" "github.com/cortexproject/cortex/pkg/ring/kv/consul" + "github.com/cortexproject/cortex/pkg/util" "github.com/cortexproject/cortex/pkg/util/flagext" util_log "github.com/cortexproject/cortex/pkg/util/log" "github.com/cortexproject/cortex/pkg/util/services" @@ -40,6 +41,7 @@ func TestDistributor_DistributeRequest(t *testing.T) { replicationFactor int isRead bool isDelete bool + isTenantDisabled bool expStatusCode int expectedTotalCalls int headersNotPreserved bool @@ -56,6 +58,16 @@ func TestDistributor_DistributeRequest(t *testing.T) { expStatusCode: http.StatusOK, expectedTotalCalls: 3, route: "/alerts", + }, { + name: "Write /alerts, Simple AM request, all AM healthy, not allowed", + numAM: 4, + numHappyAM: 4, + replicationFactor: 3, + expStatusCode: http.StatusUnauthorized, + expectedTotalCalls: 0, + route: "/alerts", + headersNotPreserved: true, + isTenantDisabled: true, }, { name: "Write /alerts, Less than quorum AM available", numAM: 1, @@ -262,9 +274,13 @@ func TestDistributor_DistributeRequest(t *testing.T) { req.Method = http.MethodDelete } req.RequestURI = url + var allowedTenants *util.AllowedTenants + if c.isTenantDisabled { + allowedTenants = util.NewAllowedTenants(nil, []string{"1"}) + } w := httptest.NewRecorder() - d.DistributeRequest(w, req) + d.DistributeRequest(w, req, allowedTenants) resp := w.Result() require.Equal(t, c.expStatusCode, resp.StatusCode) diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 2050a48a5f3..e868c0cd51a 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -1012,7 +1012,7 @@ func (am *MultitenantAlertmanager) ServeHTTP(w http.ResponseWriter, req *http.Re } if am.cfg.ShardingEnabled && am.distributor.IsPathSupported(req.URL.Path) { - am.distributor.DistributeRequest(w, req) + am.distributor.DistributeRequest(w, req, am.allowedTenants) return } @@ -1033,6 +1033,10 @@ func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http http.Error(w, err.Error(), http.StatusUnauthorized) return } + if !am.allowedTenants.IsAllowed(userID) { + http.Error(w, "Tenant is not allowed", http.StatusUnauthorized) + return + } am.alertmanagersMtx.Lock() userAM, ok := am.alertmanagers[userID] am.alertmanagersMtx.Unlock()