Skip to content

Support enabled_tenants and disabled_tenants in alertmanager #5116

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* [ENHANCEMENT] Ingester: The metadata APIs should honour `querier.query-ingesters-within` when `querier.query-store-for-labels-enabled` is true. #5027
* [ENHANCEMENT] Query Frontend: Skip instant query roundtripper if sharding is not applicable. #5062
* [ENHANCEMENT] Push: Reduce one hash operation of Labels. #4945 #5114
* [ENHANCEMENT] Alertmanager: Added `-alertmanager.enabled-tenants` and `-alertmanager.disabled-tenants` to explicitly enable or disable alertmanager for specific tenants. #5116
* [FEATURE] Querier/Query Frontend: support Prometheus /api/v1/status/buildinfo API. #4978
* [FEATURE] Ingester: Add active series to all_user_stats page. #4972
* [FEATURE] Ingester: Added `-blocks-storage.tsdb.head-chunks-write-queue-size`, which allows configuring the size of the in-memory queue used before flushing chunks to the disk. #5000
Expand Down
12 changes: 12 additions & 0 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -1886,6 +1886,18 @@ alertmanager_client:
# result in potentially fewer lost silences, and fewer duplicate notifications.
# CLI flag: -alertmanager.persist-interval
[persist_interval: <duration> | default = 15m]

# Comma separated list of tenants whose alerts this alertmanager can process. If
# specified, only these tenants will be handled by alertmanager, otherwise this
# alertmanager can process alerts from all tenants.
# CLI flag: -alertmanager.enabled-tenants
[enabled_tenants: <string> | default = ""]

# Comma separated list of tenants whose alerts this alertmanager cannot process.
# If specified, an alertmanager that would normally pick the specified tenant(s)
# for processing will ignore them instead.
# CLI flag: -alertmanager.disabled-tenants
[disabled_tenants: <string> | default = ""]
```

### `alertmanager_storage_config`
Expand Down
7 changes: 6 additions & 1 deletion pkg/alertmanager/distributor.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ func (d *Distributor) isUnaryReadPath(p string) bool {
// In case of reads, it proxies the request to one of the alertmanagers.
// DistributeRequest assumes that the caller has verified IsPathSupported returns
// true for the route.
func (d *Distributor) DistributeRequest(w http.ResponseWriter, r *http.Request) {
func (d *Distributor) DistributeRequest(w http.ResponseWriter, r *http.Request, allowedTenants *util.AllowedTenants) {
d.requestsInFlight.Add(1)
defer d.requestsInFlight.Done()

Expand All @@ -128,6 +128,11 @@ func (d *Distributor) DistributeRequest(w http.ResponseWriter, r *http.Request)
return
}

if !allowedTenants.IsAllowed(userID) {
http.Error(w, "Tenant is not allowed", http.StatusUnauthorized)
return
}

logger := util_log.WithContext(r.Context(), d.logger)

if r.Method == http.MethodPost {
Expand Down
18 changes: 17 additions & 1 deletion pkg/alertmanager/distributor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"github.com/cortexproject/cortex/pkg/ring"
"github.com/cortexproject/cortex/pkg/ring/kv"
"github.com/cortexproject/cortex/pkg/ring/kv/consul"
"github.com/cortexproject/cortex/pkg/util"
"github.com/cortexproject/cortex/pkg/util/flagext"
util_log "github.com/cortexproject/cortex/pkg/util/log"
"github.com/cortexproject/cortex/pkg/util/services"
Expand All @@ -40,6 +41,7 @@ func TestDistributor_DistributeRequest(t *testing.T) {
replicationFactor int
isRead bool
isDelete bool
isTenantDisabled bool
expStatusCode int
expectedTotalCalls int
headersNotPreserved bool
Expand All @@ -56,6 +58,16 @@ func TestDistributor_DistributeRequest(t *testing.T) {
expStatusCode: http.StatusOK,
expectedTotalCalls: 3,
route: "/alerts",
}, {
name: "Write /alerts, Simple AM request, all AM healthy, not allowed",
numAM: 4,
numHappyAM: 4,
replicationFactor: 3,
expStatusCode: http.StatusUnauthorized,
expectedTotalCalls: 0,
route: "/alerts",
headersNotPreserved: true,
isTenantDisabled: true,
}, {
name: "Write /alerts, Less than quorum AM available",
numAM: 1,
Expand Down Expand Up @@ -262,9 +274,13 @@ func TestDistributor_DistributeRequest(t *testing.T) {
req.Method = http.MethodDelete
}
req.RequestURI = url
var allowedTenants *util.AllowedTenants
if c.isTenantDisabled {
allowedTenants = util.NewAllowedTenants(nil, []string{"1"})
}

w := httptest.NewRecorder()
d.DistributeRequest(w, req)
d.DistributeRequest(w, req, allowedTenants)
resp := w.Result()
require.Equal(t, c.expStatusCode, resp.StatusCode)

Expand Down
29 changes: 28 additions & 1 deletion pkg/alertmanager/multitenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ type MultitenantAlertmanagerConfig struct {

// For the state persister.
Persister PersisterConfig `yaml:",inline"`

EnabledTenants flagext.StringSliceCSV `yaml:"enabled_tenants"`
DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"`
}

type ClusterConfig struct {
Expand Down Expand Up @@ -116,6 +119,8 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet) {
f.BoolVar(&cfg.EnableAPI, "experimental.alertmanager.enable-api", false, "Enable the experimental alertmanager config api.")

f.BoolVar(&cfg.ShardingEnabled, "alertmanager.sharding-enabled", false, "Shard tenants across multiple alertmanager instances.")
f.Var(&cfg.EnabledTenants, "alertmanager.enabled-tenants", "Comma separated list of tenants whose alerts this alertmanager can process. If specified, only these tenants will be handled by alertmanager, otherwise this alertmanager can process alerts from all tenants.")
f.Var(&cfg.DisabledTenants, "alertmanager.disabled-tenants", "Comma separated list of tenants whose alerts this alertmanager cannot process. If specified, a alertmanager that would normally pick the specified tenant(s) for processing will ignore them instead.")

cfg.AlertmanagerClient.RegisterFlagsWithPrefix("alertmanager.alertmanager-client", f)
cfg.Persister.RegisterFlagsWithPrefix("alertmanager", f)
Expand Down Expand Up @@ -269,6 +274,8 @@ type MultitenantAlertmanager struct {

limits Limits

allowedTenants *util.AllowedTenants

registry prometheus.Registerer
ringCheckErrors prometheus.Counter
tenantsOwned prometheus.Gauge
Expand Down Expand Up @@ -359,6 +366,7 @@ func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackC
logger: log.With(logger, "component", "MultiTenantAlertmanager"),
registry: registerer,
limits: limits,
allowedTenants: util.NewAllowedTenants(cfg.EnabledTenants, cfg.DisabledTenants),
ringCheckErrors: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
Name: "cortex_alertmanager_ring_check_errors_total",
Help: "Number of errors that have occurred when checking the ring for ownership.",
Expand Down Expand Up @@ -418,6 +426,13 @@ func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackC
}
}

if len(cfg.EnabledTenants) > 0 {
level.Info(am.logger).Log("msg", "alertmanager using enabled users", "enabled", strings.Join(cfg.EnabledTenants, ", "))
}
if len(cfg.DisabledTenants) > 0 {
level.Info(am.logger).Log("msg", "alertmanager using disabled users", "disabled", strings.Join(cfg.DisabledTenants, ", "))
}

if registerer != nil {
registerer.MustRegister(am.alertmanagerMetrics)
}
Expand Down Expand Up @@ -735,6 +750,10 @@ func (am *MultitenantAlertmanager) loadAlertmanagerConfigs(ctx context.Context)

// Filter out users not owned by this shard.
for _, userID := range allUserIDs {
if !am.allowedTenants.IsAllowed(userID) {
level.Debug(am.logger).Log("msg", "ignoring alertmanager for user, not allowed", "user", userID)
continue
}
if am.isUserOwned(userID) {
ownedUserIDs = append(ownedUserIDs, userID)
}
Expand Down Expand Up @@ -993,7 +1012,7 @@ func (am *MultitenantAlertmanager) ServeHTTP(w http.ResponseWriter, req *http.Re
}

if am.cfg.ShardingEnabled && am.distributor.IsPathSupported(req.URL.Path) {
am.distributor.DistributeRequest(w, req)
am.distributor.DistributeRequest(w, req, am.allowedTenants)
return
}

Expand All @@ -1014,6 +1033,10 @@ func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http
http.Error(w, err.Error(), http.StatusUnauthorized)
return
}
if !am.allowedTenants.IsAllowed(userID) {
http.Error(w, "Tenant is not allowed", http.StatusUnauthorized)
return
}
am.alertmanagersMtx.Lock()
userAM, ok := am.alertmanagers[userID]
am.alertmanagersMtx.Unlock()
Expand Down Expand Up @@ -1197,6 +1220,10 @@ func (am *MultitenantAlertmanager) deleteUnusedRemoteUserState(ctx context.Conte
}

for _, userID := range usersWithState {
if !am.allowedTenants.IsAllowed(userID) {
level.Debug(am.logger).Log("msg", "not deleting remote state for user, not allowed", "user", userID)
continue
}
if _, ok := users[userID]; ok {
continue
}
Expand Down
46 changes: 46 additions & 0 deletions pkg/alertmanager/multitenant_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1110,13 +1110,29 @@ func TestMultitenantAlertmanager_PerTenantSharding(t *testing.T) {
configs int
expectedTenants int
withSharding bool
enabledTenants []string
disabledTenants []string
}{
{
name: "sharding disabled, 1 instance",
instances: 1,
configs: 10,
expectedTenants: 10,
},
{
name: "sharding disabled, 1 instance, single user allowed",
instances: 1,
configs: 10,
expectedTenants: 1,
enabledTenants: []string{"u-1"},
},
{
name: "sharding disabled, 1 instance, single user disabled",
instances: 1,
configs: 10,
expectedTenants: 9,
disabledTenants: []string{"u-2"},
},
{
name: "sharding disabled, 2 instances",
instances: 2,
Expand All @@ -1131,6 +1147,24 @@ func TestMultitenantAlertmanager_PerTenantSharding(t *testing.T) {
configs: 10,
expectedTenants: 10, // same as no sharding and 1 instance
},
{
name: "sharding enabled, 1 instance, enabled tenants, single user allowed",
withSharding: true,
instances: 1,
replicationFactor: 1,
configs: 10,
expectedTenants: 1,
enabledTenants: []string{"u-3"},
},
{
name: "sharding enabled, 1 instance, enabled tenants, single user disabled",
withSharding: true,
instances: 1,
replicationFactor: 1,
configs: 10,
expectedTenants: 9,
disabledTenants: []string{"u-4"},
},
{
name: "sharding enabled, 2 instances, RF = 1",
withSharding: true,
Expand All @@ -1155,6 +1189,15 @@ func TestMultitenantAlertmanager_PerTenantSharding(t *testing.T) {
configs: 10,
expectedTenants: 30, // configs * replication factor
},
{
name: "sharding enabled, 5 instances, RF = 3, two users disabled",
withSharding: true,
instances: 5,
replicationFactor: 3,
configs: 10,
expectedTenants: 24, // (configs - disabled-tenants) * replication factor
disabledTenants: []string{"u-1", "u-2"},
},
}

for _, tt := range tc {
Expand Down Expand Up @@ -1192,6 +1235,9 @@ func TestMultitenantAlertmanager_PerTenantSharding(t *testing.T) {
amConfig.PollInterval = time.Hour
amConfig.ShardingRing.RingCheckPeriod = time.Hour

amConfig.EnabledTenants = tt.enabledTenants
amConfig.DisabledTenants = tt.disabledTenants

if tt.withSharding {
amConfig.ShardingEnabled = true
}
Expand Down