Skip to content

Commit 58d02b2

Browse files
committed
Review comments.
Signed-off-by: Steve Simpson <[email protected]>
1 parent 1b23be8 commit 58d02b2

File tree

3 files changed

+68
-19
lines changed

3 files changed

+68
-19
lines changed

pkg/alertmanager/alertmanager_metrics.go

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -178,31 +178,31 @@ func newAlertmanagerMetrics() *alertmanagerMetrics {
178178
fetchReplicaStateTotal: prometheus.NewDesc(
179179
"cortex_alertmanager_state_fetch_replica_state_total",
180180
"Number of times we have tried to read and merge the full state from another replica.",
181-
[]string{"user"}, nil),
181+
nil, nil),
182182
fetchReplicaStateFailed: prometheus.NewDesc(
183183
"cortex_alertmanager_state_fetch_replica_state_failed_total",
184184
"Number of times we have failed to read and merge the full state from another replica.",
185-
[]string{"user"}, nil),
185+
nil, nil),
186186
initialSyncTotal: prometheus.NewDesc(
187187
"cortex_alertmanager_state_initial_sync_total",
188188
"Number of times we have tried to sync initial state from peers or storage.",
189-
[]string{"user"}, nil),
189+
nil, nil),
190190
initialSyncCompleted: prometheus.NewDesc(
191191
"cortex_alertmanager_state_initial_sync_completed_total",
192192
"Number of times we have completed syncing initial state for each possible outcome.",
193-
[]string{"user", "outcome"}, nil),
193+
[]string{"outcome"}, nil),
194194
initialSyncDuration: prometheus.NewDesc(
195195
"cortex_alertmanager_state_initial_sync_duration_seconds",
196196
"Time spent syncing initial state from peers or storage.",
197197
nil, nil),
198198
persistTotal: prometheus.NewDesc(
199199
"cortex_alertmanager_state_persist_total",
200200
"Number of times we have tried to persist the running state to storage.",
201-
[]string{"user"}, nil),
201+
nil, nil),
202202
persistFailed: prometheus.NewDesc(
203203
"cortex_alertmanager_state_persist_failed_total",
204204
"Number of times we have failed to persist the running state to storage.",
205-
[]string{"user"}, nil),
205+
nil, nil),
206206
}
207207
}
208208

@@ -290,11 +290,11 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
290290
data.SendSumOfCountersPerUser(out, m.partialMergesFailed, "alertmanager_partial_state_merges_failed_total")
291291
data.SendSumOfCountersPerUser(out, m.replicationTotal, "alertmanager_state_replication_total")
292292
data.SendSumOfCountersPerUser(out, m.replicationFailed, "alertmanager_state_replication_failed_total")
293-
data.SendSumOfCountersPerUser(out, m.fetchReplicaStateTotal, "alertmanager_state_fetch_replica_state_total")
294-
data.SendSumOfCountersPerUser(out, m.fetchReplicaStateFailed, "alertmanager_state_fetch_replica_state_failed_total")
295-
data.SendSumOfCountersPerUser(out, m.initialSyncTotal, "alertmanager_state_initial_sync_total")
296-
data.SendSumOfCountersPerUserWithLabels(out, m.initialSyncCompleted, "alertmanager_state_replication_total", "outcome")
293+
data.SendSumOfCounters(out, m.fetchReplicaStateTotal, "alertmanager_state_fetch_replica_state_total")
294+
data.SendSumOfCounters(out, m.fetchReplicaStateFailed, "alertmanager_state_fetch_replica_state_failed_total")
295+
data.SendSumOfCounters(out, m.initialSyncTotal, "alertmanager_state_initial_sync_total")
296+
data.SendSumOfCountersWithLabels(out, m.initialSyncCompleted, "alertmanager_state_initial_sync_completed_total", "outcome")
297297
data.SendSumOfHistograms(out, m.initialSyncDuration, "alertmanager_state_initial_sync_duration_seconds")
298-
data.SendSumOfCountersPerUser(out, m.persistTotal, "alertmanager_state_persist_total")
299-
data.SendSumOfCountersPerUser(out, m.persistFailed, "alertmanager_state_persist_failed_total")
298+
data.SendSumOfCounters(out, m.persistTotal, "alertmanager_state_persist_total")
299+
data.SendSumOfCounters(out, m.persistFailed, "alertmanager_state_persist_failed_total")
300300
}

pkg/alertmanager/alertmanager_metrics_test.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,11 +254,26 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
254254
# HELP cortex_alertmanager_silences_snapshot_size_bytes Size of the last silence snapshot in bytes.
255255
# TYPE cortex_alertmanager_silences_snapshot_size_bytes gauge
256256
cortex_alertmanager_silences_snapshot_size_bytes 111
257+
# HELP cortex_alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica.
258+
# TYPE cortex_alertmanager_state_fetch_replica_state_failed_total counter
259+
cortex_alertmanager_state_fetch_replica_state_failed_total 0
260+
# HELP cortex_alertmanager_state_fetch_replica_state_total Number of times we have tried to read and merge the full state from another replica.
261+
# TYPE cortex_alertmanager_state_fetch_replica_state_total counter
262+
cortex_alertmanager_state_fetch_replica_state_total 0
257263
# HELP cortex_alertmanager_state_initial_sync_duration_seconds Time spent syncing initial state from peers or storage.
258264
# TYPE cortex_alertmanager_state_initial_sync_duration_seconds histogram
259265
cortex_alertmanager_state_initial_sync_duration_seconds_bucket{le="+Inf"} 0
260266
cortex_alertmanager_state_initial_sync_duration_seconds_sum 0
261267
cortex_alertmanager_state_initial_sync_duration_seconds_count 0
268+
# HELP cortex_alertmanager_state_initial_sync_total Number of times we have tried to sync initial state from peers or storage.
269+
# TYPE cortex_alertmanager_state_initial_sync_total counter
270+
cortex_alertmanager_state_initial_sync_total 0
271+
# HELP cortex_alertmanager_state_persist_failed_total Number of times we have failed to persist the running state to storage.
272+
# TYPE cortex_alertmanager_state_persist_failed_total counter
273+
cortex_alertmanager_state_persist_failed_total 0
274+
# HELP cortex_alertmanager_state_persist_total Number of times we have tried to persist the running state to storage.
275+
# TYPE cortex_alertmanager_state_persist_total counter
276+
cortex_alertmanager_state_persist_total 0
262277
`))
263278
require.NoError(t, err)
264279
}
@@ -522,11 +537,26 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
522537
# HELP cortex_alertmanager_silences_snapshot_size_bytes Size of the last silence snapshot in bytes.
523538
# TYPE cortex_alertmanager_silences_snapshot_size_bytes gauge
524539
cortex_alertmanager_silences_snapshot_size_bytes 111
540+
# HELP cortex_alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica.
541+
# TYPE cortex_alertmanager_state_fetch_replica_state_failed_total counter
542+
cortex_alertmanager_state_fetch_replica_state_failed_total 0
543+
# HELP cortex_alertmanager_state_fetch_replica_state_total Number of times we have tried to read and merge the full state from another replica.
544+
# TYPE cortex_alertmanager_state_fetch_replica_state_total counter
545+
cortex_alertmanager_state_fetch_replica_state_total 0
525546
# HELP cortex_alertmanager_state_initial_sync_duration_seconds Time spent syncing initial state from peers or storage.
526547
# TYPE cortex_alertmanager_state_initial_sync_duration_seconds histogram
527548
cortex_alertmanager_state_initial_sync_duration_seconds_bucket{le="+Inf"} 0
528549
cortex_alertmanager_state_initial_sync_duration_seconds_sum 0
529550
cortex_alertmanager_state_initial_sync_duration_seconds_count 0
551+
# HELP cortex_alertmanager_state_initial_sync_total Number of times we have tried to sync initial state from peers or storage.
552+
# TYPE cortex_alertmanager_state_initial_sync_total counter
553+
cortex_alertmanager_state_initial_sync_total 0
554+
# HELP cortex_alertmanager_state_persist_failed_total Number of times we have failed to persist the running state to storage.
555+
# TYPE cortex_alertmanager_state_persist_failed_total counter
556+
cortex_alertmanager_state_persist_failed_total 0
557+
# HELP cortex_alertmanager_state_persist_total Number of times we have tried to persist the running state to storage.
558+
# TYPE cortex_alertmanager_state_persist_total counter
559+
cortex_alertmanager_state_persist_total 0
530560
`))
531561
require.NoError(t, err)
532562

@@ -738,11 +768,26 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
738768
# TYPE cortex_alertmanager_silences_snapshot_size_bytes gauge
739769
cortex_alertmanager_silences_snapshot_size_bytes 11
740770
771+
# HELP cortex_alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica.
772+
# TYPE cortex_alertmanager_state_fetch_replica_state_failed_total counter
773+
cortex_alertmanager_state_fetch_replica_state_failed_total 0
774+
# HELP cortex_alertmanager_state_fetch_replica_state_total Number of times we have tried to read and merge the full state from another replica.
775+
# TYPE cortex_alertmanager_state_fetch_replica_state_total counter
776+
cortex_alertmanager_state_fetch_replica_state_total 0
741777
# HELP cortex_alertmanager_state_initial_sync_duration_seconds Time spent syncing initial state from peers or storage.
742778
# TYPE cortex_alertmanager_state_initial_sync_duration_seconds histogram
743779
cortex_alertmanager_state_initial_sync_duration_seconds_bucket{le="+Inf"} 0
744780
cortex_alertmanager_state_initial_sync_duration_seconds_sum 0
745781
cortex_alertmanager_state_initial_sync_duration_seconds_count 0
782+
# HELP cortex_alertmanager_state_initial_sync_total Number of times we have tried to sync initial state from peers or storage.
783+
# TYPE cortex_alertmanager_state_initial_sync_total counter
784+
cortex_alertmanager_state_initial_sync_total 0
785+
# HELP cortex_alertmanager_state_persist_failed_total Number of times we have failed to persist the running state to storage.
786+
# TYPE cortex_alertmanager_state_persist_failed_total counter
787+
cortex_alertmanager_state_persist_failed_total 0
788+
# HELP cortex_alertmanager_state_persist_total Number of times we have tried to persist the running state to storage.
789+
# TYPE cortex_alertmanager_state_persist_total counter
790+
cortex_alertmanager_state_persist_total 0
746791
`))
747792
require.NoError(t, err)
748793
}

pkg/alertmanager/state_persister.go

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -98,28 +98,32 @@ func (s *statePersister) iteration(ctx context.Context) error {
9898
return nil
9999
}
100100

101-
func (s *statePersister) persist(ctx context.Context) error {
102-
s.persistTotal.Inc()
103-
101+
func (s *statePersister) persist(ctx context.Context) (err error) {
104102
// Only the replica at position zero should write the state.
105103
if s.state.Position() != 0 {
106104
return nil
107105
}
108106

107+
s.persistTotal.Inc()
108+
defer func() {
109+
if err != nil {
110+
s.persistFailed.Inc()
111+
}
112+
}()
113+
109114
level.Debug(s.logger).Log("msg", "persisting state", "user", s.userID)
110115

111-
fs, err := s.state.GetFullState()
116+
var fs *clusterpb.FullState
117+
fs, err = s.state.GetFullState()
112118
if err != nil {
113-
s.persistFailed.Inc()
114119
return err
115120
}
116121

117122
ctx, cancel := context.WithTimeout(ctx, s.timeout)
118123
defer cancel()
119124

120125
desc := alertspb.FullStateDesc{State: fs}
121-
if err := s.store.SetFullState(ctx, s.userID, desc); err != nil {
122-
s.persistFailed.Inc()
126+
if err = s.store.SetFullState(ctx, s.userID, desc); err != nil {
123127
return err
124128
}
125129

0 commit comments

Comments
 (0)