Skip to content

Commit 76ef1af

Browse files
geropl authored and roboquat committed
[ops] Add alert 'InstanceStartFailures'
1 parent 8aa11bd commit 76ef1af

File tree

1 file changed

+14
-0
lines changed

1 file changed

+14
-0
lines changed

operations/observability/mixins/meta/rules/components/server/alerts.libsonnet

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,20 @@
3535
description: 'Server has accumulated {{ printf "%.2f" $value }}ms event loop lag.',
3636
},
3737
},
38+
{
39+
alert: 'InstanceStartFailures',
40+
// Fires when the per-reason rate of failed instance starts stays above the threshold for 30s.
// Reasoning: 1 failure every 120s should not trigger an incident: 1/120 = 0.00833.. => 0.01
// rate() is used instead of irate(): irate() only considers the last two samples in the range,
// so it does not average over the 2m window that the reasoning above assumes, and brief changes
// in the rate can reset the 'for' clause.
// NOTE(review): rate() extrapolates to the window edges; confirm the 0.01 threshold against the scrape interval.
41+
expr: 'sum (rate(gitpod_server_instance_starts_failed_total[2m])) by (reason) > 0.01',
42+
'for': '30s',
43+
labels: {
44+
severity: 'critical',
45+
},
46+
annotations: {
47+
runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/InstanceStartFailures.md',
48+
summary: 'Server tries to start an instance, but cannot for whatever reason. Investigation required.',
49+
description: 'Server cannot start workspace instances on workspace clusters.',
50+
},
51+
},
3852
],
3953
},
4054
],

0 commit comments

Comments
 (0)