File tree 1 file changed +14
-0
lines changed
operations/observability/mixins/meta/rules/components/server
1 file changed +14
-0
lines changed Original file line number Diff line number Diff line change 35
35
description: 'Server has accumulated {{ printf "%.2f" $value }}ms event loop lag.' ,
36
36
},
37
37
},
38
{
  // Fires when the server keeps failing to start workspace instances.
  // Failures are aggregated per `reason` label, so each failure reason is
  // evaluated (and alerts) independently.
  alert: 'InstanceStartFailures',
  // Reasoning: 1 failure every 120s should not trigger an incident: 1/120 = 0.00833.. => 0.01
  // rate() is used (not irate()) so that the per-second value is averaged over
  // the whole 2m window, which is what the threshold above is derived from.
  // irate() only looks at the last two samples, so a single failure would
  // yield roughly 1/<scrape interval> and exceed 0.01 on its own.
  expr: 'sum (rate(gitpod_server_instance_starts_failed_total[2m])) by (reason) > 0.01',
  // The condition has to hold for 30s before the alert goes from pending to firing.
  'for': '30s',
  labels: {
    severity: 'critical',
  },
  annotations: {
    runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/InstanceStartFailures.md',
    summary: 'Server tries to start an instance, but cannot for whatever reason. Investigation required.',
    description: 'Server cannot start workspace instances on workspace clusters.',
  },
},
38
52
],
39
53
},
40
54
],
You can’t perform that action at this time.
0 commit comments