Skip to content

Commit 92a0b0f

Browse files
committed
[alerts] Group by cluster, where relevant, ahead of centralizing rule evaluation
1 parent 6fb1ac0 commit 92a0b0f

File tree

1 file changed

+11
-11
lines changed

1 file changed

+11
-11
lines changed

operations/observability/mixins/meta/rules/server.yaml

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,23 +15,23 @@ spec:
1515
- name: server
1616
rules:
1717
- alert: WebsocketConnectionsNotClosing
18-
expr: sum(server_websocket_connection_count) == 10000
18+
expr: sum(server_websocket_connection_count) by (cluster) == 10000
1919
for: 10m
2020
labels:
2121
severity: critical
2222
annotations:
2323
runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebsocketConnectionsNotClosing.md
24-
summary: Open websocket connections are not closing for the last 10 minutes and accumulating.
24+
summary: Open websocket connections are not closing for the last 10 minutes and accumulating in {{ $labels.cluster }}.
2525
description: We have accumulated {{ printf "%.2f" $value }} open websocket connections.
2626

2727
- alert: ServerEventLoopLagTooHigh
28-
expr: avg_over_time(nodejs_eventloop_lag_seconds{job="server"}[20m]) > 0.35
28+
# avg_over_time() is a range-vector function and cannot take a `by` clause; wrap it in the avg() aggregation to group per cluster.
expr: avg(avg_over_time(nodejs_eventloop_lag_seconds{job="server"}[20m])) by (cluster) > 0.35
2929
for: 5m
3030
labels:
3131
severity: critical
3232
annotations:
3333
runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/ServerEventLoopLagTooHigh.md
34-
summary: Server accumulated too much "event loop lag". The webapp will become unresponsive if we don't act here.
34+
summary: Server accumulated too much "event loop lag" on {{ $labels.cluster }}. The webapp will become unresponsive if we don't act here.
3535
description: Server has accumulated {{ printf "%.2f" $value }}s event loop lag.
3636

3737
- alert: InstanceStartFailures
@@ -48,15 +48,15 @@ spec:
4848
# Rollout alerts
4949
- alert: JsonRpcApiErrorRates
5050
# Reasoning: the values are taken from past data
51-
expr: sum (rate(gitpod_server_api_calls_total{statusCode!~"2..|429"}[5m])) / sum(rate(gitpod_server_api_calls_total[5m])) > 0.04
51+
expr: sum (rate(gitpod_server_api_calls_total{statusCode!~"2..|429"}[5m])) by (cluster) / sum(rate(gitpod_server_api_calls_total[5m])) by (cluster) > 0.04
5252
for: 5m
5353
labels:
5454
# sent to the team internal channel until we fine tuned it
5555
severity: warning
5656
team: webapp
5757
annotations:
5858
runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodApiErrorRate.md
59-
summary: The error rate of the JSON RPC API is high. Investigation required.
59+
summary: The error rate of the JSON RPC API is high on {{ $labels.cluster }}. Investigation required.
6060
description: JSON RPC API error rate high
6161

6262
- alert: WebsocketConnectionRateHigh
@@ -87,20 +87,20 @@ spec:
8787
# description: db-sync pod not running
8888

8989
- alert: MessagebusNotRunning
90-
expr: up{job="messagebus"} < 1
90+
expr: sum(up{job="messagebus"}) by (cluster) < 1
9191
for: 2m
9292
labels:
9393
# sent to the team internal channel until we fine tuned it
9494
severity: warning
9595
team: webapp
9696
annotations:
9797
runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/MessagebusNotRunning.md
98-
summary: The messagebus pod is not running. Workspace information is not being correctly propagated into web app clusters. Investigation required.
98+
summary: The messagebus pod is not running in {{ $labels.cluster }}. Workspace information is not being correctly propagated into web app clusters. Investigation required.
9999
description: Messagebus pod not running
100100

101101
- alert: WebAppServicesHighCPUUsage
102102
# Reasoning: high rates of CPU consumption should only be temporary.
103-
expr: sum(rate(container_cpu_usage_seconds_total{container!="POD", node=~".*", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (pod, node) > 0.80
103+
expr: sum(rate(container_cpu_usage_seconds_total{container!="POD", node=~".*", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (pod, node, cluster) > 0.80
104104
for: 10m
105105
labels:
106106
# sent to the team internal channel until we fine tuned it
@@ -114,13 +114,13 @@ spec:
114114

115115
- alert: WebAppServicesCrashlooping
116116
# Reasoning: alert if any pod is restarting more than 3 times / 5 minutes.
117-
expr: increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m]) > 3
117+
# Keep namespace/pod/container in the grouping: the description template renders them, and the threshold is meant per pod, not summed across the cluster.
expr: sum(increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (cluster, namespace, pod, container) > 3
118118
for: 5m
119119
labels:
120120
# sent to the team internal channel until we fine tuned it
121121
severity: warning
122122
team: webapp
123123
annotations:
124124
runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesCrashlooping.md
125-
summary: Pod is crash looping.
125+
summary: Pod is crash looping in {{ $labels.cluster }}.
126126
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes

0 commit comments

Comments
 (0)