[alerts] Group by cluster, where relevant, ahead of centralizing rule evaluation

easyCZ · easyCZ · commit 92a0b0fa5e24 · 2022-10-11T12:52:59.000Z
diff --git a/operations/observability/mixins/meta/rules/server.yaml b/operations/observability/mixins/meta/rules/server.yaml
@@ -15,23 +15,23 @@ spec:
   - name: server
     rules:
     - alert: WebsocketConnectionsNotClosing
-      expr: sum(server_websocket_connection_count) == 10000
+      expr: sum(server_websocket_connection_count) by (cluster) == 10000  # summed per cluster so the summary can name the cluster
       for: 10m
       labels:
         severity: critical
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebsocketConnectionsNotClosing.md
-        summary: Open websocket connections are not closing for the last 10 minutes and accumulating.
+        summary: Open websocket connections are not closing for the last 10 minutes and accumulating in {{ $labels.cluster }}.
         description: We have accumulated {{ printf "%.2f" $value }} open websocket connections.
 
     - alert: ServerEventLoopLagTooHigh
-      expr: avg_over_time(nodejs_eventloop_lag_seconds{job="server"}[20m]) > 0.35
+      expr: max(avg_over_time(nodejs_eventloop_lag_seconds{job="server"}[20m])) by (cluster) > 0.35  # worst server instance per cluster; by() is only valid on an aggregation operator
       for: 5m
       labels:
         severity: critical
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/ServerEventLoopLagTooHigh.md
-        summary: Server accumulated too much "event loop lag". The webapp will become unresponsive if we don't act here.
+        summary: Server accumulated too much "event loop lag" on {{ $labels.cluster }}. The webapp will become unresponsive if we don't act here.
         description: Server has accumulated {{ printf "%.2f" $value }}s event loop lag.
 
     - alert: InstanceStartFailures
@@ -48,15 +48,15 @@ spec:
     # Rollout alerts
     - alert: JsonRpcApiErrorRates
       # Reasoning: the values are taken from past data
-      expr: sum (rate(gitpod_server_api_calls_total{statusCode!~"2..|429"}[5m])) / sum(rate(gitpod_server_api_calls_total[5m])) > 0.04
+      expr: sum (rate(gitpod_server_api_calls_total{statusCode!~"2..|429"}[5m])) by (cluster) / sum(rate(gitpod_server_api_calls_total[5m])) by (cluster) > 0.04  # per-cluster ratio of non-2xx/non-429 calls to all calls
       for: 5m
       labels:
         # sent to the team internal channel until we fine tuned it
         severity: warning
         team: webapp
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodApiErrorRate.md
-        summary: The error rate of the JSON RPC API is high. Investigation required.
+        summary: The error rate of the JSON RPC API is high on {{ $labels.cluster }}. Investigation required.
         description: JSON RPC API error rate high
 
     - alert: WebsocketConnectionRateHigh
@@ -87,20 +87,20 @@ spec:
     #     description: db-sync pod not running
 
     - alert: MessagebusNotRunning
-      expr: up{job="messagebus"} < 1
+      expr: sum(up{job="messagebus"}) by (cluster) < 1  # fires only when no messagebus target in the cluster reports up
       for: 2m
       labels:
         # sent to the team internal channel until we fine tuned it
         severity: warning
         team: webapp
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/MessagebusNotRunning.md
-        summary: The messagebus pod is not running. Workspace information is not being correctly propagated into web app clusters. Investigation required.
+        summary: The messagebus pod is not running in {{ $labels.cluster }}. Workspace information is not being correctly propagated into web app clusters. Investigation required.
         description: Messagebus pod not running
 
     - alert: WebAppServicesHighCPUUsage
       # Reasoning: high rates of CPU consumption should only be temporary.
-      expr: sum(rate(container_cpu_usage_seconds_total{container!="POD", node=~".*", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (pod, node) > 0.80
+      expr: sum(rate(container_cpu_usage_seconds_total{container!="POD", node=~".*", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (pod, node, cluster) > 0.80
       for: 10m
       labels:
         # sent to the team internal channel until we fine tuned it
@@ -114,13 +114,13 @@ spec:
 
     - alert: WebAppServicesCrashlooping
       # Reasoning: alert if any pod is restarting more than 3 times / 5 minutes.
-      expr: increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m]) > 3
+      expr: sum(increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (cluster, namespace, pod, container) > 3  # keep per-container grouping: the threshold is per pod and the description reads these labels
       for: 5m
       labels:
         # sent to the team internal channel until we fine tuned it
         severity: warning
         team: webapp
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesCrashlooping.md
-        summary: Pod is crash looping.
+        summary: Pod is crash looping in {{ $labels.cluster }}.
         description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes