[alerts] Group by cluster, where relevant, ahead of centralizing rule evaluation #13766

Merged: 2 commits, Oct 26, 2022
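
The change follows a single pattern across the touched rule files: the hard-coded namespace: monitoring-satellite metadata is dropped, alert expressions gain a by (cluster) grouping so that each web app cluster produces its own alert series once rule evaluation is centralized, and summaries interpolate {{ $labels.cluster }} so the affected cluster is visible at a glance. A minimal before/after sketch of the pattern, taken from the MessagebusNotRunning rule in this diff (annotations abridged):

    # before: a single series, evaluated per cluster by its local Prometheus
    - alert: MessagebusNotRunning
      expr: up{job="messagebus"} < 1
      annotations:
        summary: The messagebus pod is not running.

    # after: aggregate per cluster and surface the cluster in the summary
    - alert: MessagebusNotRunning
      expr: sum(up{job="messagebus"}) by (cluster) < 1
      annotations:
        summary: The messagebus pod is not running in {{ $labels.cluster }}.
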
@@ -9,7 +9,6 @@ metadata:
prometheus: k8s
role: alert-rules
name: login-slo-monitoring-rules
-namespace: monitoring-satellite
spec:
groups:
- name: login-slo-recording-rules
@@ -9,7 +9,6 @@ metadata:
prometheus: k8s
role: alert-rules
name: messagebus-monitoring-rules
-namespace: monitoring-satellite
spec:
groups:
- name: messagebus
@@ -9,7 +9,6 @@ metadata:
prometheus: k8s
role: alert-rules
name: meta-nodes-monitoring-rules
-namespace: monitoring-satellite
spec:
groups:
- name: meta-nodes
operations/observability/mixins/meta/rules/server.yaml (25 changes: 12 additions, 13 deletions)
@@ -9,34 +9,33 @@ metadata:
prometheus: k8s
role: alert-rules
name: server-monitoring-rules
-namespace: monitoring-satellite
spec:
groups:
- name: server
rules:
- alert: WebsocketConnectionsNotClosing
-expr: sum(server_websocket_connection_count) == 10000
+expr: sum(server_websocket_connection_count) by (cluster) == 10000
for: 10m
labels:
severity: critical
annotations:
runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebsocketConnectionsNotClosing.md
-summary: Open websocket connections are not closing for the last 10 minutes and accumulating.
+summary: Open websocket connections are not closing for the last 10 minutes and accumulating in {{ $labels.cluster }}.
description: We have accumulated {{ printf "%.2f" $value }} open websocket connections.

- alert: ServerEventLoopLagTooHigh
-expr: avg_over_time(nodejs_eventloop_lag_seconds{job="server"}[20m]) > 0.35
+expr: avg by (cluster) (avg_over_time(nodejs_eventloop_lag_seconds{job="server"}[20m])) > 0.35
for: 5m
labels:
severity: critical
annotations:
runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/ServerEventLoopLagTooHigh.md
-summary: Server accumulated too much "event loop lag". The webapp will become unresponsive if we don't act here.
+summary: Server accumulated too much "event loop lag" on {{ $labels.cluster }}. The webapp will become unresponsive if we don't act here.
description: Server has accumulated {{ printf "%.2f" $value }}s event loop lag.

- alert: InstanceStartFailures
# Reasoning: 1 failure every 120s should not trigger an incident: 1/120 = 0.00833.. => 0.01
-expr: sum (irate(gitpod_server_instance_starts_failed_total[2m])) by (reason) > 0.01
+expr: sum(irate(gitpod_server_instance_starts_failed_total[2m])) by (reason) > 0.01
for: 30s
labels:
severity: critical
@@ -48,15 +47,15 @@ spec:
# Rollout alerts
- alert: JsonRpcApiErrorRates
# Reasoning: the values are taken from past data
-expr: sum (rate(gitpod_server_api_calls_total{statusCode!~"2..|429"}[5m])) / sum(rate(gitpod_server_api_calls_total[5m])) > 0.04
+expr: sum(rate(gitpod_server_api_calls_total{statusCode!~"2..|429"}[5m])) by (cluster) / sum(rate(gitpod_server_api_calls_total[5m])) by (cluster) > 0.04
for: 5m
labels:
# sent to the team internal channel until we fine tuned it
severity: warning
team: webapp
annotations:
runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodApiErrorRate.md
-summary: The error rate of the JSON RPC API is high. Investigation required.
+summary: The error rate of the JSON RPC API is high on {{ $labels.cluster }}. Investigation required.
description: JSON RPC API error rate high

- alert: WebsocketConnectionRateHigh
@@ -87,20 +86,20 @@ spec:
# description: db-sync pod not running

- alert: MessagebusNotRunning
-expr: up{job="messagebus"} < 1
+expr: sum(up{job="messagebus"}) by (cluster) < 1

A Contributor commented:
We use sum now because with the centralization, the metric can be aggregated across all Webapp clusters?

@easyCZ (Member, Author) replied on Oct 11, 2022:

Yep, with the centralization there would be 2 things happening:

  1. There would be multiple series for the raw metric, one for each cluster.
  2. Grouping by (cluster) can only be applied through an aggregation over the series (vectors), so we need to sum anyway to be able to group by cluster (see the sketch below).
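
For illustration, a rough sketch of what the raw series could look like once several clusters report into one Prometheus, and why the grouping needs an aggregation operator (the cluster label values here are hypothetical):

    # one series per reporting cluster, e.g.:
    #   up{job="messagebus", cluster="us01"}  1
    #   up{job="messagebus", cluster="eu02"}  0
    #
    # by (cluster) can only hang off an aggregation operator, hence the sum:
    sum(up{job="messagebus"}) by (cluster) < 1
    # fires only for the eu02 series, and {{ $labels.cluster }} resolves to eu02 in the annotations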

for: 2m
labels:
# sent to the team internal channel until we fine tuned it
severity: warning
team: webapp
annotations:
runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/MessagebusNotRunning.md
-summary: The messagebus pod is not running. Workspace information is not being correctly propagated into web app clusters. Investigation required.
+summary: The messagebus pod is not running in {{ $labels.cluster }}. Workspace information is not being correctly propagated into web app clusters. Investigation required.
description: Messagebus pod not running

- alert: WebAppServicesHighCPUUsage
# Reasoning: high rates of CPU consumption should only be temporary.
-expr: sum(rate(container_cpu_usage_seconds_total{container!="POD", node=~".*", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (pod, node) > 0.80
+expr: sum(rate(container_cpu_usage_seconds_total{container!="POD", node=~".*", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (pod, node, cluster) > 0.80
for: 10m
labels:
# sent to the team internal channel until we fine tuned it
@@ -114,13 +113,13 @@ spec:

- alert: WebAppServicesCrashlooping
# Reasoning: alert if any pod is restarting more than 3 times / 5 minutes.
-expr: increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m]) > 3
+expr: sum(increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (cluster, namespace, pod, container) > 3
for: 5m
labels:
# sent to the team internal channel until we fine tuned it
severity: warning
team: webapp
annotations:
runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesCrashlooping.md
-summary: Pod is crash looping.
+summary: Pod is crash looping in {{ $labels.cluster }}.
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes
operations/observability/mixins/meta/rules/usage.yaml (1 change: 0 additions, 1 deletion)
@@ -9,7 +9,6 @@ metadata:
prometheus: k8s
role: alert-rules
name: usage-monitoring-rules
-namespace: monitoring-satellite
spec:
groups:
- name: usage