From f0fc2d8368a62dd7ec2c99e3fd132d2b087050e3 Mon Sep 17 00:00:00 2001
From: Milan Pavlik <milan@gitpod.io>
Date: Tue, 11 Oct 2022 12:52:59 +0000
Subject: [PATCH 1/2] [alerts] Group by cluster, where relevant, ahead of
 centralizing rule evaluation

---
 .../mixins/meta/rules/server.yaml             | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/operations/observability/mixins/meta/rules/server.yaml b/operations/observability/mixins/meta/rules/server.yaml
index 823135938883c4..ad458e6b1c43b1 100644
--- a/operations/observability/mixins/meta/rules/server.yaml
+++ b/operations/observability/mixins/meta/rules/server.yaml
@@ -15,28 +15,28 @@ spec:
   - name: server
     rules:
     - alert: WebsocketConnectionsNotClosing
-      expr: sum(server_websocket_connection_count) == 10000
+      expr: sum(server_websocket_connection_count) by (cluster) == 10000
       for: 10m
       labels:
         severity: critical
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebsocketConnectionsNotClosing.md
-        summary: Open websocket connections are not closing for the last 10 minutes and accumulating.
+        summary: Open websocket connections are not closing for the last 10 minutes and accumulating in {{ $labels.cluster }}.
         description: We have accumulated {{ printf "%.2f" $value }} open websocket connections.
 
     - alert: ServerEventLoopLagTooHigh
-      expr: avg_over_time(nodejs_eventloop_lag_seconds{job="server"}[20m]) > 0.35
+      expr: max(avg_over_time(nodejs_eventloop_lag_seconds{job="server"}[20m])) by (cluster) > 0.35  # avg_over_time() is a function and cannot take "by"; max() keeps the highest series per cluster
       for: 5m
       labels:
         severity: critical
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/ServerEventLoopLagTooHigh.md
-        summary: Server accumulated too much "event loop lag". The webapp will become unresponsive if we don't act here.
+        summary: Server accumulated too much "event loop lag" on {{ $labels.cluster }}. The webapp will become unresponsive if we don't act here.
         description: Server has accumulated {{ printf "%.2f" $value }}s event loop lag.
 
     - alert: InstanceStartFailures
       # Reasoning: 1 failure every 120s should not trigger an incident: 1/120 = 0.00833.. => 0.01
-      expr: sum (irate(gitpod_server_instance_starts_failed_total[2m])) by (reason) > 0.01
+      expr: sum(irate(gitpod_server_instance_starts_failed_total[2m])) by (reason) > 0.01
       for: 30s
       labels:
         severity: critical
@@ -48,7 +48,7 @@ spec:
     # Rollout alerts
     - alert: JsonRpcApiErrorRates
       # Reasoning: the values are taken from past data
-      expr: sum (rate(gitpod_server_api_calls_total{statusCode!~"2..|429"}[5m])) / sum(rate(gitpod_server_api_calls_total[5m])) > 0.04
+      expr: sum(rate(gitpod_server_api_calls_total{statusCode!~"2..|429"}[5m])) by (cluster) / sum(rate(gitpod_server_api_calls_total[5m])) by (cluster) > 0.04
       for: 5m
       labels:
         # sent to the team internal channel until we fine tuned it
@@ -56,7 +56,7 @@ spec:
         team: webapp
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodApiErrorRate.md
-        summary: The error rate of the JSON RPC API is high. Investigation required.
+        summary: The error rate of the JSON RPC API is high on {{ $labels.cluster }}. Investigation required.
         description: JSON RPC API error rate high
 
     - alert: WebsocketConnectionRateHigh
@@ -87,7 +87,7 @@ spec:
     #     description: db-sync pod not running
 
     - alert: MessagebusNotRunning
-      expr: up{job="messagebus"} < 1
+      expr: sum(up{job="messagebus"}) by (cluster) < 1
       for: 2m
       labels:
         # sent to the team internal channel until we fine tuned it
@@ -95,12 +95,12 @@ spec:
         team: webapp
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/MessagebusNotRunning.md
-        summary: The messagebus pod is not running. Workspace information is not being correctly propagated into web app clusters. Investigation required.
+        summary: The messagebus pod is not running in {{ $labels.cluster }}. Workspace information is not being correctly propagated into web app clusters. Investigation required.
         description: Messagebus pod not running
 
     - alert: WebAppServicesHighCPUUsage
       # Reasoning: high rates of CPU consumption should only be temporary.
-      expr: sum(rate(container_cpu_usage_seconds_total{container!="POD", node=~".*", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (pod, node) > 0.80
+      expr: sum(rate(container_cpu_usage_seconds_total{container!="POD", node=~".*", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (pod, node, cluster) > 0.80
       for: 10m
       labels:
         # sent to the team internal channel until we fine tuned it
@@ -114,7 +114,7 @@ spec:
 
     - alert: WebAppServicesCrashlooping
       # Reasoning: alert if any pod is restarting more than 3 times / 5 minutes.
-      expr: increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m]) > 3
+      expr: sum(increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (cluster, namespace, pod, container) > 3  # keep namespace/pod/container: the description annotation interpolates them
       for: 5m
       labels:
         # sent to the team internal channel until we fine tuned it
@@ -122,5 +122,5 @@ spec:
         team: webapp
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesCrashlooping.md
-        summary: Pod is crash looping.
+        summary: Pod is crash looping in {{ $labels.cluster }}.
         description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes

From 4ee1f2cda14c1458671325e7137847aa3f74bea5 Mon Sep 17 00:00:00 2001
From: Milan Pavlik <milan@gitpod.io>
Date: Fri, 14 Oct 2022 12:27:16 +0000
Subject: [PATCH 2/2] [webapp] Remove namespace from monitoring rules

---
 operations/observability/mixins/meta/rules/login-slo.yaml  | 1 -
 operations/observability/mixins/meta/rules/messagebus.yaml | 1 -
 operations/observability/mixins/meta/rules/meta-nodes.yaml | 1 -
 operations/observability/mixins/meta/rules/server.yaml     | 1 -
 operations/observability/mixins/meta/rules/usage.yaml      | 1 -
 5 files changed, 5 deletions(-)

diff --git a/operations/observability/mixins/meta/rules/login-slo.yaml b/operations/observability/mixins/meta/rules/login-slo.yaml
index 751214bfb89c42..5731fbbbc2e86f 100644
--- a/operations/observability/mixins/meta/rules/login-slo.yaml
+++ b/operations/observability/mixins/meta/rules/login-slo.yaml
@@ -9,7 +9,6 @@ metadata:
     prometheus: k8s
     role: alert-rules
   name: login-slo-monitoring-rules
-  namespace: monitoring-satellite
 spec:
   groups:
   - name: login-slo-recording-rules
diff --git a/operations/observability/mixins/meta/rules/messagebus.yaml b/operations/observability/mixins/meta/rules/messagebus.yaml
index 2a246207ff90bc..3610241d9bbe9b 100644
--- a/operations/observability/mixins/meta/rules/messagebus.yaml
+++ b/operations/observability/mixins/meta/rules/messagebus.yaml
@@ -9,7 +9,6 @@ metadata:
     prometheus: k8s
     role: alert-rules
   name: messagebus-monitoring-rules
-  namespace: monitoring-satellite
 spec:
   groups:
   - name: messagebus
diff --git a/operations/observability/mixins/meta/rules/meta-nodes.yaml b/operations/observability/mixins/meta/rules/meta-nodes.yaml
index 788d6de2505996..217e6d1f900fba 100644
--- a/operations/observability/mixins/meta/rules/meta-nodes.yaml
+++ b/operations/observability/mixins/meta/rules/meta-nodes.yaml
@@ -9,7 +9,6 @@ metadata:
     prometheus: k8s
     role: alert-rules
   name: meta-nodes-monitoring-rules
-  namespace: monitoring-satellite
 spec:
   groups:
   - name: meta-nodes
diff --git a/operations/observability/mixins/meta/rules/server.yaml b/operations/observability/mixins/meta/rules/server.yaml
index ad458e6b1c43b1..2e20991d769f1b 100644
--- a/operations/observability/mixins/meta/rules/server.yaml
+++ b/operations/observability/mixins/meta/rules/server.yaml
@@ -9,7 +9,6 @@ metadata:
     prometheus: k8s
     role: alert-rules
   name: server-monitoring-rules
-  namespace: monitoring-satellite
 spec:
   groups:
   - name: server
diff --git a/operations/observability/mixins/meta/rules/usage.yaml b/operations/observability/mixins/meta/rules/usage.yaml
index c851a901e177e4..ee382f98938da7 100644
--- a/operations/observability/mixins/meta/rules/usage.yaml
+++ b/operations/observability/mixins/meta/rules/usage.yaml
@@ -9,7 +9,6 @@ metadata:
     prometheus: k8s
     role: alert-rules
   name: usage-monitoring-rules
-  namespace: monitoring-satellite
 spec:
   groups:
   - name: usage