From f0fc2d8368a62dd7ec2c99e3fd132d2b087050e3 Mon Sep 17 00:00:00 2001 From: Milan Pavlik Date: Tue, 11 Oct 2022 12:52:59 +0000 Subject: [PATCH 1/2] [alerts] Group by cluster, where relevant, ahead of centralizing rule evaluation --- .../mixins/meta/rules/server.yaml | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/operations/observability/mixins/meta/rules/server.yaml b/operations/observability/mixins/meta/rules/server.yaml index 823135938883c4..ad458e6b1c43b1 100644 --- a/operations/observability/mixins/meta/rules/server.yaml +++ b/operations/observability/mixins/meta/rules/server.yaml @@ -15,28 +15,28 @@ spec: - name: server rules: - alert: WebsocketConnectionsNotClosing - expr: sum(server_websocket_connection_count) == 10000 + expr: sum(server_websocket_connection_count) by (cluster) == 10000 for: 10m labels: severity: critical annotations: runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebsocketConnectionsNotClosing.md - summary: Open websocket connections are not closing for the last 10 minutes and accumulating. + summary: Open websocket connections are not closing for the last 10 minutes and accumulating in {{ $labels.cluster }}. description: We have accumulated {{ printf "%.2f" $value }} open websocket connections. - alert: ServerEventLoopLagTooHigh - expr: avg_over_time(nodejs_eventloop_lag_seconds{job="server"}[20m]) > 0.35 + expr: max(avg_over_time(nodejs_eventloop_lag_seconds{job="server"}[20m])) by (cluster) > 0.35 for: 5m labels: severity: critical annotations: runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/ServerEventLoopLagTooHigh.md - summary: Server accumulated too much "event loop lag". The webapp will become unresponsive if we don't act here. + summary: Server accumulated too much "event loop lag" on {{ $labels.cluster }}. The webapp will become unresponsive if we don't act here. description: Server has accumulated {{ printf "%.2f" $value }}s event loop lag. 
- alert: InstanceStartFailures # Reasoning: 1 failure every 120s should not trigger an incident: 1/120 = 0.00833.. => 0.01 - expr: sum (irate(gitpod_server_instance_starts_failed_total[2m])) by (reason) > 0.01 + expr: sum(irate(gitpod_server_instance_starts_failed_total[2m])) by (reason) > 0.01 for: 30s labels: severity: critical @@ -48,7 +48,7 @@ spec: # Rollout alerts - alert: JsonRpcApiErrorRates # Reasoning: the values are taken from past data - expr: sum (rate(gitpod_server_api_calls_total{statusCode!~"2..|429"}[5m])) / sum(rate(gitpod_server_api_calls_total[5m])) > 0.04 + expr: sum(rate(gitpod_server_api_calls_total{statusCode!~"2..|429"}[5m])) by (cluster) / sum(rate(gitpod_server_api_calls_total[5m])) by (cluster) > 0.04 for: 5m labels: # sent to the team internal channel until we fine tuned it @@ -56,7 +56,7 @@ spec: team: webapp annotations: runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodApiErrorRate.md - summary: The error rate of the JSON RPC API is high. Investigation required. + summary: The error rate of the JSON RPC API is high on {{ $labels.cluster }}. Investigation required. description: JSON RPC API error rate high - alert: WebsocketConnectionRateHigh @@ -87,7 +87,7 @@ spec: # description: db-sync pod not running - alert: MessagebusNotRunning - expr: up{job="messagebus"} < 1 + expr: sum(up{job="messagebus"}) by (cluster) < 1 for: 2m labels: # sent to the team internal channel until we fine tuned it @@ -95,12 +95,12 @@ spec: team: webapp annotations: runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/MessagebusNotRunning.md - summary: The messagebus pod is not running. Workspace information is not being correctly propagated into web app clusters. Investigation required. + summary: The messagebus pod is not running in {{ $labels.cluster }}. Workspace information is not being correctly propagated into web app clusters. Investigation required. 
description: Messagebus pod not running - alert: WebAppServicesHighCPUUsage # Reasoning: high rates of CPU consumption should only be temporary. - expr: sum(rate(container_cpu_usage_seconds_total{container!="POD", node=~".*", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (pod, node) > 0.80 + expr: sum(rate(container_cpu_usage_seconds_total{container!="POD", node=~".*", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (pod, node, cluster) > 0.80 for: 10m labels: # sent to the team internal channel until we fine tuned it @@ -114,7 +114,7 @@ spec: - alert: WebAppServicesCrashlooping # Reasoning: alert if any pod is restarting more than 3 times / 5 minutes. - expr: increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m]) > 3 + expr: sum(increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (cluster, namespace, pod, container) > 3 for: 5m labels: # sent to the team internal channel until we fine tuned it @@ -122,5 +122,5 @@ spec: team: webapp annotations: runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesCrashlooping.md - summary: Pod is crash looping. + summary: Pod is crash looping in {{ $labels.cluster }}. 
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes From 4ee1f2cda14c1458671325e7137847aa3f74bea5 Mon Sep 17 00:00:00 2001 From: Milan Pavlik Date: Fri, 14 Oct 2022 12:27:16 +0000 Subject: [PATCH 2/2] [webapp] Remove namespace from monitoring rules --- operations/observability/mixins/meta/rules/login-slo.yaml | 1 - operations/observability/mixins/meta/rules/messagebus.yaml | 1 - operations/observability/mixins/meta/rules/meta-nodes.yaml | 1 - operations/observability/mixins/meta/rules/server.yaml | 1 - operations/observability/mixins/meta/rules/usage.yaml | 1 - 5 files changed, 5 deletions(-) diff --git a/operations/observability/mixins/meta/rules/login-slo.yaml b/operations/observability/mixins/meta/rules/login-slo.yaml index 751214bfb89c42..5731fbbbc2e86f 100644 --- a/operations/observability/mixins/meta/rules/login-slo.yaml +++ b/operations/observability/mixins/meta/rules/login-slo.yaml @@ -9,7 +9,6 @@ metadata: prometheus: k8s role: alert-rules name: login-slo-monitoring-rules - namespace: monitoring-satellite spec: groups: - name: login-slo-recording-rules diff --git a/operations/observability/mixins/meta/rules/messagebus.yaml b/operations/observability/mixins/meta/rules/messagebus.yaml index 2a246207ff90bc..3610241d9bbe9b 100644 --- a/operations/observability/mixins/meta/rules/messagebus.yaml +++ b/operations/observability/mixins/meta/rules/messagebus.yaml @@ -9,7 +9,6 @@ metadata: prometheus: k8s role: alert-rules name: messagebus-monitoring-rules - namespace: monitoring-satellite spec: groups: - name: messagebus diff --git a/operations/observability/mixins/meta/rules/meta-nodes.yaml b/operations/observability/mixins/meta/rules/meta-nodes.yaml index 788d6de2505996..217e6d1f900fba 100644 --- a/operations/observability/mixins/meta/rules/meta-nodes.yaml +++ b/operations/observability/mixins/meta/rules/meta-nodes.yaml @@ -9,7 +9,6 @@ metadata: prometheus: k8s role: 
alert-rules name: meta-nodes-monitoring-rules - namespace: monitoring-satellite spec: groups: - name: meta-nodes diff --git a/operations/observability/mixins/meta/rules/server.yaml b/operations/observability/mixins/meta/rules/server.yaml index ad458e6b1c43b1..2e20991d769f1b 100644 --- a/operations/observability/mixins/meta/rules/server.yaml +++ b/operations/observability/mixins/meta/rules/server.yaml @@ -9,7 +9,6 @@ metadata: prometheus: k8s role: alert-rules name: server-monitoring-rules - namespace: monitoring-satellite spec: groups: - name: server diff --git a/operations/observability/mixins/meta/rules/usage.yaml b/operations/observability/mixins/meta/rules/usage.yaml index c851a901e177e4..ee382f98938da7 100644 --- a/operations/observability/mixins/meta/rules/usage.yaml +++ b/operations/observability/mixins/meta/rules/usage.yaml @@ -9,7 +9,6 @@ metadata: prometheus: k8s role: alert-rules name: usage-monitoring-rules - namespace: monitoring-satellite spec: groups: - name: usage