Review notes for this patch (free text placed ahead of the first "diff --git" header).

Purpose: drop the hard-coded "namespace: monitoring-satellite" from five rule
manifests and make the server alerts cluster-aware (expressions grouped by the
"cluster" label, summaries mentioning {{ $labels.cluster }}).

Changes made during review:
- ServerEventLoopLagTooHigh: "by (cluster)" had been attached directly to
  avg_over_time(), which is a function and not an aggregation operator. The
  expression is now wrapped in max(...) by (cluster), so the alert fires when
  the worst series of a cluster exceeds the threshold.
- WebAppServicesCrashlooping: the grouping keeps namespace, pod and container in
  addition to cluster, because the description annotation renders those labels
  and the rule comment describes a per-pod check.

NOTE(review): leading whitespace and blank context lines inside the hunks were
reconstructed from a whitespace-collapsed copy of this patch; confirm them
against the target files before applying.

diff --git a/operations/observability/mixins/meta/rules/login-slo.yaml b/operations/observability/mixins/meta/rules/login-slo.yaml
index 751214bfb89c42..5731fbbbc2e86f 100644
--- a/operations/observability/mixins/meta/rules/login-slo.yaml
+++ b/operations/observability/mixins/meta/rules/login-slo.yaml
@@ -9,7 +9,6 @@ metadata:
     prometheus: k8s
     role: alert-rules
   name: login-slo-monitoring-rules
-  namespace: monitoring-satellite
 spec:
   groups:
   - name: login-slo-recording-rules
diff --git a/operations/observability/mixins/meta/rules/messagebus.yaml b/operations/observability/mixins/meta/rules/messagebus.yaml
index 2a246207ff90bc..3610241d9bbe9b 100644
--- a/operations/observability/mixins/meta/rules/messagebus.yaml
+++ b/operations/observability/mixins/meta/rules/messagebus.yaml
@@ -9,7 +9,6 @@ metadata:
     prometheus: k8s
     role: alert-rules
   name: messagebus-monitoring-rules
-  namespace: monitoring-satellite
 spec:
   groups:
   - name: messagebus
diff --git a/operations/observability/mixins/meta/rules/meta-nodes.yaml b/operations/observability/mixins/meta/rules/meta-nodes.yaml
index 788d6de2505996..217e6d1f900fba 100644
--- a/operations/observability/mixins/meta/rules/meta-nodes.yaml
+++ b/operations/observability/mixins/meta/rules/meta-nodes.yaml
@@ -9,7 +9,6 @@ metadata:
     prometheus: k8s
     role: alert-rules
   name: meta-nodes-monitoring-rules
-  namespace: monitoring-satellite
 spec:
   groups:
   - name: meta-nodes
diff --git a/operations/observability/mixins/meta/rules/server.yaml b/operations/observability/mixins/meta/rules/server.yaml
index 823135938883c4..2e20991d769f1b 100644
--- a/operations/observability/mixins/meta/rules/server.yaml
+++ b/operations/observability/mixins/meta/rules/server.yaml
@@ -9,34 +9,33 @@ metadata:
     prometheus: k8s
     role: alert-rules
   name: server-monitoring-rules
-  namespace: monitoring-satellite
 spec:
   groups:
   - name: server
     rules:
     - alert: WebsocketConnectionsNotClosing
-      expr: sum(server_websocket_connection_count) == 10000
+      expr: sum(server_websocket_connection_count) by (cluster) == 10000
       for: 10m
       labels:
         severity: critical
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebsocketConnectionsNotClosing.md
-        summary: Open websocket connections are not closing for the last 10 minutes and accumulating.
+        summary: Open websocket connections are not closing for the last 10 minutes and accumulating in {{ $labels.cluster }}.
         description: We have accumulated {{ printf "%.2f" $value }} open websocket connections.
 
     - alert: ServerEventLoopLagTooHigh
-      expr: avg_over_time(nodejs_eventloop_lag_seconds{job="server"}[20m]) > 0.35
+      expr: max(avg_over_time(nodejs_eventloop_lag_seconds{job="server"}[20m])) by (cluster) > 0.35
       for: 5m
       labels:
         severity: critical
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/ServerEventLoopLagTooHigh.md
-        summary: Server accumulated too much "event loop lag". The webapp will become unresponsive if we don't act here.
+        summary: Server accumulated too much "event loop lag" on {{ $labels.cluster }}. The webapp will become unresponsive if we don't act here.
         description: Server has accumulated {{ printf "%.2f" $value }}s event loop lag.
 
     - alert: InstanceStartFailures
       # Reasoning: 1 failure every 120s should not trigger an incident: 1/120 = 0.00833.. => 0.01
-      expr: sum (irate(gitpod_server_instance_starts_failed_total[2m])) by (reason) > 0.01
+      expr: sum(irate(gitpod_server_instance_starts_failed_total[2m])) by (reason) > 0.01
       for: 30s
       labels:
         severity: critical
@@ -48,7 +47,7 @@ spec:
     # Rollout alerts
     - alert: JsonRpcApiErrorRates
       # Reasoning: the values are taken from past data
-      expr: sum (rate(gitpod_server_api_calls_total{statusCode!~"2..|429"}[5m])) / sum(rate(gitpod_server_api_calls_total[5m])) > 0.04
+      expr: sum(rate(gitpod_server_api_calls_total{statusCode!~"2..|429"}[5m])) by (cluster) / sum(rate(gitpod_server_api_calls_total[5m])) by (cluster) > 0.04
       for: 5m
       labels:
         # sent to the team internal channel until we fine tuned it
@@ -56,7 +55,7 @@ spec:
         team: webapp
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodApiErrorRate.md
-        summary: The error rate of the JSON RPC API is high. Investigation required.
+        summary: The error rate of the JSON RPC API is high on {{ $labels.cluster }}. Investigation required.
         description: JSON RPC API error rate high
 
     - alert: WebsocketConnectionRateHigh
@@ -87,7 +86,7 @@ spec:
     # description: db-sync pod not running
 
     - alert: MessagebusNotRunning
-      expr: up{job="messagebus"} < 1
+      expr: sum(up{job="messagebus"}) by (cluster) < 1
       for: 2m
       labels:
         # sent to the team internal channel until we fine tuned it
@@ -95,12 +94,12 @@ spec:
         team: webapp
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/MessagebusNotRunning.md
-        summary: The messagebus pod is not running. Workspace information is not being correctly propagated into web app clusters. Investigation required.
+        summary: The messagebus pod is not running in {{ $labels.cluster }}. Workspace information is not being correctly propagated into web app clusters. Investigation required.
         description: Messagebus pod not running
 
     - alert: WebAppServicesHighCPUUsage
       # Reasoning: high rates of CPU consumption should only be temporary.
-      expr: sum(rate(container_cpu_usage_seconds_total{container!="POD", node=~".*", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (pod, node) > 0.80
+      expr: sum(rate(container_cpu_usage_seconds_total{container!="POD", node=~".*", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (pod, node, cluster) > 0.80
       for: 10m
       labels:
         # sent to the team internal channel until we fine tuned it
@@ -114,7 +113,7 @@ spec:
 
     - alert: WebAppServicesCrashlooping
       # Reasoning: alert if any pod is restarting more than 3 times / 5 minutes.
-      expr: increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m]) > 3
+      expr: sum(increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (cluster, namespace, pod, container) > 3
       for: 5m
       labels:
         # sent to the team internal channel until we fine tuned it
@@ -122,5 +121,5 @@ spec:
         team: webapp
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesCrashlooping.md
-        summary: Pod is crash looping.
+        summary: Pod is crash looping in {{ $labels.cluster }}.
         description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes
diff --git a/operations/observability/mixins/meta/rules/usage.yaml b/operations/observability/mixins/meta/rules/usage.yaml
index c851a901e177e4..ee382f98938da7 100644
--- a/operations/observability/mixins/meta/rules/usage.yaml
+++ b/operations/observability/mixins/meta/rules/usage.yaml
@@ -9,7 +9,6 @@ metadata:
     prometheus: k8s
     role: alert-rules
   name: usage-monitoring-rules
-  namespace: monitoring-satellite
 spec:
   groups:
   - name: usage