From 985289897fcae4316ec2bd176450f160d040a9b8 Mon Sep 17 00:00:00 2001 From: Gero Posmyk-Leinemann Date: Thu, 1 Sep 2022 15:43:36 +0000 Subject: [PATCH 1/3] [ops] WebApp: Fix alert WebsocketConnectionRateHigh by using a rate(total) instead of rate(gauge) --- .../mixins/meta/rules/components/server/alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet index a1ea19122b2938..785c0067a8ded8 100644 --- a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet +++ b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet @@ -69,8 +69,8 @@ { alert: 'WebsocketConnectionRateHigh', // Reasoning: the values are taken from past data - expr: 'sum(rate(server_websocket_connection_count[2m])) > 30', - 'for': '5m', + expr: 'sum(rate(gitpod_server_api_connections_total[2m])) by (pod) > 5', + 'for': '10m', labels: { // sent to the team internal channel until we fine tuned it severity: 'warning', From 3b70e4719957c9abeb557d2818617ed65c91d8a9 Mon Sep 17 00:00:00 2001 From: Gero Posmyk-Leinemann Date: Thu, 1 Sep 2022 15:44:45 +0000 Subject: [PATCH 2/3] [ops] WebApp: Remove rate(memory): rate(gauge) does not work --- .../rules/components/server/alerts.libsonnet | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet index 785c0067a8ded8..600185a293dcfa 100644 --- a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet +++ b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet @@ -116,22 +116,6 @@ description: 'Messagebus pod not running', }, }, - { - alert: 'WebAppServicesHighMemoryUsage', - // Reasoning: high rates of RAM consumption should only be temporary. Values based on past data (around 5-10 is constant) - expr: 'sum(rate(container_memory_working_set_bytes{container!="POD", node=~".*", pod=~"(server|ws-manager-bridge|usage)-.*"}[30m])) by (pod, node) > 10000000', - 'for': '15m', - labels: { - // sent to the team internal channel until we fine tuned it - severity: 'warning', - team: 'webapp' - }, - annotations: { - runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesHighMemoryUsage.md', - summary: 'WebApp services consume excessive amounts of memory. Investigation required.', - description: 'WebApp Services execcisve memory usage', - }, - }, { alert: 'WebAppServicesHighCPUUsage', // Reasoning: high rates of CPU consumption should only be temporary. From 316223fd5c98ddf2b1ff719db31942c28385e52e Mon Sep 17 00:00:00 2001 From: Gero Posmyk-Leinemann Date: Thu, 1 Sep 2022 15:45:29 +0000 Subject: [PATCH 3/3] [ops] WebApp: Fix WebAppServicesCrashlooping --- .../mixins/meta/rules/components/server/alerts.libsonnet | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet index 600185a293dcfa..a9af57c22489f2 100644 --- a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet +++ b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet @@ -134,8 +134,9 @@ }, { alert: 'WebAppServicesCrashlooping', - expr: 'sum(rate(container_cpu_usage_seconds_total{container!="POD", node=~".*", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (pod, node) > 0.80', - 'for': '15m', + // Reasoning: alert if any pod is restarting more than 3 times / 5 minutes. + expr: 'increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|db-sync|messagebus|payment-endpoint|proxy|server|ws-manager-bridge|usage)-.*"}[5m]) > 3', + 'for': '5m', labels: { // sent to the team internal channel until we fine tuned it severity: 'warning',