diff --git a/components/server/src/prometheus-metrics.ts b/components/server/src/prometheus-metrics.ts
index bd719d734e2395..aa240be4630a79 100644
--- a/components/server/src/prometheus-metrics.ts
+++ b/components/server/src/prometheus-metrics.ts
@@ -123,4 +123,38 @@ const gitpodVersionInfo = new prometheusClient.Gauge({
 
 export function setGitpodVersion(gitpod_version: string){
     gitpodVersionInfo.set({gitpod_version}, 1)
+}
+
+/** Counter of successful instance starts, partitioned by the `retries` label. */
+const instanceStartsSuccessTotal = new prometheusClient.Counter({
+    name: 'gitpod_server_instance_starts_success_total',
+    help: 'Total amount of successfully performed instance starts',
+    labelNames: ['retries'],
+    registers: [prometheusClient.register],
+});
+
+/**
+ * Increments the counter of successful instance starts by one.
+ *
+ * @param retries value recorded in the `retries` label; defaults to 0
+ */
+export function increaseSuccessfulInstanceStartCounter(retries: number = 0) {
+    instanceStartsSuccessTotal.inc({ retries });
+}
+
+/** Counter of failed instance starts, partitioned by the `reason` label. */
+const instanceStartsFailedTotal = new prometheusClient.Counter({
+    name: 'gitpod_server_instance_starts_failed_total',
+    help: 'Total amount of failed instance starts',
+    labelNames: ['reason'],
+    registers: [prometheusClient.register],
+});
+
+/**
+ * Increments the counter of failed instance starts by one.
+ *
+ * @param reason value recorded in the `reason` label
+ */
+export function increaseFailedInstanceStartCounter(reason: "clusterSelectionFailed" | "startOnClusterFailed") {
+    instanceStartsFailedTotal.inc({ reason });
 }
\ No newline at end of file
diff --git a/components/server/src/workspace/workspace-starter.ts b/components/server/src/workspace/workspace-starter.ts
index 56bb9d9155a262..b86c59b32677af 100644
--- a/components/server/src/workspace/workspace-starter.ts
+++ b/components/server/src/workspace/workspace-starter.ts
@@ -36,6 +36,7 @@ import { WithReferrerContext } from "@gitpod/gitpod-protocol/lib/protocol";
 import { IDEOption } from "@gitpod/gitpod-protocol/lib/ide-protocol";
 import { Deferred } from "@gitpod/gitpod-protocol/lib/util/deferred";
 import { ExtendedUser } from "@gitpod/ws-manager/lib/constraints";
+import { increaseFailedInstanceStartCounter, increaseSuccessfulInstanceStartCounter } from "../prometheus-metrics";
 
 export interface
StartWorkspaceOptions { rethrow?: boolean; @@ -43,6 +44,9 @@ export interface StartWorkspaceOptions { excludeFeatureFlags?: NamedWorkspaceFeatureFlag[]; } +const MAX_INSTANCE_START_RETRIES = 2; +const INSTANCE_START_RETRY_INTERVAL_SECONDS = 2; + @injectable() export class WorkspaceStarter { @inject(WorkspaceManagerClientProvider) protected readonly clientProvider: WorkspaceManagerClientProvider; @@ -180,45 +184,29 @@ export class WorkspaceStarter { const euser: ExtendedUser = { ...user, getsMoreResources: await this.userService.userGetsMoreResources(user), - } - - // tell the world we're starting this instance - let resp: StartWorkspaceResponse.AsObject | undefined; - let lastInstallation = ""; - const clusters = await this.clientProvider.getStartClusterSets(euser, workspace, instance); - for await (let cluster of clusters) { - try { - // getStartManager will throw an exception if there's no cluster available and hence exit the loop - const { manager, installation } = cluster; - lastInstallation = installation; - - instance.status.phase = "pending"; - instance.region = installation; - await this.workspaceDb.trace({ span }).storeInstance(instance); - try { - await this.messageBus.notifyOnInstanceUpdate(workspace.ownerId, instance); - } catch (err) { - // if sending the notification fails that's no reason to stop the workspace creation. - // If the dashboard misses this event it will catch up at the next one. 
-                    span.log({ "notifyOnInstanceUpdate.error": err });
-                    log.debug("cannot send instance update - this should be mostly inconsequential", err);
-                }
+        };
 
-                // start that thing
-                log.info({ instanceId: instance.id }, 'starting instance');
-                resp = (await manager.startWorkspace({ span }, startRequest)).toObject();
-                break;
-            } catch (err: any) {
-                if ('code' in err && err.code !== grpc.status.OK && lastInstallation !== "") {
-                    log.error({ instanceId: instance.id }, "cannot start workspace on cluster, might retry", err, { cluster: lastInstallation });
-                } else {
-                    throw err;
+        // choose a cluster and start the instance
+        let resp: StartWorkspaceResponse.AsObject | undefined = undefined;
+        let retries = 0;
+        try {
+            for (; retries < MAX_INSTANCE_START_RETRIES; retries++) {
+                resp = await this.tryStartOnCluster({ span }, startRequest, euser, workspace, instance);
+                if (resp) {
+                    break;
                 }
+                await new Promise((resolve) => setTimeout(resolve, INSTANCE_START_RETRY_INTERVAL_SECONDS * 1000));
             }
+        } catch (err) {
+            increaseFailedInstanceStartCounter("startOnClusterFailed");
+            throw err;
         }
+
         if (!resp) {
+            increaseFailedInstanceStartCounter("clusterSelectionFailed");
             throw new Error("cannot start a workspace because no workspace clusters are available");
         }
+        increaseSuccessfulInstanceStartCounter(retries);
 
         span.log({ "resp": resp });
 
@@ -259,6 +247,51 @@ export class WorkspaceStarter {
         }
     }
 
+    /**
+     * Tries to start the instance on each candidate cluster in turn and returns the first successful response.
+     *
+     * For every candidate the instance is marked "pending", stored, and announced on the message bus before
+     * the start request is sent. Errors that carry a `code` other than `grpc.status.OK` are logged and the
+     * next candidate is tried; any other error is rethrown.
+     *
+     * @returns the start response as a plain object, or `undefined` if the loop ends without a successful start
+     */
+    protected async tryStartOnCluster(ctx: TraceContext, startRequest: StartWorkspaceRequest, euser: ExtendedUser, workspace: Workspace, instance: WorkspaceInstance): Promise<StartWorkspaceResponse.AsObject | undefined> {
+        let lastInstallation = "";
+        const clusters = await this.clientProvider.getStartClusterSets(euser, workspace, instance);
+        for await (const cluster of clusters) {
+            try {
+                // getStartManager will throw an exception if there's no cluster available and hence exit the loop
+                const { manager, installation } = cluster;
+                lastInstallation = installation;
+
+                instance.status.phase = "pending";
+                instance.region = installation;
+                await this.workspaceDb.trace(ctx).storeInstance(instance);
+                try {
+                    await this.messageBus.notifyOnInstanceUpdate(workspace.ownerId, instance);
+                } catch (err) {
+                    // if sending the notification fails that's no reason to stop the workspace creation.
+                    // If the dashboard misses this event it will catch up at the next one.
+                    ctx.span?.log({ "notifyOnInstanceUpdate.error": err });
+                    log.debug("cannot send instance update - this should be mostly inconsequential", err);
+                }
+
+                // start that thing
+                log.info({ instanceId: instance.id }, 'starting instance');
+                return (await manager.startWorkspace(ctx, startRequest)).toObject();
+            } catch (err: any) {
+                if ('code' in err && err.code !== grpc.status.OK && lastInstallation !== "") {
+                    log.error({ instanceId: instance.id }, "cannot start workspace on cluster, might retry", err, { cluster: lastInstallation });
+                } else {
+                    throw err;
+                }
+            }
+        }
+
+        return undefined;
+    }
+
     protected async notifyOnPrebuildQueued(ctx: TraceContext, workspaceId: string) {
         const span = TraceContext.startSpan("notifyOnPrebuildQueued", ctx);
         const prebuild = await this.workspaceDb.trace({ span }).findPrebuildByWorkspaceID(workspaceId);
diff --git a/operations/observability/mixins/meta/dashboards/components/meta-overview.json b/operations/observability/mixins/meta/dashboards/components/meta-overview.json
index 4a5c3ec69b3127..9487be370c7d23 100644
--- a/operations/observability/mixins/meta/dashboards/components/meta-overview.json
+++ b/operations/observability/mixins/meta/dashboards/components/meta-overview.json
@@ -21,7 +21,7 @@
   "editable": true,
   "fiscalYearStartMonth": 0,
   "graphTooltip": 0,
-  "iteration": 1639648622302,
+  "iteration": 1646144275104,
   "links": [],
   "liveNow": false,
   "panels": [
@@ -113,7 +113,8 @@
           "placement": "bottom"
         },
         "tooltip": {
-          "mode": "single"
+          "mode": "single",
+          "sort": "none"
         }
       },
       "pluginVersion": "8.2.2",
@@ -206,7 +207,8 @@
           "placement": "bottom"
         },
         "tooltip": {
-          "mode": "single"
+          "mode": "single",
+          "sort": "none"
} }, "pluginVersion": "8.2.2", @@ -295,7 +297,8 @@ "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "multi", + "sort": "none" } }, "targets": [ @@ -379,7 +382,8 @@ "placement": "bottom" }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "targets": [ @@ -398,6 +402,125 @@ "title": "Team slot method calls", "type": "timeseries" }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "Failed.*" + }, + "properties": [ + { + "id": "custom.drawStyle", + "value": "line" + }, + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 11, + "x": 0, + "y": 19 + }, + "id": 38, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum (irate(gitpod_server_instance_starts_success_total{cluster=~\"$cluster\"}[2m]))", + "interval": "", + "legendFormat": "Success", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum (irate(gitpod_server_instance_starts_failed_total{cluster=~\"$cluster\"}[2m])) by (reason)", + "hide": false, + "interval": "", + "legendFormat": "Failed because: {{ reason }}", + "refId": "B" + } + ], + "title": "Instance Start Success/Failure Rates", + "type": "timeseries" + }, { "datasource": { "uid": "$datasource" @@ -456,7 +579,7 @@ "gridPos": { "h": 9, "w": 11, - "x": 0, + "x": 11, "y": 19 }, "id": 4, @@ -467,7 +590,8 @@ "placement": "bottom" }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "targets": [ @@ -564,7 +688,8 @@ "placement": "bottom" }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "targets": [ @@ -671,7 +796,8 @@ "placement": "bottom" }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "pluginVersion": "8.2.2", @@ -762,7 +888,8 @@ "placement": "bottom" }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "pluginVersion": "8.2.2", @@ -851,7 +978,8 @@ "placement": "bottom" }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "pluginVersion": "8.2.2", @@ -949,7 +1077,8 @@ "placement": "bottom" }, "tooltip": { - "mode": "single" + "mode": "single", + "sort": "none" } }, "pluginVersion": "8.2.2", @@ -983,7 +1112,7 @@ "type": "timeseries" } ], - "schemaVersion": 33, + "schemaVersion": 35, "style": "dark", "tags": [ "gitpod-mixin" @@ -1045,6 +1174,6 @@ "timezone": "utc", "title": "Meta Overview", "uid": "Gj5DE-O7k", - "version": 16, + "version": 1, "weekStart": "" } \ No newline at end of file diff --git a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet index e96466d3a98de5..78478649b9dde0 100644 --- a/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet +++ 
b/operations/observability/mixins/meta/rules/components/server/alerts.libsonnet
@@ -35,6 +35,21 @@
             description: 'Server has accumulated {{ printf "%.2f" $value }}ms event loop lag.',
           },
         },
+        {
+          alert: 'InstanceStartFailures',
+          // Reasoning: 1 failure every 120s should not trigger an incident: 1/120 = 0.00833.. => 0.01
+          // rate() averages over the whole 2m window; irate() would only use the last two samples and spike on a single failure.
+          expr: 'sum (rate(gitpod_server_instance_starts_failed_total[2m])) by (reason) > 0.01',
+          'for': '30s',
+          labels: {
+            severity: 'critical',
+          },
+          annotations: {
+            runbook_url: 'https://github.com/gitpod-io/runbooks/blob/main/runbooks/InstanceStartFailures.md',
+            summary: 'Server tries to start an instance, but cannot for whatever reason. Investigation required.',
+            description: 'Server cannot start workspace instances on workspace clusters.',
+          },
+        },
       ],
     },
   ],