Skip to content

Commit dda2ebd

Browse files
svenefftinge and roboquat
authored and committed
[server] track more startWorkspace failures
fixes #12332
1 parent fab72bc commit dda2ebd

File tree

2 files changed

+25
-6
lines changed

2 files changed

+25
-6
lines changed

components/server/src/prometheus-metrics.ts

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -148,7 +148,8 @@ const instanceStartsFailedTotal = new prometheusClient.Counter({
148148
registers: [prometheusClient.register],
149149
});
150150

151-
export function increaseFailedInstanceStartCounter(reason: "clusterSelectionFailed" | "startOnClusterFailed") {
151+
export type FailedInstanceStartReason = "clusterSelectionFailed" | "startOnClusterFailed" | "other";
152+
export function increaseFailedInstanceStartCounter(reason: FailedInstanceStartReason) {
152153
instanceStartsFailedTotal.inc({ reason });
153154
}
154155

components/server/src/workspace/workspace-starter.ts

Lines changed: 23 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -112,7 +112,11 @@ import { WithReferrerContext } from "@gitpod/gitpod-protocol/lib/protocol";
112112
import { IDEOption, IDEOptions } from "@gitpod/gitpod-protocol/lib/ide-protocol";
113113
import { Deferred } from "@gitpod/gitpod-protocol/lib/util/deferred";
114114
import { ExtendedUser } from "@gitpod/ws-manager/lib/constraints";
115-
import { increaseFailedInstanceStartCounter, increaseSuccessfulInstanceStartCounter } from "../prometheus-metrics";
115+
import {
116+
FailedInstanceStartReason,
117+
increaseFailedInstanceStartCounter,
118+
increaseSuccessfulInstanceStartCounter,
119+
} from "../prometheus-metrics";
116120
import { ContextParser } from "./context-parser-service";
117121
import { IDEService } from "../ide-service";
118122
import { WorkspaceClusterImagebuilderClientProvider } from "./workspace-cluster-imagebuilder-client-provider";
@@ -244,6 +248,12 @@ export async function getWorkspaceClassForInstance(
244248
}
245249
}
246250

251+
class StartInstanceError extends Error {
252+
constructor(public readonly reason: FailedInstanceStartReason, public readonly cause: Error) {
253+
super("Starting workspace instance failed: " + cause.message);
254+
}
255+
}
256+
247257
@injectable()
248258
export class WorkspaceStarter {
249259
@inject(WorkspaceManagerClientProvider) protected readonly clientProvider: WorkspaceManagerClientProvider;
@@ -414,6 +424,11 @@ export class WorkspaceStarter {
414424
forceRebuild,
415425
);
416426
} catch (e) {
427+
let failedReason: FailedInstanceStartReason = "other";
428+
if (e instanceof StartInstanceError) {
429+
failedReason = e.reason;
430+
}
431+
increaseFailedInstanceStartCounter(failedReason);
417432
TraceContext.setError({ span }, e);
418433
throw e;
419434
} finally {
@@ -523,16 +538,14 @@ export class WorkspaceStarter {
523538
await new Promise((resolve) => setTimeout(resolve, INSTANCE_START_RETRY_INTERVAL_SECONDS * 1000));
524539
}
525540
} catch (err) {
526-
increaseFailedInstanceStartCounter("startOnClusterFailed");
527541
await this.failInstanceStart({ span }, err, workspace, instance);
528-
throw err;
542+
throw new StartInstanceError("startOnClusterFailed", err);
529543
}
530544

531545
if (!resp) {
532-
increaseFailedInstanceStartCounter("clusterSelectionFailed");
533546
const err = new Error("cannot start a workspace because no workspace clusters are available");
534547
await this.failInstanceStart({ span }, err, workspace, instance);
535-
throw err;
548+
throw new StartInstanceError("clusterSelectionFailed", err);
536549
}
537550
increaseSuccessfulInstanceStartCounter(retries);
538551

@@ -579,6 +592,11 @@ export class WorkspaceStarter {
579592
throw err;
580593
} else {
581594
log.error("error starting instance", err, { instanceId: instance.id });
595+
let failedReason: FailedInstanceStartReason = "other";
596+
if (err instanceof StartInstanceError) {
597+
failedReason = err.reason;
598+
}
599+
increaseFailedInstanceStartCounter(failedReason);
582600
}
583601

584602
return { instanceID: instance.id };

0 commit comments

Comments
 (0)