Skip to content

Commit d030750

Browse files
geropl authored and roboquat committed
[server] FailedInstanceStart metrics: add case "startOnClusterFailed"
1 parent 76ef1af commit d030750

File tree

2 files changed

+11
-6
lines changed

2 files changed

+11
-6
lines changed

components/server/src/prometheus-metrics.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,6 @@ const instanceStartsFailedTotal = new prometheusClient.Counter({
143143
registers: [prometheusClient.register],
144144
});
145145

146-
export function increaseFailedInstanceStartCounter(reason: "clusterSelectionFailed") {
146+
export function increaseFailedInstanceStartCounter(reason: "clusterSelectionFailed" | "startOnClusterFailed") {
147147
instanceStartsFailedTotal.inc({ reason });
148148
}

components/server/src/workspace/workspace-starter.ts

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -189,12 +189,17 @@ export class WorkspaceStarter {
189189
// choose a cluster and start the instance
190190
let resp: StartWorkspaceResponse.AsObject | undefined = undefined;
191191
let retries = 0;
192-
for (; retries < MAX_INSTANCE_START_RETRIES; retries++) {
193-
resp = await this.tryStartOnCluster({ span }, startRequest, euser, workspace, instance);
194-
if (resp) {
195-
break;
192+
try {
193+
for (; retries < MAX_INSTANCE_START_RETRIES; retries++) {
194+
resp = await this.tryStartOnCluster({ span }, startRequest, euser, workspace, instance);
195+
if (resp) {
196+
break;
197+
}
198+
await new Promise((resolve) => setTimeout(resolve, INSTANCE_START_RETRY_INTERVAL_SECONDS * 1000));
196199
}
197-
await new Promise((resolve) => setTimeout(resolve, INSTANCE_START_RETRY_INTERVAL_SECONDS * 1000));
200+
} catch (err) {
201+
increaseFailedInstanceStartCounter("startOnClusterFailed");
202+
throw err;
198203
}
199204

200205
if (!resp) {

0 commit comments

Comments
 (0)