Skip to content

Commit 261201d

Browse files
geropl authored and roboquat committed
[server] Retry to start a WorkspaceInstance 2 times, with 2s break
1 parent 7e3c92e commit 261201d

File tree

1 file changed

+48
-32
lines changed

1 file changed

+48
-32
lines changed

components/server/src/workspace/workspace-starter.ts

Lines changed: 48 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ export interface StartWorkspaceOptions {
4343
excludeFeatureFlags?: NamedWorkspaceFeatureFlag[];
4444
}
4545

46+
// Upper bound on start attempts: the start loop runs while `retries < MAX_INSTANCE_START_RETRIES`,
// so this is the total number of attempts made before giving up.
const MAX_INSTANCE_START_RETRIES = 2;
47+
// Pause, in seconds, after an unsuccessful start attempt (multiplied by 1000 when passed to `setTimeout`).
const INSTANCE_START_RETRY_INTERVAL_SECONDS = 2;
48+
4649
@injectable()
4750
export class WorkspaceStarter {
4851
@inject(WorkspaceManagerClientProvider) protected readonly clientProvider: WorkspaceManagerClientProvider;
@@ -180,42 +183,19 @@ export class WorkspaceStarter {
180183
const euser: ExtendedUser = {
181184
...user,
182185
getsMoreResources: await this.userService.userGetsMoreResources(user),
183-
}
184-
185-
// tell the world we're starting this instance
186-
let resp: StartWorkspaceResponse.AsObject | undefined;
187-
let lastInstallation = "";
188-
const clusters = await this.clientProvider.getStartClusterSets(euser, workspace, instance);
189-
for await (let cluster of clusters) {
190-
try {
191-
// getStartManager will throw an exception if there's no cluster available and hence exit the loop
192-
const { manager, installation } = cluster;
193-
lastInstallation = installation;
194-
195-
instance.status.phase = "pending";
196-
instance.region = installation;
197-
await this.workspaceDb.trace({ span }).storeInstance(instance);
198-
try {
199-
await this.messageBus.notifyOnInstanceUpdate(workspace.ownerId, instance);
200-
} catch (err) {
201-
// if sending the notification fails that's no reason to stop the workspace creation.
202-
// If the dashboard misses this event it will catch up at the next one.
203-
span.log({ "notifyOnInstanceUpdate.error": err });
204-
log.debug("cannot send instance update - this should be mostly inconsequential", err);
205-
}
186+
};
206187

207-
// start that thing
208-
log.info({ instanceId: instance.id }, 'starting instance');
209-
resp = (await manager.startWorkspace({ span }, startRequest)).toObject();
188+
// choose a cluster and start the instance
189+
let resp: StartWorkspaceResponse.AsObject | undefined = undefined;
190+
let retries = 0;
191+
for (; retries < MAX_INSTANCE_START_RETRIES; retries++) {
192+
resp = await this.tryStartOnCluster({ span }, startRequest, euser, workspace, instance);
193+
if (resp) {
210194
break;
211-
} catch (err: any) {
212-
if ('code' in err && err.code !== grpc.status.OK && lastInstallation !== "") {
213-
log.error({ instanceId: instance.id }, "cannot start workspace on cluster, might retry", err, { cluster: lastInstallation });
214-
} else {
215-
throw err;
216-
}
217195
}
196+
await new Promise((resolve) => setTimeout(resolve, INSTANCE_START_RETRY_INTERVAL_SECONDS * 1000));
218197
}
198+
219199
if (!resp) {
220200
throw new Error("cannot start a workspace because no workspace clusters are available");
221201
}
@@ -259,6 +239,42 @@ export class WorkspaceStarter {
259239
}
260240
}
261241

242+
/**
 * Tries to start the given instance on each candidate cluster in turn and returns as soon as one accepts it.
 *
 * For every candidate the instance is set to phase "pending" with its region set to the cluster's installation,
 * stored in the DB and announced on the message bus (best effort) before the start request is sent.
 * Note that `instance` is mutated in the process.
 *
 * @param ctx trace context used for the DB call and the start request
 * @param startRequest the request sent to the cluster's workspace manager
 * @param euser the user (with resource information) passed on to the cluster selection
 * @param workspace the workspace the instance belongs to
 * @param instance the instance to start; its `status.phase` and `region` are updated per candidate
 * @returns the start response of the first cluster that accepted the request, or `undefined` if no candidate
 *          was offered or every attempt failed with an error carrying a non-OK `code`
 * @throws any error raised while obtaining/iterating the clusters, and any error inside an attempt that does
 *         not carry a non-OK `code` (those are considered non-retryable and are rethrown unchanged)
 */
protected async tryStartOnCluster(ctx: TraceContext, startRequest: StartWorkspaceRequest, euser: ExtendedUser, workspace: Workspace, instance: WorkspaceInstance): Promise<StartWorkspaceResponse.AsObject | undefined> {
    let lastInstallation = "";
    const clusters = await this.clientProvider.getStartClusterSets(euser, workspace, instance);
    // NOTE(review): presumably iterating `clusters` throws when no cluster is available, which ends the loop
    // (the iteration itself is not covered by the try/catch below) - confirm against getStartClusterSets.
    for await (const cluster of clusters) {
        try {
            const { manager, installation } = cluster;
            lastInstallation = installation;

            instance.status.phase = "pending";
            instance.region = installation;
            await this.workspaceDb.trace(ctx).storeInstance(instance);
            try {
                await this.messageBus.notifyOnInstanceUpdate(workspace.ownerId, instance);
            } catch (err) {
                // if sending the notification fails that's no reason to stop the workspace creation.
                // If the dashboard misses this event it will catch up at the next one.
                ctx.span?.log({ "notifyOnInstanceUpdate.error": err });
                log.debug("cannot send instance update - this should be mostly inconsequential", err);
            }

            // start that thing
            log.info({ instanceId: instance.id }, 'starting instance');
            return (await manager.startWorkspace(ctx, startRequest)).toObject();
        } catch (err: any) {
            // The `in` operator throws a TypeError on primitives/null/undefined, which would mask the
            // original error - so make sure we are looking at an object before probing for `code`.
            const hasErrorCode = typeof err === "object" && err !== null && 'code' in err && err.code !== grpc.status.OK;
            if (hasErrorCode && lastInstallation !== "") {
                // failed on this cluster: log and move on to the next candidate
                log.error({ instanceId: instance.id }, "cannot start workspace on cluster, might retry", err, { cluster: lastInstallation });
            } else {
                throw err;
            }
        }
    }

    // no candidate accepted the start request
    return undefined;
}
277+
262278
protected async notifyOnPrebuildQueued(ctx: TraceContext, workspaceId: string) {
263279
const span = TraceContext.startSpan("notifyOnPrebuildQueued", ctx);
264280
const prebuild = await this.workspaceDb.trace({ span }).findPrebuildByWorkspaceID(workspaceId);

0 commit comments

Comments
 (0)