@@ -43,6 +43,9 @@ export interface StartWorkspaceOptions {
43
43
excludeFeatureFlags ?: NamedWorkspaceFeatureFlag [ ] ;
44
44
}
45
45
46
// Upper bound for the instance-start loop in WorkspaceStarter: the loop counter starts at 0
// and the loop runs while the counter is below this value, so 2 means at most two start
// attempts in total (not two retries on top of a first attempt).
+ const MAX_INSTANCE_START_RETRIES = 2 ;
47
// Pause between start attempts, in seconds. The start loop multiplies this by 1000 and passes
// it to setTimeout after every attempt that produced no response.
+ const INSTANCE_START_RETRY_INTERVAL_SECONDS = 2 ;
48
+
46
49
@injectable ( )
47
50
export class WorkspaceStarter {
48
51
@inject ( WorkspaceManagerClientProvider ) protected readonly clientProvider : WorkspaceManagerClientProvider ;
@@ -180,42 +183,19 @@ export class WorkspaceStarter {
180
183
const euser : ExtendedUser = {
181
184
...user ,
182
185
getsMoreResources : await this . userService . userGetsMoreResources ( user ) ,
183
- }
184
-
185
- // tell the world we're starting this instance
186
- let resp : StartWorkspaceResponse . AsObject | undefined ;
187
- let lastInstallation = "" ;
188
- const clusters = await this . clientProvider . getStartClusterSets ( euser , workspace , instance ) ;
189
- for await ( let cluster of clusters ) {
190
- try {
191
- // getStartManager will throw an exception if there's no cluster available and hence exit the loop
192
- const { manager, installation } = cluster ;
193
- lastInstallation = installation ;
194
-
195
- instance . status . phase = "pending" ;
196
- instance . region = installation ;
197
- await this . workspaceDb . trace ( { span } ) . storeInstance ( instance ) ;
198
- try {
199
- await this . messageBus . notifyOnInstanceUpdate ( workspace . ownerId , instance ) ;
200
- } catch ( err ) {
201
- // if sending the notification fails that's no reason to stop the workspace creation.
202
- // If the dashboard misses this event it will catch up at the next one.
203
- span . log ( { "notifyOnInstanceUpdate.error" : err } ) ;
204
- log . debug ( "cannot send instance update - this should be mostly inconsequential" , err ) ;
205
- }
186
+ } ;
206
187
207
- // start that thing
208
- log . info ( { instanceId : instance . id } , 'starting instance' ) ;
209
- resp = ( await manager . startWorkspace ( { span } , startRequest ) ) . toObject ( ) ;
188
+ // choose a cluster and start the instance
189
+ let resp : StartWorkspaceResponse . AsObject | undefined = undefined ;
190
+ let retries = 0 ;
191
+ for ( ; retries < MAX_INSTANCE_START_RETRIES ; retries ++ ) {
192
+ resp = await this . tryStartOnCluster ( { span } , startRequest , euser , workspace , instance ) ;
193
+ if ( resp ) {
210
194
break ;
211
- } catch ( err : any ) {
212
- if ( 'code' in err && err . code !== grpc . status . OK && lastInstallation !== "" ) {
213
- log . error ( { instanceId : instance . id } , "cannot start workspace on cluster, might retry" , err , { cluster : lastInstallation } ) ;
214
- } else {
215
- throw err ;
216
- }
217
195
}
196
+ await new Promise ( ( resolve ) => setTimeout ( resolve , INSTANCE_START_RETRY_INTERVAL_SECONDS * 1000 ) ) ;
218
197
}
198
+
219
199
if ( ! resp ) {
220
200
throw new Error ( "cannot start a workspace because no workspace clusters are available" ) ;
221
201
}
@@ -259,6 +239,42 @@ export class WorkspaceStarter {
259
239
}
260
240
}
261
241
242
/**
 * Walks the candidate clusters for this start request and tries to start the instance on
 * each of them in turn, returning as soon as one cluster accepts the request.
 *
 * For every candidate the instance is first marked as "pending" in that cluster's
 * installation and persisted, then an instance update is broadcast (best effort), and only
 * then is the start request sent to the cluster's workspace manager.
 *
 * Note that `instance` is mutated (`status.phase`, `region`) and stored once per attempted
 * cluster, so after a failed pass it still carries the values of the last cluster tried.
 *
 * @param ctx trace context handed to the DB layer and the workspace manager client
 * @param startRequest the request sent to the selected workspace manager
 * @param euser the user the workspace is started for; used to select the cluster set
 * @param workspace the workspace the instance belongs to
 * @param instance the instance to start; updated and persisted as described above
 * @returns the manager's response as a plain object, or `undefined` if no cluster yielded one
 *          (either there were no candidates or every attempt failed with a retryable error)
 * @throws any error raised while obtaining or iterating the cluster set, and any error from
 *         a start attempt that does not carry a `code` different from `grpc.status.OK`
 */
protected async tryStartOnCluster(ctx: TraceContext, startRequest: StartWorkspaceRequest, euser: ExtendedUser, workspace: Workspace, instance: WorkspaceInstance): Promise<StartWorkspaceResponse.AsObject | undefined> {
    let lastInstallation = "";
    const clusters = await this.clientProvider.getStartClusterSets(euser, workspace, instance);
    // NOTE(review): an earlier comment here said getStartManager throws when no cluster is
    // available; this method never calls getStartManager. Errors thrown by the iteration itself
    // are outside the try block below and therefore propagate to the caller - confirm that is intended.
    for await (const cluster of clusters) {
        try {
            const { manager, installation } = cluster;
            lastInstallation = installation;

            // persist the placement decision before contacting the cluster
            instance.status.phase = "pending";
            instance.region = installation;
            await this.workspaceDb.trace(ctx).storeInstance(instance);
            try {
                await this.messageBus.notifyOnInstanceUpdate(workspace.ownerId, instance);
            } catch (err) {
                // if sending the notification fails that's no reason to stop the workspace creation.
                // If the dashboard misses this event it will catch up at the next one.
                ctx.span?.log({ "notifyOnInstanceUpdate.error": err });
                log.debug("cannot send instance update - this should be mostly inconsequential", err);
            }

            // start that thing
            log.info({ instanceId: instance.id }, 'starting instance');
            return (await manager.startWorkspace(ctx, startRequest)).toObject();
        } catch (err: any) {
            // The `in` operator throws a TypeError for primitives, null and undefined, which
            // would replace the original error with an unrelated one - check the shape first.
            const hasErrorCode = typeof err === "object" && err !== null && 'code' in err;
            if (hasErrorCode && err.code !== grpc.status.OK && lastInstallation !== "") {
                // error with a non-OK code on a known cluster: log it and move on to the next candidate
                log.error({ instanceId: instance.id }, "cannot start workspace on cluster, might retry", err, { cluster: lastInstallation });
            } else {
                throw err;
            }
        }
    }

    // no candidate cluster produced a response
    return undefined;
}
277
+
262
278
protected async notifyOnPrebuildQueued ( ctx : TraceContext , workspaceId : string ) {
263
279
const span = TraceContext . startSpan ( "notifyOnPrebuildQueued" , ctx ) ;
264
280
const prebuild = await this . workspaceDb . trace ( { span } ) . findPrebuildByWorkspaceID ( workspaceId ) ;
0 commit comments