[ws-daemon] Add backup ratelimiting timeout #10384

Merged 1 commit on May 31, 2022

38 changes: 29 additions & 9 deletions components/ws-daemon/pkg/content/service.go
@@ -41,7 +41,8 @@ import (

// Metrics combine custom metrics exported by WorkspaceService
type metrics struct {
BackupWaitingTimeHist prometheus.Histogram
BackupWaitingTimeoutCounter prometheus.Counter
}

// WorkspaceService implements the InitService and WorkspaceService
@@ -55,6 +56,9 @@ type WorkspaceService struct {

metrics *metrics

// channel to limit the number of concurrent backups and uploads.
backupWorkspaceLimiter chan struct{}

api.UnimplementedInWorkspaceServiceServer
api.UnimplementedWorkspaceContentServiceServer
}
@@ -92,8 +96,15 @@ func NewWorkspaceService(ctx context.Context, cfg Config, kubernetesNamespace st

waitingTimeHist, err := registerConcurrentBackupWaitingTime(reg)
if err != nil {
return nil, xerrors.Errorf("cannot register Prometheus gauge for babkup waiting time: %w", err)

return nil, xerrors.Errorf("cannot register Prometheus histogram for backup waiting time: %w", err)
}
waitingTimeoutCounter := prometheus.NewCounter(prometheus.CounterOpts{
Name: "concurrent_backup_waiting_timeout_total",
Help: "total count of backup rate limiting timeouts",
})
err = reg.Register(waitingTimeoutCounter)
if err != nil {
return nil, xerrors.Errorf("cannot register Prometheus counter for backup waiting timeouts: %w", err)
}

return &WorkspaceService{
@@ -103,7 +114,12 @@ func NewWorkspaceService(ctx context.Context, cfg Config, kubernetesNamespace st
stopService: stopService,
runtime: runtime,

metrics: &metrics{waitingTimeHist},
metrics: &metrics{
BackupWaitingTimeHist: waitingTimeHist,
BackupWaitingTimeoutCounter: waitingTimeoutCounter,
},
// we permit three concurrent backups at any given time, hence the three in the channel
backupWorkspaceLimiter: make(chan struct{}, 3),
}, nil
}

@@ -426,9 +442,6 @@ func (s *WorkspaceService) DisposeWorkspace(ctx context.Context, req *api.Dispos
return resp, nil
}

// channel to limit the number of concurrent backups and uploads.
var backupWorkspaceLimiter = make(chan bool, 3)

func (s *WorkspaceService) uploadWorkspaceContent(ctx context.Context, sess *session.Workspace, backupName, mfName string) (err error) {
//nolint:ineffassign
span, ctx := opentracing.StartSpanFromContext(ctx, "uploadWorkspaceContent")
@@ -443,12 +456,19 @@ func (s *WorkspaceService) uploadWorkspaceContent(ctx context.Context, sess *ses
// TODO: remove once we migrate to PVCs
// Avoid too many simultaneous backups in order to avoid excessive memory utilization.
waitStart := time.Now()
backupWorkspaceLimiter <- true
select {
case s.backupWorkspaceLimiter <- struct{}{}:
case <-time.After(15 * time.Minute):
// we timed out on the rate limit - let's upload anyway, because we don't want to actually block
// an upload. If we reach this point, chances are other things are broken. No upload should ever
// take this long.
s.metrics.BackupWaitingTimeoutCounter.Inc()
}

s.metrics.BackupWaitingTimeHist.Observe(time.Since(waitStart).Seconds())

defer func() {
<-backupWorkspaceLimiter
<-s.backupWorkspaceLimiter
}()

var (
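
For reference, the following is a minimal, self-contained sketch of the limiter pattern this diff introduces: a buffered channel acting as a counting semaphore that caps concurrent backups at three, an acquire with a 15-minute timeout so an upload is never blocked indefinitely, and a Prometheus counter that is bumped when the timeout fires. The package layout, the histogram name, and helper names such as limiter and uploadWorkspaceContent are illustrative assumptions, not the actual ws-daemon code.

// A minimal, standalone sketch (assumed names, not the actual ws-daemon code) of
// a buffered-channel semaphore with a timeout and a Prometheus timeout counter.
package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

var (
	// limiter permits up to three concurrent backups; a backup sends a token
	// before starting and takes it back out when it is done.
	limiter = make(chan struct{}, 3)

	// the histogram name below is an assumption; the counter name matches the diff
	waitingTimeHist = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name: "concurrent_backup_waiting_time_seconds",
		Help: "time spent waiting for a backup slot",
	})
	waitingTimeoutCounter = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "concurrent_backup_waiting_timeout_total",
		Help: "total count of backup rate limiting timeouts",
	})
)

// uploadWorkspaceContent is a stand-in for the real upload routine.
func uploadWorkspaceContent(id int) {
	waitStart := time.Now()
	select {
	case limiter <- struct{}{}:
		// acquired a slot; hand it back once the upload finishes
		defer func() { <-limiter }()
	case <-time.After(15 * time.Minute):
		// timed out waiting for a slot - proceed anyway rather than block the
		// upload, but record that the rate limiter timed out
		waitingTimeoutCounter.Inc()
	}
	waitingTimeHist.Observe(time.Since(waitStart).Seconds())

	// ... perform the actual backup upload here ...
	fmt.Printf("backup %d started after waiting %v\n", id, time.Since(waitStart))
}

func main() {
	reg := prometheus.NewRegistry()
	reg.MustRegister(waitingTimeHist, waitingTimeoutCounter)
	uploadWorkspaceContent(1)
}

Unlike an unconditional blocking send, the select with time.After ensures a stuck limiter can only delay an upload, never prevent it, while the counter makes such incidents visible in monitoring. Note that in this sketch the slot is released only when one was actually acquired.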