Skip to content

Commit 78e9134

Browse files
authored
Fix EKS control plane getting overloaded when adding nodes to the cluster (#2331)
1 parent 552d7fe commit 78e9134

15 files changed

+1411
-30
lines changed

CONTRIBUTING.md

+3
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,9 @@ Add this to your bash profile (e.g. `~/.bash_profile`, `~/.profile` or `~/.bashr
166166
# set the default image registry
167167
export CORTEX_DEV_DEFAULT_IMAGE_REGISTRY="<account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs"
168168
169+
# enable api server monitoring in grafana
170+
export CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD="true"
171+
169172
# redirect analytics and error reporting to our dev environment
170173
export CORTEX_TELEMETRY_SENTRY_DSN="https://[email protected]/1848098"
171174
export CORTEX_TELEMETRY_SEGMENT_WRITE_KEY="0WvoJyCey9z1W2EW7rYTPJUMRYat46dl"

cli/cmd/cluster.go

+16-1
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,21 @@ var _clusterConfigureCmd = &cobra.Command{
348348
exit.Error(err)
349349
}
350350

351+
restConfig, err := getClusterRESTConfig(awsClient, accessConfig.ClusterName)
352+
if err != nil {
353+
exit.Error(err)
354+
}
355+
356+
scheme := runtime.NewScheme()
357+
if err := clientgoscheme.AddToScheme(scheme); err != nil {
358+
exit.Error(err)
359+
}
360+
361+
k8sClient, err := k8s.New("default", false, restConfig, scheme)
362+
if err != nil {
363+
exit.Error(err)
364+
}
365+
351366
stacks, err := clusterstate.GetClusterStacks(awsClient, accessConfig)
352367
if err != nil {
353368
exit.Error(err)
@@ -362,7 +377,7 @@ var _clusterConfigureCmd = &cobra.Command{
362377

363378
promptIfNotAdmin(awsClient, _flagClusterDisallowPrompt)
364379

365-
newClusterConfig, configureChanges, err := getConfigureClusterConfig(awsClient, stacks, oldClusterConfig, clusterConfigFile)
380+
newClusterConfig, configureChanges, err := getConfigureClusterConfig(awsClient, k8sClient, stacks, oldClusterConfig, clusterConfigFile)
366381
if err != nil {
367382
exit.Error(err)
368383
}

cli/cmd/lib_cluster_config.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
cr "github.com/cortexlabs/cortex/pkg/lib/configreader"
2828
"github.com/cortexlabs/cortex/pkg/lib/errors"
2929
"github.com/cortexlabs/cortex/pkg/lib/files"
30+
"github.com/cortexlabs/cortex/pkg/lib/k8s"
3031
"github.com/cortexlabs/cortex/pkg/lib/maps"
3132
libmath "github.com/cortexlabs/cortex/pkg/lib/math"
3233
"github.com/cortexlabs/cortex/pkg/lib/pointer"
@@ -140,7 +141,7 @@ func getInstallClusterConfig(awsClient *aws.Client, clusterConfigFile string) (*
140141
return clusterConfig, nil
141142
}
142143

143-
func getConfigureClusterConfig(awsClient *aws.Client, stacks clusterstate.ClusterStacks, cachedClusterConfig clusterconfig.Config, newClusterConfigFile string) (*clusterconfig.Config, clusterconfig.ConfigureChanges, error) {
144+
func getConfigureClusterConfig(awsClient *aws.Client, k8sClient *k8s.Client, stacks clusterstate.ClusterStacks, cachedClusterConfig clusterconfig.Config, newClusterConfigFile string) (*clusterconfig.Config, clusterconfig.ConfigureChanges, error) {
144145
newUserClusterConfig := &clusterconfig.Config{}
145146

146147
err := readUserClusterConfigFile(newUserClusterConfig, newClusterConfigFile)
@@ -151,7 +152,7 @@ func getConfigureClusterConfig(awsClient *aws.Client, stacks clusterstate.Cluste
151152
newUserClusterConfig.Telemetry = isTelemetryEnabled()
152153
cachedClusterConfig.Telemetry = newUserClusterConfig.Telemetry
153154

154-
configureChanges, err := newUserClusterConfig.ValidateOnConfigure(awsClient, cachedClusterConfig, stacks.NodeGroupsStacks)
155+
configureChanges, err := newUserClusterConfig.ValidateOnConfigure(awsClient, k8sClient, cachedClusterConfig, stacks.NodeGroupsStacks)
155156
if err != nil {
156157
err = errors.Append(err, fmt.Sprintf("\n\ncluster configuration schema can be found at https://docs.cortex.dev/v/%s/", consts.CortexVersionMinor))
157158
return nil, clusterconfig.ConfigureChanges{}, errors.Wrap(err, newClusterConfigFile)

cli/cmd/lib_manager.go

+1
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ func runManagerWithClusterConfig(entrypoint string, clusterConfig *clusterconfig
191191
"CORTEX_TELEMETRY_SENTRY_DSN=" + os.Getenv("CORTEX_TELEMETRY_SENTRY_DSN"),
192192
"CORTEX_TELEMETRY_SEGMENT_WRITE_KEY=" + os.Getenv("CORTEX_TELEMETRY_SEGMENT_WRITE_KEY"),
193193
"CORTEX_DEV_DEFAULT_IMAGE_REGISTRY=" + os.Getenv("CORTEX_DEV_DEFAULT_IMAGE_REGISTRY"),
194+
"CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD=" + os.Getenv("CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD"),
194195
"CORTEX_CLUSTER_CONFIG_FILE=" + containerClusterConfigPath,
195196
}
196197
envs = append(envs, extraEnvs...)

dev/versions.md

+5-2
Original file line numberDiff line numberDiff line change
@@ -177,11 +177,14 @@ see https://github.com/moby/moby/issues/39302#issuecomment-639687466_
177177

178178
1. Find the latest patch release for our current version of k8s (e.g. k8s v1.17 -> cluster-autoscaler v1.17.3)
179179
on [GitHub](https://github.com/kubernetes/autoscaler/releases) and check the changelog
180-
1. Update the base image in `images/cluster-autoscaler/Dockerfile` to the repository URL shown in the GitHub release
181180
1. In the [GitHub Repo](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws),
182181
set the tree to the tag for the chosen release, and open `cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml`
183182
(e.g. <https://github.com/kubernetes/autoscaler/blob/cluster-autoscaler-1.20.0/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml>)
184-
1. Resolve merge conflicts with the template in `manager/manifests/cluster-autoscaler.yaml.j2`
183+
1. Resolve merge conflicts with the template in `manager/manifests/cluster-autoscaler.yaml.j2`.
184+
1. Pull the release branch from the upstream repo to Cortex's fork on [GitHub](https://github.com/cortexlabs/autoscaler).
185+
1. Apply the rate-limiter changes from the previous version to the new one (currently on the `cluster-autoscaler-release-1.20` branch).
186+
1. Update the value of the `-b` flag in the `git clone` command in `images/cluster-autoscaler/Dockerfile` to the branch name of the latest release in Cortex's fork.
187+
1. Match the Go version of the builder in `images/cluster-autoscaler/Dockerfile` with that of the [cluster autoscaler](https://github.com/kubernetes/autoscaler)'s Dockerfile.
185188

186189
## FluentBit
187190

images/cluster-autoscaler/Dockerfile

+11-1
Original file line numberDiff line numberDiff line change
@@ -1 +1,11 @@
1-
FROM k8s.gcr.io/autoscaling/cluster-autoscaler:v1.20.0
1+
ARG TARGETARCH, TARGETOS
2+
3+
FROM golang:1.15 AS builder
4+
RUN git clone -b cluster-autoscaler-release-1.20 --depth 1 https://github.com/cortexlabs/autoscaler /k8s.io/autoscaler
5+
WORKDIR /k8s.io/autoscaler/cluster-autoscaler
6+
RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build --installsuffix cgo -o cluster-autoscaler k8s.io/autoscaler/cluster-autoscaler \
7+
&& cp cluster-autoscaler /usr/local/bin
8+
9+
FROM alpine:3.8
10+
RUN apk add -U --no-cache ca-certificates && rm -rf /var/cache/apk/*
11+
COPY --from=builder /usr/local/bin/cluster-autoscaler .

manager/install.sh

+7-2
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,9 @@ function setup_prometheus() {
225225
envsubst < manifests/prometheus-kubelet-exporter.yaml | kubectl apply -f - >/dev/null
226226
envsubst < manifests/prometheus-kube-state-metrics.yaml | kubectl apply -f - >/dev/null
227227
envsubst < manifests/prometheus-node-exporter.yaml | kubectl apply -f - >/dev/null
228-
python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/prometheus-monitoring.yaml.j2 | kubectl apply -f - >/dev/null
228+
envsubst < manifests/prometheus-monitoring.yaml | kubectl apply -f - >/dev/null
229+
python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/prometheus-additional-scrape-configs.yaml.j2 > prometheus-additional-scrape-configs.yaml
230+
kubectl create secret generic additional-scrape-configs --from-file=prometheus-additional-scrape-configs.yaml
229231
}
230232

231233
function setup_grafana() {
@@ -235,7 +237,10 @@ function setup_grafana() {
235237
kubectl apply -f manifests/grafana/grafana-dashboard-task.yaml >/dev/null
236238
kubectl apply -f manifests/grafana/grafana-dashboard-cluster.yaml >/dev/null
237239
kubectl apply -f manifests/grafana/grafana-dashboard-nodes.yaml >/dev/null
238-
envsubst < manifests/grafana/grafana.yaml | kubectl apply -f - >/dev/null
240+
if [ "$CORTEX_DEV_ADD_CONTROL_PLANE_DASHBOARD" = "true" ]; then
241+
kubectl apply -f manifests/grafana/grafana-dashboard-control-plane.yaml >/dev/null
242+
fi
243+
python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/grafana/grafana.yaml.j2 | kubectl apply -f - >/dev/null
239244
}
240245

241246
function restart_operator() {

manager/manifests/cluster-autoscaler.yaml.j2

+3
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,9 @@ spec:
197197
- --ok-total-unready-count=30
198198
- --max-node-provision-time=8m
199199
- --scan-interval=20s
200+
- --scale-up-rate-limit-enabled=true
201+
- --scale-up-max-number-nodes-per-min=50
202+
- --scale-up-burst-number-nodes-per-min=75
200203
- --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/{{ config['cluster_name'] }}
201204
volumeMounts:
202205
- name: ssl-certs

0 commit comments

Comments
 (0)