Skip to content

Commit 56a1b1d

Browse files
romilbhardwaj and Michaelvll
authored and committed
[k8s] sky local up speed up for GPUs (#3664)
wip
1 parent 703b3ce commit 56a1b1d

File tree

1 file changed

+0
-55
lines changed

1 file changed

+0
-55
lines changed

sky/utils/kubernetes/create_cluster.sh

-55
Original file line numberDiff line numberDiff line change
@@ -101,32 +101,6 @@ kind create cluster --config /tmp/skypilot-kind.yaml --name skypilot
101 101

102 102
echo "Kind cluster created."
103 103

104-
# Function to wait for SkyPilot GPU labeling jobs to complete
105-
wait_for_gpu_labeling_jobs() {
106-
echo "Starting wait for SkyPilot GPU labeling jobs to complete..."
107-
108-
SECONDS=0
109-
TIMEOUT=600 # 10 minutes in seconds
110-
111-
while true; do
112-
TOTAL_JOBS=$(kubectl get jobs -n kube-system -l job=sky-gpu-labeler --no-headers | wc -l)
113-
COMPLETED_JOBS=$(kubectl get jobs -n kube-system -l job=sky-gpu-labeler --no-headers | grep "1/1" | wc -l)
114-
115-
if [[ $COMPLETED_JOBS -eq $TOTAL_JOBS ]]; then
116-
echo "All SkyPilot GPU labeling jobs completed ($TOTAL_JOBS)."
117-
break
118-
elif [ $SECONDS -ge $TIMEOUT ]; then
119-
echo "Timeout reached while waiting for GPU labeling jobs."
120-
exit 1
121-
else
122-
echo "Waiting for GPU labeling jobs to complete... ($COMPLETED_JOBS/$TOTAL_JOBS completed)"
123-
echo "To check status, see GPU labeling pods:"
124-
echo "kubectl get jobs -n kube-system -l job=sky-gpu-labeler"
125-
sleep 5
126-
fi
127-
done
128-
}
129-
130 104
# Function to wait for GPU operator to be correctly installed
131 105
wait_for_gpu_operator_installation() {
132 106
echo "Starting wait for GPU operator installation..."
@@ -150,22 +124,6 @@ wait_for_gpu_operator_installation() {
150 124
done
151 125
}
152 126

153-
wait_for_skypilot_gpu_image_pull() {
154-
echo "Pulling SkyPilot GPU image..."
155-
docker pull ${IMAGE_GPU}
156-
echo "Loading SkyPilot GPU image into kind cluster..."
157-
kind load docker-image --name skypilot ${IMAGE_GPU}
158-
echo "SkyPilot GPU image loaded into kind cluster."
159-
}
160-
161-
wait_for_skypilot_cpu_image_pull() {
162-
echo "Pulling SkyPilot CPU image..."
163-
docker pull ${IMAGE}
164-
echo "Loading SkyPilot CPU image into kind cluster..."
165-
kind load docker-image --name skypilot ${IMAGE}
166-
echo "SkyPilot CPU image loaded into kind cluster."
167-
}
168-
169 127
wait_for_nginx_ingress_controller_install() {
170 128
echo "Starting installation of Nginx Ingress Controller..."
171 129

@@ -206,21 +164,8 @@ if $ENABLE_GPUS; then
206 164
nvidia/gpu-operator --set driver.enabled=false
207 165
# Wait for GPU operator installation to succeed
208 166
wait_for_gpu_operator_installation
209-
210-
# Load the SkyPilot GPU image into the cluster for faster labelling
211-
wait_for_skypilot_gpu_image_pull
212-
213-
# Label nodes with GPUs
214-
echo "Labelling nodes with GPUs..."
215-
python -m sky.utils.kubernetes.gpu_labeler
216-
217-
# Wait for all the GPU labeling jobs to complete
218-
wait_for_gpu_labeling_jobs
219 167
fi
220 168

221-
# Load local skypilot image on to the cluster for faster startup
222-
wait_for_skypilot_cpu_image_pull
223-
224 169
# Install the Nginx Ingress Controller
225 170
wait_for_nginx_ingress_controller_install
226 171

0 commit comments

Comments
 (0)