Skip to content

Commit 97cba00

Browse files
authored
[Core] Install SkyPilot runtime in separate env (#3575)
* Quote the command correctly when source_bashrc is not set * Remove unnecessary source bashrc * format * Fix setup script for conda * Add comment * format * Separate env for skypilot * add test smoke * add system site-packages * add test for default to non-base conda env * Fix controllers and ray node providers * move activate to maybe_skylet * Make axolotl example work for kubernetes * fix axolotl * add test for 3.12 * format * Fix docker PATH * format * add axolotl image in test * address comments * revert grpcio version as it is only installed in our runtime env * refactor command for env set up * switch to curl as CentOS may not have wget installed but have curl * add l4 in command * fix dependency for test * fix python path for ray executable * Fix azure launch * add comments * fix test * fix smoke * fix name * fix * fix usage * fix usage for accelerators * fix event * fix name * fix * address comments
1 parent 211386f commit 97cba00

27 files changed

+195
-85
lines changed

llm/axolotl/axolotl-docker.yaml

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Usage:
2+
# HF_TOKEN=abc sky launch -c axolotl axolotl.yaml --env HF_TOKEN -y -i30 --down
3+
4+
name: axolotl
5+
6+
resources:
7+
accelerators: L4:1
8+
cloud: gcp # optional
9+
10+
workdir: mistral
11+
12+
setup: |
13+
docker pull winglian/axolotl:main-py3.10-cu118-2.0.1
14+
15+
run: |
16+
docker run --gpus all \
17+
-v ~/sky_workdir:/sky_workdir \
18+
-v /root/.cache:/root/.cache \
19+
winglian/axolotl:main-py3.10-cu118-2.0.1 \
20+
huggingface-cli login --token ${HF_TOKEN}
21+
22+
docker run --gpus all \
23+
-v ~/sky_workdir:/sky_workdir \
24+
-v /root/.cache:/root/.cache \
25+
winglian/axolotl:main-py3.10-cu118-2.0.1 \
26+
accelerate launch -m axolotl.cli.train /sky_workdir/qlora.yaml
27+
28+
envs:
29+
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.

llm/axolotl/axolotl-spot.yaml

+3-19
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ resources:
1212
accelerators: A100:1
1313
cloud: gcp # optional
1414
use_spot: True
15+
image_id: docker:winglian/axolotl:main-py3.10-cu118-2.0.1
1516

1617
workdir: mistral
1718

@@ -20,29 +21,12 @@ file_mounts:
2021
name: ${BUCKET}
2122
mode: MOUNT
2223

23-
setup: |
24-
docker pull winglian/axolotl:main-py3.10-cu118-2.0.1
25-
2624
run: |
27-
docker run --gpus all \
28-
-v ~/sky_workdir:/sky_workdir \
29-
-v /root/.cache:/root/.cache \
30-
winglian/axolotl:main-py3.10-cu118-2.0.1 \
31-
huggingface-cli login --token ${HF_TOKEN}
25+
huggingface-cli login --token ${HF_TOKEN}
3226
33-
docker run --gpus all \
34-
-v ~/sky_workdir:/sky_workdir \
35-
-v /root/.cache:/root/.cache \
36-
-v /sky-notebook:/sky-notebook \
37-
winglian/axolotl:main-py3.10-cu118-2.0.1 \
38-
accelerate launch -m axolotl.cli.train /sky_workdir/qlora-checkpoint.yaml
27+
accelerate launch -m axolotl.cli.train qlora-checkpoint.yaml
3928
4029
envs:
4130
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.
4231
BUCKET: # TODO: Fill with your unique bucket name, or use --env to pass.
43-
44-
45-
46-
47-
4832

llm/axolotl/axolotl.yaml

+4-15
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,14 @@ name: axolotl
55

66
resources:
77
accelerators: L4:1
8-
cloud: gcp # optional
8+
image_id: docker:winglian/axolotl:main-py3.10-cu118-2.0.1
99

1010
workdir: mistral
1111

12-
setup: |
13-
docker pull winglian/axolotl:main-py3.10-cu118-2.0.1
14-
1512
run: |
16-
docker run --gpus all \
17-
-v ~/sky_workdir:/sky_workdir \
18-
-v /root/.cache:/root/.cache \
19-
winglian/axolotl:main-py3.10-cu118-2.0.1 \
20-
huggingface-cli login --token ${HF_TOKEN}
21-
22-
docker run --gpus all \
23-
-v ~/sky_workdir:/sky_workdir \
24-
-v /root/.cache:/root/.cache \
25-
winglian/axolotl:main-py3.10-cu118-2.0.1 \
26-
accelerate launch -m axolotl.cli.train /sky_workdir/qlora.yaml
13+
huggingface-cli login --token ${HF_TOKEN}
14+
15+
accelerate launch -m axolotl.cli.train qlora.yaml
2716
2817
envs:
2918
HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.

llm/axolotl/mistral/qlora-checkpoint.yaml

+2-1
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ warmup_steps: 10
7171
eval_steps: 0.05
7272
eval_table_size:
7373
eval_table_max_new_tokens: 128
74+
eval_sample_packing: false
7475
save_steps: 2 ## increase based on your dataset
7576
save_strategy: steps
7677
debug:
@@ -81,4 +82,4 @@ fsdp_config:
8182
special_tokens:
8283
bos_token: "<s>"
8384
eos_token: "</s>"
84-
unk_token: "<unk>"
85+
unk_token: "<unk>"

llm/axolotl/mistral/qlora.yaml

+2-1
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ warmup_steps: 10
6969
eval_steps: 0.05
7070
eval_table_size:
7171
eval_table_max_new_tokens: 128
72+
eval_sample_packing: false
7273
save_steps:
7374
debug:
7475
deepspeed:
@@ -78,4 +79,4 @@ fsdp_config:
7879
special_tokens:
7980
bos_token: "<s>"
8081
eos_token: "</s>"
81-
unk_token: "<unk>"
82+
unk_token: "<unk>"

sky/backends/backend_utils.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -925,7 +925,14 @@ def write_cluster_config(
925925
'dump_port_command': dump_port_command,
926926
# Sky-internal constants.
927927
'sky_ray_cmd': constants.SKY_RAY_CMD,
928-
'sky_pip_cmd': constants.SKY_PIP_CMD,
928+
# pip install needs to have python env activated to make sure
929+
# installed packages are within the env path.
930+
'sky_pip_cmd': f'{constants.SKY_PIP_CMD}',
931+
# Activate the SkyPilot runtime environment when starting ray
932+
# cluster, so that ray autoscaler can access cloud SDK and CLIs
933+
# on remote
934+
'sky_activate_python_env':
935+
constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV,
929936
'ray_version': constants.SKY_REMOTE_RAY_VERSION,
930937
# Command for waiting ray cluster to be ready on head.
931938
'ray_head_wait_initialized_command':

sky/jobs/core.py

-1
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,6 @@ def launch(
9898
'dag_name': dag.name,
9999
'retry_until_up': retry_until_up,
100100
'remote_user_config_path': remote_user_config_path,
101-
'sky_python_cmd': skylet_constants.SKY_PYTHON_CMD,
102101
'modified_catalogs':
103102
service_catalog_common.get_modified_catalog_file_mounts(),
104103
**controller_utils.shared_controller_vars_to_fill(

sky/provision/docker_utils.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,17 @@
1515
DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
1616
'the Docker daemon socket')
1717

18+
# Configure environment variables. A docker image can have environment variables
19+
# set in the Dockerfile with `ENV`. We need to export these variables to the
20+
# shell environment, so that our ssh session can access them.
21+
SETUP_ENV_VARS_CMD = (
22+
'prefix_cmd() '
23+
'{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
24+
'printenv | while IFS=\'=\' read -r key value; do echo "export $key=\\\"$value\\\""; done > ' # pylint: disable=line-too-long
25+
'~/container_env_var.sh && '
26+
'$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh'
27+
)
28+
1829

1930
@dataclasses.dataclass
2031
class DockerLoginConfig:
@@ -244,6 +255,8 @@ def initialize(self) -> str:
244255
self._run(start_command)
245256

246257
# SkyPilot: Setup Commands.
258+
# TODO(zhwu): the following setups should be aligned with the kubernetes
259+
# pod setup, like provision.kubernetes.instance::_set_env_vars_in_pods
247260
# TODO(tian): These setup commands assumed that the container is
248261
# debian-based. We should make it more general.
249262
# Most of docker images are using root as default user, so we set an
@@ -296,7 +309,8 @@ def initialize(self) -> str:
296309
'mkdir -p ~/.ssh;'
297310
'cat /tmp/host_ssh_authorized_keys >> ~/.ssh/authorized_keys;'
298311
'sudo service ssh start;'
299-
'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;',
312+
'sudo sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;'
313+
f'{SETUP_ENV_VARS_CMD}',
300314
run_env='docker')
301315

302316
# SkyPilot: End of Setup Commands.

sky/provision/instance_setup.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,10 @@
6161
'done;')
6262

6363
# Restart skylet when the version does not match to keep the skylet up-to-date.
64-
MAYBE_SKYLET_RESTART_CMD = (f'{constants.SKY_PYTHON_CMD} -m '
64+
# We need to activate the python environment to make sure autostop in skylet
65+
# can find the cloud SDK/CLI in PATH.
66+
MAYBE_SKYLET_RESTART_CMD = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
67+
f'{constants.SKY_PYTHON_CMD} -m '
6568
'sky.skylet.attempt_skylet;')
6669

6770

sky/provision/kubernetes/instance.py

+5-8
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from sky import status_lib
1111
from sky.adaptors import kubernetes
1212
from sky.provision import common
13+
from sky.provision import docker_utils
1314
from sky.provision.kubernetes import config as config_lib
1415
from sky.provision.kubernetes import utils as kubernetes_utils
1516
from sky.utils import common_utils
@@ -241,7 +242,7 @@ def _wait_for_pods_to_run(namespace, new_nodes):
241242
'the node. Error details: '
242243
f'{container_status.state.waiting.message}.')
243244
# Reaching this point means that one of the pods had an issue,
244-
# so break out of the loop
245+
# so break out of the loop, and wait until next second.
245246
break
246247

247248
if all_pods_running:
@@ -301,13 +302,7 @@ def _set_env_vars_in_pods(namespace: str, new_pods: List):
301302
set_k8s_env_var_cmd = [
302303
'/bin/sh',
303304
'-c',
304-
(
305-
'prefix_cmd() '
306-
'{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
307-
'printenv | while IFS=\'=\' read -r key value; do echo "export $key=\\\"$value\\\""; done > ' # pylint: disable=line-too-long
308-
'~/k8s_env_var.sh && '
309-
'mv ~/k8s_env_var.sh /etc/profile.d/k8s_env_var.sh || '
310-
'$(prefix_cmd) mv ~/k8s_env_var.sh /etc/profile.d/k8s_env_var.sh')
305+
docker_utils.SETUP_ENV_VARS_CMD,
311306
]
312307

313308
for new_pod in new_pods:
@@ -540,6 +535,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
540535
_wait_for_pods_to_schedule(namespace, wait_pods, provision_timeout)
541536
# Wait until the pods and their containers are up and running, and
542537
# fail early if there is an error
538+
logger.debug(f'run_instances: waiting for pods to be running (pulling '
539+
f'images): {list(wait_pods_dict.keys())}')
543540
_wait_for_pods_to_run(namespace, wait_pods)
544541
logger.debug(f'run_instances: all pods are scheduled and running: '
545542
f'{list(wait_pods_dict.keys())}')

sky/skylet/attempt_skylet.py

+3
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ def restart_skylet():
2121
shell=True,
2222
check=False)
2323
subprocess.run(
24+
# We have made sure that `attempt_skylet.py` is executed with the
25+
# skypilot runtime env activated, so that skylet can access the cloud
26+
# CLI tools.
2427
f'nohup {constants.SKY_PYTHON_CMD} -m sky.skylet.skylet'
2528
' >> ~/.sky/skylet.log 2>&1 &',
2629
shell=True,

sky/skylet/constants.py

+30-11
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,18 @@
3737
SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})'
3838
SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
3939
# Ray executable, e.g., /opt/conda/bin/ray
40-
SKY_RAY_CMD = (f'$([ -s {SKY_RAY_PATH_FILE} ] && '
40+
# We need to add SKY_PYTHON_CMD before ray executable because:
41+
# The ray executable is a python script with a header like:
42+
# #!/opt/conda/bin/python3
43+
# When we create the skypilot-runtime venv, the previously installed ray
44+
# executable will be reused (due to --system-site-packages), and that will cause
45+
# running ray CLI commands to use the wrong python executable.
46+
SKY_RAY_CMD = (f'{SKY_PYTHON_CMD} $([ -s {SKY_RAY_PATH_FILE} ] && '
4147
f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)')
48+
# Separate env for SkyPilot runtime dependencies.
49+
SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime'
50+
SKY_REMOTE_PYTHON_ENV = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
51+
ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
4252

4353
# The name for the environment variable that stores the unique ID of the
4454
# current task. This will stay the same across multiple recoveries of the
@@ -91,20 +101,27 @@
91101
# AWS's Deep Learning AMI's default conda environment.
92102
CONDA_INSTALLATION_COMMANDS = (
93103
'which conda > /dev/null 2>&1 || '
94-
'{ wget -nc https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -O Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long
104+
'{ curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long
95105
'bash Miniconda3-Linux-x86_64.sh -b && '
96106
'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && '
97107
'conda config --set auto_activate_base true && '
98-
# Use $(echo ~) instead of ~ to avoid the error "no such file or directory".
99-
# Also, not using $HOME to avoid the error HOME variable not set.
100-
f'echo "$(echo ~)/miniconda3/bin/python" > {SKY_PYTHON_PATH_FILE}; }}; '
108+
f'conda activate base; }}; '
101109
'grep "# >>> conda initialize >>>" ~/.bashrc || '
102110
'{ conda init && source ~/.bashrc; };'
103-
'(type -a python | grep -q python3) || '
104-
'echo \'alias python=python3\' >> ~/.bashrc;'
105-
'(type -a pip | grep -q pip3) || echo \'alias pip=pip3\' >> ~/.bashrc;'
106-
# Writes Python path to file if it does not exist or the file is empty.
107-
f'[ -s {SKY_PYTHON_PATH_FILE} ] || which python3 > {SKY_PYTHON_PATH_FILE};')
111+
# If Python version is greater than or equal to 3.12, create a new conda env
112+
# with Python 3.10.
113+
# We don't use a separate conda env for SkyPilot dependencies because it is
114+
# costly to create a new conda env, and venv should be a lightweight and
115+
# faster alternative when the python version satisfies the requirement.
116+
'[[ $(python3 --version | cut -d " " -f 2 | cut -d "." -f 2) -ge 12 ]] && '
117+
f'echo "Creating conda env with Python 3.10" && '
118+
f'conda create -y -n {SKY_REMOTE_PYTHON_ENV_NAME} python=3.10 && '
119+
f'conda activate {SKY_REMOTE_PYTHON_ENV_NAME};'
120+
# Create a separate venv for SkyPilot runtime dependencies.
121+
f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || '
122+
f'{{ {SKY_PYTHON_CMD} -m venv {SKY_REMOTE_PYTHON_ENV} --system-site-packages && '
123+
f'echo "$(echo {SKY_REMOTE_PYTHON_ENV})/bin/python" > {SKY_PYTHON_PATH_FILE}; }};'
124+
)
108125

109126
_sky_version = str(version.parse(sky.__version__))
110127
RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} {SKY_RAY_CMD} status'
@@ -142,7 +159,9 @@
142159
# mentioned above are resolved.
143160
'export PATH=$PATH:$HOME/.local/bin; '
144161
# Writes ray path to file if it does not exist or the file is empty.
145-
f'[ -s {SKY_RAY_PATH_FILE} ] || which ray > {SKY_RAY_PATH_FILE}; '
162+
f'[ -s {SKY_RAY_PATH_FILE} ] || '
163+
f'{{ {ACTIVATE_SKY_REMOTE_PYTHON_ENV} && '
164+
f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; '
146165
# END ray package check and installation
147166
f'{{ {SKY_PIP_CMD} list | grep "skypilot " && '
148167
'[ "$(cat ~/.sky/wheels/current_sky_wheel_hash)" == "{sky_wheel_hash}" ]; } || ' # pylint: disable=line-too-long

sky/skylet/events.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import os
44
import re
55
import subprocess
6-
import sys
76
import time
87
import traceback
98

@@ -193,7 +192,10 @@ def _stop_cluster(self, autostop_config):
193192
# Passing env inherited from os.environ is technically not
194193
# needed, because we call `python <script>` rather than `ray
195194
# <cmd>`. We just need the {RAY_USAGE_STATS_ENABLED: 0} part.
196-
subprocess.run([sys.executable, script], check=True, env=env)
195+
subprocess.run(f'{constants.SKY_PYTHON_CMD} {script}',
196+
check=True,
197+
shell=True,
198+
env=env)
197199

198200
logger.info('Running ray down.')
199201
# Stop the workers first to avoid orphan workers.
@@ -206,6 +208,15 @@ def _stop_cluster(self, autostop_config):
206208
# <cmd>`.
207209
env=env)
208210

211+
# Stop the ray autoscaler to avoid scaling up, during
212+
# stopping/terminating of the cluster. We do not rely on `ray down`
213+
# below for stopping ray cluster, as it will not use the correct
214+
# ray path.
215+
logger.info('Stopping the ray cluster.')
216+
subprocess.run(f'{constants.SKY_RAY_CMD} stop',
217+
shell=True,
218+
check=True)
219+
209220
logger.info('Running final ray down.')
210221
subprocess.run(
211222
f'{constants.SKY_RAY_CMD} down -y {config_path}',

sky/templates/azure-ray.yml.j2

+2-2
Original file line numberDiff line numberDiff line change
@@ -164,14 +164,14 @@ setup_commands:
164164
# current num items (num SSH connections): 2
165165
head_start_ray_commands:
166166
# NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
167-
- {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
167+
- {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
168168
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
169169
{{dump_port_command}};
170170
{{ray_head_wait_initialized_command}}
171171

172172
{%- if num_nodes > 1 %}
173173
worker_start_ray_commands:
174-
- {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
174+
- {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--num-gpus=%s" % num_gpus if num_gpus}} {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
175175
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
176176
{%- else %}
177177
worker_start_ray_commands: []

sky/templates/ibm-ray.yml.j2

+2-2
Original file line numberDiff line numberDiff line change
@@ -118,13 +118,13 @@ head_start_ray_commands:
118118
# NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
119119
# Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires
120120
# all the sessions to be reloaded. This is a workaround.
121-
- {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
121+
- {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
122122
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
123123
{{dump_port_command}}; {{ray_head_wait_initialized_command}}
124124

125125
{%- if num_nodes > 1 %}
126126
worker_start_ray_commands:
127-
- {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
127+
- {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
128128
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
129129
{%- else %}
130130
worker_start_ray_commands: []

0 commit comments

Comments
 (0)