Skip to content

Commit a92fa56

Browse files
romilbhardwaj authored and SalikovAlex committed
[Core] Exit with non-zero code on launch/exec/logs/jobs launch/jobs logs (skypilot-org#4846)
* Support return code based on job success/failure
* Return exit code for tailing managed jobs
* Fixes
* lint
* Create JobExitCode enum
* Get JobExitCode from ManagedJobStatus
* lint
* cleanup
* cleanup
* Add tests
* lint
* Managed jobs back compat
* Skylet backward compatibility
* lint
* Update logs --status returncodes
* Update logs --status returncodes
* lint
* fix retcode
* Fix tests
* lint
* Fix --no-follow
* Fix cli docs rendering
* minor
* rename ret_code to returncode
* rename SUCCESS to SUCCEEDED
* Refactor JobExitCode to exceptions
* lint
1 parent a0062c5 commit a92fa56

15 files changed

+296
-66
lines changed

sky/backends/cloud_vm_ray_backend.py

+16-8
Original file line numberDiff line numberDiff line change
@@ -3823,6 +3823,10 @@ def tail_logs(self,
38233823
follow: Whether to follow the logs.
38243824
tail: The number of lines to display from the end of the
38253825
log file. If 0, print all lines.
3826+
3827+
Returns:
3828+
The exit code of the tail command. Returns code 100 if the job has
3829+
failed. See exceptions.JobExitCode for possible return codes.
38263830
"""
38273831
code = job_lib.JobLibCodeGen.tail_logs(job_id,
38283832
managed_job_id=managed_job_id,
@@ -3856,7 +3860,7 @@ def tail_managed_job_logs(self,
38563860
job_id: Optional[int] = None,
38573861
job_name: Optional[str] = None,
38583862
controller: bool = False,
3859-
follow: bool = True) -> None:
3863+
follow: bool = True) -> int:
38603864
# if job_name is not None, job_id should be None
38613865
assert job_name is None or job_id is None, (job_name, job_id)
38623866
code = managed_jobs.ManagedJobCodeGen.stream_logs(
@@ -3869,13 +3873,17 @@ def tail_managed_job_logs(self,
38693873
signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
38703874

38713875
# Refer to the notes in tail_logs.
3872-
self.run_on_head(
3873-
handle,
3874-
code,
3875-
stream_logs=True,
3876-
process_stream=False,
3877-
ssh_mode=command_runner.SshMode.INTERACTIVE,
3878-
)
3876+
try:
3877+
returncode = self.run_on_head(
3878+
handle,
3879+
code,
3880+
stream_logs=True,
3881+
process_stream=False,
3882+
ssh_mode=command_runner.SshMode.INTERACTIVE,
3883+
)
3884+
except SystemExit as e:
3885+
returncode = e.code
3886+
return returncode
38793887

38803888
def sync_down_managed_job_logs(
38813889
self,

sky/cli.py

+34-17
Original file line numberDiff line numberDiff line change
@@ -1227,11 +1227,15 @@ def launch(
12271227
clusters=[handle.get_cluster_name()])
12281228
# job_id will be None if no job was submitted (e.g. no entrypoint
12291229
# provided)
1230+
returncode = 0
12301231
if not detach_run and job_id is not None:
1231-
sdk.tail_logs(handle.get_cluster_name(), job_id, follow=True)
1232+
returncode = sdk.tail_logs(handle.get_cluster_name(),
1233+
job_id,
1234+
follow=True)
12321235
click.secho(
12331236
ux_utils.command_hint_messages(ux_utils.CommandHintType.CLUSTER_JOB,
12341237
job_id, handle.get_cluster_name()))
1238+
sys.exit(returncode)
12351239

12361240

12371241
@cli.command(cls=_DocumentedCodeCommand)
@@ -1377,7 +1381,8 @@ def exec(cluster: Optional[str], cluster_option: Optional[str],
13771381
job_id_handle = _async_call_or_wait(request_id, async_call, 'sky.exec')
13781382
if not async_call and not detach_run:
13791383
job_id, _ = job_id_handle
1380-
sdk.tail_logs(cluster, job_id, follow=True)
1384+
returncode = sdk.tail_logs(cluster, job_id, follow=True)
1385+
sys.exit(returncode)
13811386

13821387

13831388
def _handle_jobs_queue_request(
@@ -2121,12 +2126,20 @@ def logs(
21212126
one job_id can be provided.
21222127
21232128
2. If ``--status`` is specified, print the status of the job and exit with
2124-
returncode 0 if the job succeeded, or 1 otherwise. At most one job_id can
2125-
be specified.
2129+
returncode 0 if the job succeeded. At most one job_id can
2130+
be specified. Other possible return codes:
2131+
2132+
- 100: job failed.
2133+
- 101: job not finished.
2134+
- 102: job not found.
2135+
- 103: job was cancelled by the user.
21262136
21272137
3. If ``--sync-down`` is specified, the logs of the job will be downloaded
21282138
from the cluster and saved to the local machine under
2129-
``~/sky_logs``. Mulitple job_ids can be specified.
2139+
``~/sky_logs``. Multiple job_ids can be specified.
2140+
2141+
4. If the job fails or fetching the logs fails, the command will exit with
2142+
a non-zero return code.
21302143
"""
21312144
if sync_down and status:
21322145
raise click.UsageError(
@@ -2174,17 +2187,18 @@ def logs(
21742187
# it will return {None: None}.
21752188
if job_id is None:
21762189
click.secho(f'No job found on cluster {cluster!r}.', fg='red')
2177-
sys.exit(1)
2190+
sys.exit(exceptions.JobExitCode.NOT_FOUND)
21782191
job_status = list(job_statuses.values())[0]
21792192
job_status_str = job_status.value if job_status is not None else 'None'
21802193
click.echo(f'Job {job_id}: {job_status_str}')
21812194
if job_status == job_lib.JobStatus.SUCCEEDED:
21822195
return
21832196
else:
2197+
returncode = exceptions.JobExitCode.from_job_status(job_status)
21842198
if job_status is None:
21852199
id_str = '' if job_id is None else f'{job_id} '
21862200
click.secho(f'Job {id_str}not found', fg='red')
2187-
sys.exit(1)
2201+
sys.exit(returncode)
21882202

21892203
job_str = f'job {job_id}'
21902204
if job_id is None:
@@ -2194,7 +2208,8 @@ def logs(
21942208
f'{colorama.Style.RESET_ALL}')
21952209

21962210
# Stream logs from the server.
2197-
sdk.tail_logs(cluster, job_id, follow, tail=tail)
2211+
returncode = sdk.tail_logs(cluster, job_id, follow, tail=tail)
2212+
sys.exit(returncode)
21982213

21992214

22002215
@cli.command()
@@ -3893,10 +3908,11 @@ def jobs_launch(
38933908
'sky.jobs.launch')
38943909
if not async_call and not detach_run:
38953910
job_id = job_id_handle[0]
3896-
managed_jobs.tail_logs(name=None,
3897-
job_id=job_id,
3898-
follow=True,
3899-
controller=False)
3911+
returncode = managed_jobs.tail_logs(name=None,
3912+
job_id=job_id,
3913+
follow=True,
3914+
controller=False)
3915+
sys.exit(returncode)
39003916

39013917

39023918
@jobs.command('queue', cls=_DocumentedCodeCommand)
@@ -4127,11 +4143,12 @@ def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
41274143
logger.info(f'{fore.CYAN}Job {job} logs{controller_str}: '
41284144
f'{log_local_path}{style.RESET_ALL}')
41294145
else:
4130-
managed_jobs.tail_logs(name=name,
4131-
job_id=job_id,
4132-
follow=follow,
4133-
controller=controller,
4134-
refresh=refresh)
4146+
returncode = managed_jobs.tail_logs(name=name,
4147+
job_id=job_id,
4148+
follow=follow,
4149+
controller=controller,
4150+
refresh=refresh)
4151+
sys.exit(returncode)
41354152
except exceptions.ClusterNotUpError:
41364153
with ux_utils.print_exception_no_traceback():
41374154
raise

sky/client/sdk.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -583,7 +583,7 @@ def tail_logs(cluster_name: str,
583583
job_id: Optional[int],
584584
follow: bool,
585585
tail: int = 0,
586-
output_stream: Optional['io.TextIOBase'] = None) -> None:
586+
output_stream: Optional['io.TextIOBase'] = None) -> int:
587587
"""Tails the logs of a job.
588588
589589
Args:
@@ -596,7 +596,9 @@ def tail_logs(cluster_name: str,
596596
console.
597597
598598
Returns:
599-
None
599+
Exit code based on success or failure of the job. 0 if success,
600+
100 if the job failed. See exceptions.JobExitCode for possible exit
601+
codes.
600602
601603
Request Raises:
602604
ValueError: if arguments are invalid or the cluster is not supported.
@@ -622,7 +624,7 @@ def tail_logs(cluster_name: str,
622624
timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
623625
None))
624626
request_id = server_common.get_request_id(response)
625-
stream_response(request_id, response, output_stream)
627+
return stream_response(request_id, response, output_stream)
626628

627629

628630
@usage_lib.entrypoint

sky/core.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -826,7 +826,7 @@ def cancel(
826826
def tail_logs(cluster_name: str,
827827
job_id: Optional[int],
828828
follow: bool = True,
829-
tail: int = 0) -> None:
829+
tail: int = 0) -> int:
830830
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
831831
"""Tails the logs of a job.
832832
@@ -842,6 +842,12 @@ def tail_logs(cluster_name: str,
842842
not the same as the user who created the cluster.
843843
sky.exceptions.CloudUserIdentityError: if we fail to get the current
844844
user identity.
845+
846+
Returns:
847+
Return code based on success or failure of the job. 0 if success,
848+
100 if the job failed. Note: This is not the return code of the job
849+
script.
850+
845851
"""
846852
# Check the status of the cluster.
847853
handle = backend_utils.check_cluster_available(
@@ -851,7 +857,7 @@ def tail_logs(cluster_name: str,
851857
backend = backend_utils.get_backend_from_handle(handle)
852858

853859
usage_lib.record_cluster_name_for_current_operation(cluster_name)
854-
backend.tail_logs(handle, job_id, follow=follow, tail=tail)
860+
return backend.tail_logs(handle, job_id, follow=follow, tail=tail)
855861

856862

857863
@usage_lib.entrypoint

sky/exceptions.py

+80-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
from sky.utils import env_options
1010

1111
if typing.TYPE_CHECKING:
12+
from sky import jobs as managed_jobs
1213
from sky.backends import backend
14+
from sky.skylet import job_lib
1315
from sky.utils import status_lib
1416

1517
# Return code for keyboard interruption and SIGTSTP
@@ -236,7 +238,7 @@ def __init__(self, returncode: int, command: str, error_msg: str,
236238
else:
237239
if (len(command) > 100 and
238240
not env_options.Options.SHOW_DEBUG_INFO.get()):
239-
# Chunck the command to avoid overflow.
241+
# Chunk the command to avoid overflow.
240242
command = command[:100] + '...'
241243
message = (f'Command {command} failed with return code '
242244
f'{returncode}.\n{error_msg}')
@@ -449,3 +451,80 @@ def __init__(self, server_url: str):
449451
f'Could not connect to SkyPilot API server at {server_url}. '
450452
f'Please ensure that the server is running. '
451453
f'Try: curl {server_url}/api/health')
454+
455+
456+
class JobExitCode(enum.IntEnum):
    """Job exit code enum.

    These codes are used as return codes for job-related operations and as
    process exit codes to indicate job status.
    """

    SUCCEEDED = 0
    """The job completed successfully"""

    FAILED = 100
    """The job failed (due to user code, setup, or driver failure)"""

    NOT_FINISHED = 101
    """The job has not finished yet"""

    NOT_FOUND = 102
    """The job was not found"""

    CANCELLED = 103
    """The job was cancelled by the user"""

    @classmethod
    def from_job_status(cls,
                        status: Optional['job_lib.JobStatus']) -> 'JobExitCode':
        """Convert a job status to an exit code.

        Args:
            status: The job status to convert, or None if no job was found.

        Returns:
            NOT_FOUND if ``status`` is None, NOT_FINISHED if the status is not
            terminal, SUCCEEDED / CANCELLED for the matching terminal
            statuses, and FAILED for every other terminal status.
        """
        if status is None:
            return cls.NOT_FOUND

        if not status.is_terminal():
            return cls.NOT_FINISHED

        # Import here to avoid circular imports. The import is deferred until
        # the enum members are actually compared, so the guards above do not
        # depend on it.
        # pylint: disable=import-outside-toplevel
        from sky.skylet import job_lib

        if status == job_lib.JobStatus.SUCCEEDED:
            return cls.SUCCEEDED

        if status == job_lib.JobStatus.CANCELLED:
            return cls.CANCELLED

        # Every remaining terminal status (user-code failure states,
        # FAILED_DRIVER, or anything unexpected) is reported as a failure.
        return cls.FAILED

    @classmethod
    def from_managed_job_status(
            cls,
            status: Optional['managed_jobs.ManagedJobStatus']) -> 'JobExitCode':
        """Convert a managed job status to an exit code.

        Args:
            status: The managed job status to convert, or None if no job was
                found.

        Returns:
            NOT_FOUND if ``status`` is None, NOT_FINISHED if the status is not
            terminal, SUCCEEDED / CANCELLED for the matching terminal
            statuses, and FAILED for every other terminal status.
        """
        if status is None:
            return cls.NOT_FOUND

        if not status.is_terminal():
            return cls.NOT_FINISHED

        # Import here to avoid circular imports. The import is deferred until
        # the enum members are actually compared, so the guards above do not
        # depend on it.
        # pylint: disable=import-outside-toplevel
        from sky import jobs as managed_jobs

        if status == managed_jobs.ManagedJobStatus.SUCCEEDED:
            return cls.SUCCEEDED

        if status == managed_jobs.ManagedJobStatus.CANCELLED:
            return cls.CANCELLED

        # Every remaining terminal status (failed states, or anything
        # unexpected) is reported as a failure.
        return cls.FAILED

sky/jobs/client/sdk.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ def tail_logs(name: Optional[str] = None,
184184
follow: bool = True,
185185
controller: bool = False,
186186
refresh: bool = False,
187-
output_stream: Optional['io.TextIOBase'] = None) -> None:
187+
output_stream: Optional['io.TextIOBase'] = None) -> int:
188188
"""Tails logs of managed jobs.
189189
190190
You can provide either a job name or a job ID to tail logs. If both are not
@@ -199,6 +199,11 @@ def tail_logs(name: Optional[str] = None,
199199
output_stream: The stream to write the logs to. If None, print to the
200200
console.
201201
202+
Returns:
203+
Exit code based on success or failure of the job. 0 if success,
204+
100 if the job failed. See exceptions.JobExitCode for possible exit
205+
codes.
206+
202207
Request Raises:
203208
ValueError: invalid arguments.
204209
sky.exceptions.ClusterNotUpError: the jobs controller is not up.
@@ -217,7 +222,7 @@ def tail_logs(name: Optional[str] = None,
217222
timeout=(5, None),
218223
)
219224
request_id = server_common.get_request_id(response)
220-
sdk.stream_response(request_id, response, output_stream)
225+
return sdk.stream_response(request_id, response, output_stream)
221226

222227

223228
@usage_lib.entrypoint

sky/jobs/constants.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
# The version of the lib files that jobs/utils use. Whenever there is an API
4141
# change for the jobs/utils, we need to bump this version and update
4242
# job.utils.ManagedJobCodeGen to handle the version update.
43-
MANAGED_JOBS_VERSION = 2
43+
MANAGED_JOBS_VERSION = 3
4444

4545
# The command for setting up the jobs dashboard on the controller. It firstly
4646
# checks if the systemd services are available, and if not (e.g., Kubernetes

sky/jobs/server/core.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -460,12 +460,17 @@ def cancel(name: Optional[str] = None,
460460

461461
@usage_lib.entrypoint
462462
def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
463-
controller: bool, refresh: bool) -> None:
463+
controller: bool, refresh: bool) -> int:
464464
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
465465
"""Tail logs of managed jobs.
466466
467467
Please refer to sky.cli.job_logs for documentation.
468468
469+
Returns:
470+
Exit code based on success or failure of the job. 0 if success,
471+
100 if the job failed. See exceptions.JobExitCode for possible exit
472+
codes.
473+
469474
Raises:
470475
ValueError: invalid arguments.
471476
sky.exceptions.ClusterNotUpError: the jobs controller is not up.
@@ -494,11 +499,11 @@ def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
494499
backend = backend_utils.get_backend_from_handle(handle)
495500
assert isinstance(backend, backends.CloudVmRayBackend), backend
496501

497-
backend.tail_managed_job_logs(handle,
498-
job_id=job_id,
499-
job_name=name,
500-
follow=follow,
501-
controller=controller)
502+
return backend.tail_managed_job_logs(handle,
503+
job_id=job_id,
504+
job_name=name,
505+
follow=follow,
506+
controller=controller)
502507

503508

504509
def start_dashboard_forwarding(refresh: bool = False) -> Tuple[int, int]:

0 commit comments

Comments
 (0)