@@ -20,11 +20,12 @@
 
 import abc
 from typing import List
+from pathlib import Path
 
 from ray.job_submission import JobSubmissionClient
 from torchx.components.dist import ddp
 from torchx.runner import get_runner, Runner
-from torchx.specs import AppHandle, parse_app_handle
+from torchx.specs import AppHandle, parse_app_handle, AppDryRunInfo
 
 from .config import JobConfiguration
 
@@ -35,16 +36,19 @@
 all_jobs: List["Job"] = []
 torchx_runner: Runner = get_runner()
 
-torchx_runner.run(app, scheduler, cfg, workspace)
-
-torchx_runner.run_component(component, component_args, scheduler, cfg, workspace)
-
 class JobDefinition(metaclass=abc.ABCMeta):
     """
     A job definition to be submitted to a generic backend cluster.
     """
 
-    def submit(self, cluster: Cluster):
+    def _dry_run(self, cluster: "Cluster"):
+        """
+        Create the job definition, but do not submit it.
+
+        The primary purpose of this function is to facilitate unit testing.
+        """
+
+    def submit(self, cluster: "Cluster"):
         """
         Method for creating a job on a specific cluster.
         """
@@ -76,59 +80,70 @@ def __init__(self, config: JobConfiguration):
         """
         self.config = config
 
-    def submit(self, cluster: Cluster) -> "TorchXRayJob":
-        """
-        Submit the job definition to a specific cluster, resulting in a Job object.
+    def _dry_run(self, cluster: "Cluster", *script_args) -> AppDryRunInfo:
         """
-        return TorchXRayJob(self, cluster)
-
+        Create the job definition, but do not submit it.
 
-class TorchXRayJob(Job):
-    """
-    Active submission of a dist.ddp job to a Ray cluster which can be used to get logs and status.
-    """
-    def __init__(self, job_definition: TorchXJobDefinition, cluster: Cluster, *script_args):
+        The primary purpose of this function is to facilitate unit testing.
         """
-        TODO
-        """
-        self.job_definition: TorchXJobDefinition = job_definition
-        self.cluster: Cluster = cluster
         j = f"{cluster.config.max_worker}x{max(cluster.config.gpu, 1)}"  # {nnodes}x{nproc_per_node}; one process per GPU
-        # TODO: allow user to override resource allocation for job
-        _app_handle: AppHandle = torchx_runner.run(
+        dashboard_address = f"{cluster.cluster_dashboard_uri(cluster.config.namespace).removeprefix('http://')}:8265"
+        return torchx_runner.dryrun(
             app=ddp(
                 *script_args,
-                script=job_definition.config.script,
-                m=None,  # python module to run (might be worth exposing)
-                name=job_definition.config.name,
+                script=self.config.script,
+                m=self.config.m,
+                name=self.config.name,
                 h=None,  # for custom resource types
-                cpu=cluster.config.max_cpus,  # TODO: get from cluster config
+                cpu=cluster.config.max_cpus,
                 gpu=cluster.config.gpu,
                 memMB=1024 * cluster.config.max_memory,  # cluster memory is in GB
                 j=j,
-                env=None,  # TODO: should definitely expose Dict[str, str]
-                max_retries=0,  # TODO: maybe expose
-                mounts=None,  # TODO: maybe expose
-                debug=False  # TODO: expose
+                env=self.config.env,
+                # max_retries=0,  # default
+                # mounts=None,  # default
             ),
-            scheduler="ray", cfg="fo",
+            scheduler="ray",  # can be determined by type of cluster if more are introduced
+            cfg={
+                "cluster_name": cluster.config.name,
+                "dashboard_address": "localhost:8265",  # dashboard_address,
+                "working_dir": self.config.working_dir,
+                "requirements": self.config.requirements,
+            },
+            workspace=f"file://{Path.cwd()}"
         )
 
-        _, _, self.job_id = parse_app_handle(_app_handle)
+    def submit(self, cluster: "Cluster") -> "TorchXRayJob":
+        """
+        Submit the job definition to a specific cluster, resulting in a Job object.
+        """
+        return TorchXRayJob(self, cluster)
+
+
+class TorchXRayJob(Job):
+    """
+    Active submission of a dist.ddp job to a Ray cluster; can be used to get logs and status.
+    """
+    def __init__(self, job_definition: TorchXJobDefinition, cluster: "Cluster", *script_args):
+        """
+        Creates a job which maximizes resource usage on the passed cluster.
+        """
+        self.job_definition: TorchXJobDefinition = job_definition
+        self.cluster: "Cluster" = cluster
+        # dashboard_address = f"{self.cluster.cluster_dashboard_uri(self.cluster.config.namespace).lstrip('http://')}:8265"
+        self._app_handle = torchx_runner.schedule(job_definition._dry_run(cluster, *script_args))
+        _, _, self.job_id = parse_app_handle(self._app_handle)
+        # self.job_id = self.job_id.lstrip(f"{dashboard_address}-")
         all_jobs.append(self)
 
-    def status(self):
+    def status(self) -> str:
         """
-        TODO
+        Get the running job's status.
         """
-        dashboard_route = self.cluster.cluster_dashboard_uri()
-        client = JobSubmissionClient(dashboard_route)
-        return client.get_job_status(self.job_id)
+        return str(torchx_runner.status(self._app_handle))
 
-    def logs(self):
+    def logs(self) -> str:
         """
-        TODO
+        Get the job's logs.
         """
-        dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace)
-        client = JobSubmissionClient(dashboard_route)
-        return client.get_job_logs(job_id)
+        return "".join(torchx_runner.log_lines(self._app_handle, None))