Skip to content

Commit aea6ccd

Browse files
committed
Added namespace retrieval and dashboard route access via kubernetes
1 parent 4618029 commit aea6ccd

File tree

2 files changed

+50
-19
lines changed

2 files changed

+50
-19
lines changed

src/codeflare_sdk/cluster/cluster.py

Lines changed: 48 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def create_app_wrapper(self):
6969
"""
7070

7171
if self.config.namespace is None:
72-
self.config.namespace = oc.get_project_name()
72+
self.config.namespace = get_current_namespace()
7373
if type(self.config.namespace) is not str:
7474
raise TypeError(
7575
f"Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication."
@@ -267,16 +267,21 @@ def cluster_dashboard_uri(self) -> str:
267267
Returns a string containing the cluster's dashboard URI.
268268
"""
269269
try:
270-
with oc.project(self.config.namespace):
271-
route = oc.invoke(
272-
"get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"]
273-
)
274-
route = route.out().split(" ")
275-
route = [x for x in route if f"ray-dashboard-{self.config.name}" in x]
276-
route = route[0].strip().strip("'")
277-
return f"http://{route}"
270+
config.load_kube_config()
271+
api_instance = client.CustomObjectsApi()
272+
routes = api_instance.list_namespaced_custom_object(
273+
group="route.openshift.io",
274+
version="v1",
275+
namespace=self.config.namespace,
276+
plural="routes",
277+
)
278278
except:
279-
return "Dashboard route not available yet, have you run cluster.up()?"
279+
pass
280+
281+
for route in routes["items"]:
282+
if route["metadata"]["name"] == f"ray-dashboard-{self.config.name}":
283+
return f"http://{route['spec']['host']}"
284+
return "Dashboard route not available yet, have you run cluster.up()?"
280285

281286
def list_jobs(self) -> List:
282287
"""
@@ -340,6 +345,19 @@ def list_all_queued(namespace: str, print_to_console: bool = True):
340345
return app_wrappers
341346

342347

348+
def get_current_namespace():
    """Return the namespace of the active kubeconfig context.

    The active context is read with ``config.list_kube_config_contexts()``.
    If that context (or its ``namespace`` entry) is missing, the string
    ``"default"`` is returned instead.

    Returns:
        The namespace recorded in the active context, or ``"default"``.

    Raises:
        PermissionError: If listing the kube config contexts raises
            ``config.ConfigException``. The original exception is chained
            as ``__cause__`` so the root failure stays visible.
    """
    try:
        _, active_context = config.list_kube_config_contexts()
    except config.ConfigException as err:
        # Chain the underlying error so callers can see why the lookup failed.
        raise PermissionError(
            "Retrieving current namespace not permitted, have you put in correct/up-to-date auth credentials?"
        ) from err
    try:
        return active_context["context"]["namespace"]
    except KeyError:
        # No "context"/"namespace" key on the active context: fall back.
        return "default"
359+
360+
343361
# private methods
344362

345363

@@ -469,12 +487,25 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
469487
else:
470488
status = RayClusterStatus.UNKNOWN
471489

472-
with oc.project(rc["metadata"]["namespace"]), oc.timeout(10 * 60):
473-
route = (
474-
oc.selector(f"route/ray-dashboard-{rc['metadata']['name']}")
475-
.object()
476-
.model.spec.host
477-
)
490+
config.load_kube_config()
491+
api_instance = client.CustomObjectsApi()
492+
routes = api_instance.list_namespaced_custom_object(
493+
group="route.openshift.io",
494+
version="v1",
495+
namespace=rc["metadata"]["namespace"],
496+
plural="routes",
497+
)
498+
ray_route = None
499+
for route in routes["items"]:
500+
if route["metadata"]["name"] == f"ray-dashboard-{rc['metadata']['name']}":
501+
ray_route = route["spec"]["host"]
502+
503+
# with oc.project(rc["metadata"]["namespace"]), oc.timeout(10 * 60):
504+
# route = (
505+
# oc.selector(f"route/ray-dashboard-{rc['metadata']['name']}")
506+
# .object()
507+
# .model.spec.host
508+
# )
478509

479510
return RayCluster(
480511
name=rc["metadata"]["name"],
@@ -493,7 +524,7 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
493524
]["resources"]["limits"]["cpu"],
494525
worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for
495526
namespace=rc["metadata"]["namespace"],
496-
dashboard=route,
527+
dashboard=ray_route,
497528
)
498529

499530

src/codeflare_sdk/job/jobs.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,13 @@
1717
from typing import TYPE_CHECKING, Optional, Dict, List
1818
from pathlib import Path
1919

20-
import openshift as oc
2120
from torchx.components.dist import ddp
2221
from torchx.runner import get_runner
2322
from torchx.specs import AppHandle, parse_app_handle, AppDryRunInfo
2423

2524
if TYPE_CHECKING:
2625
from ..cluster.cluster import Cluster
26+
from ..cluster.cluster import get_current_namespace
2727

2828
all_jobs: List["Job"] = []
2929
torchx_runner = get_runner()
@@ -124,7 +124,7 @@ def _missing_spec(self, spec: str):
124124
def _dry_run_no_cluster(self):
125125
if self.scheduler_args is not None:
126126
if self.scheduler_args.get("namespace") is None:
127-
self.scheduler_args["namespace"] = oc.get_project_name()
127+
self.scheduler_args["namespace"] = get_current_namespace()
128128
return torchx_runner.dryrun(
129129
app=ddp(
130130
*self.script_args,

0 commit comments

Comments
 (0)