diff --git a/.github/workflows/ui_notebooks_test.yaml b/.github/workflows/ui_notebooks_test.yaml
index 864330b9c..5e8d506d1 100644
--- a/.github/workflows/ui_notebooks_test.yaml
+++ b/.github/workflows/ui_notebooks_test.yaml
@@ -86,7 +86,8 @@ jobs:
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 3_widget_example.ipynb > 3_widget_example.ipynb.tmp && mv 3_widget_example.ipynb.tmp 3_widget_example.ipynb
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 3_widget_example.ipynb > 3_widget_example.ipynb.tmp && mv 3_widget_example.ipynb.tmp 3_widget_example.ipynb
# Set explicit namespace as SDK need it (currently) to resolve local queues
- sed -i "s/head_memory_limits=2,/head_memory_limits=2, namespace='default',/" 3_widget_example.ipynb
+ sed -i "s|head_memory_limits=2,|head_memory_limits=2, namespace='default', image='quay.io/modh/ray:2.35.0-py39-cu121',|" 3_widget_example.ipynb
+ sed -i "s|view_clusters()|view_clusters('default')|" 3_widget_example.ipynb
working-directory: demo-notebooks/guided-demos
- name: Run UI notebook tests
diff --git a/demo-notebooks/guided-demos/3_widget_example.ipynb b/demo-notebooks/guided-demos/3_widget_example.ipynb
index 4d3d6ea70..11521ec72 100644
--- a/demo-notebooks/guided-demos/3_widget_example.ipynb
+++ b/demo-notebooks/guided-demos/3_widget_example.ipynb
@@ -19,7 +19,7 @@
"outputs": [],
"source": [
"# Import pieces from codeflare-sdk\n",
- "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication"
+ "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication, view_clusters"
]
},
{
@@ -61,7 +61,7 @@
"# Create and configure our cluster object\n",
"# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
"cluster = Cluster(ClusterConfiguration(\n",
- " name='raytest', \n",
+ " name='raytest',\n",
" head_cpu_requests='500m',\n",
" head_cpu_limits='500m',\n",
" head_memory_requests=2,\n",
@@ -73,12 +73,22 @@
" worker_cpu_limits=1,\n",
" worker_memory_requests=2,\n",
" worker_memory_limits=2,\n",
- " # image=\"\", # Optional Field \n",
- " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
+ " # image=\"\", # Optional Field\n",
+ " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources\n",
" # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
"))"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3de6403c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "view_clusters()"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
@@ -106,7 +116,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.19"
+ "version": "3.9.18"
},
"vscode": {
"interpreter": {
diff --git a/src/codeflare_sdk.egg-info/SOURCES.txt b/src/codeflare_sdk.egg-info/SOURCES.txt
index 42541f1d2..63614a814 100644
--- a/src/codeflare_sdk.egg-info/SOURCES.txt
+++ b/src/codeflare_sdk.egg-info/SOURCES.txt
@@ -12,9 +12,11 @@ src/codeflare_sdk/cluster/awload.py
src/codeflare_sdk/cluster/cluster.py
src/codeflare_sdk/cluster/config.py
src/codeflare_sdk/cluster/model.py
+src/codeflare_sdk/cluster/widgets.py
src/codeflare_sdk/job/__init__.py
src/codeflare_sdk/job/ray_jobs.py
src/codeflare_sdk/utils/__init__.py
+src/codeflare_sdk/utils/demos.py
src/codeflare_sdk/utils/generate_cert.py
src/codeflare_sdk/utils/generate_yaml.py
src/codeflare_sdk/utils/kube_api_helpers.py
diff --git a/src/codeflare_sdk/__init__.py b/src/codeflare_sdk/__init__.py
index 0390a3d2f..29205a36e 100644
--- a/src/codeflare_sdk/__init__.py
+++ b/src/codeflare_sdk/__init__.py
@@ -14,6 +14,7 @@
get_cluster,
list_all_queued,
list_all_clusters,
+ view_clusters,
)
from .job import RayJobClient
diff --git a/src/codeflare_sdk/cluster/__init__.py b/src/codeflare_sdk/cluster/__init__.py
index 0b1849e51..6490a2247 100644
--- a/src/codeflare_sdk/cluster/__init__.py
+++ b/src/codeflare_sdk/cluster/__init__.py
@@ -21,4 +21,8 @@
list_all_clusters,
)
+from .widgets import (
+ view_clusters,
+)
+
from .awload import AWManager
diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py
index 7c652a186..a32d5a4b7 100644
--- a/src/codeflare_sdk/cluster/cluster.py
+++ b/src/codeflare_sdk/cluster/cluster.py
@@ -19,6 +19,7 @@
"""
import re
+import subprocess
from time import sleep
from typing import List, Optional, Tuple, Dict
@@ -862,16 +863,19 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
name=rc["metadata"]["name"],
status=status,
# for now we are not using autoscaling so same replicas is fine
- workers=rc["spec"]["workerGroupSpecs"][0]["replicas"],
+ num_workers=rc["spec"]["workerGroupSpecs"][0]["replicas"],
worker_mem_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["limits"]["memory"],
worker_mem_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["requests"]["memory"],
- worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
- 0
- ]["resources"]["limits"]["cpu"],
+ worker_cpu_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+ "containers"
+ ][0]["resources"]["requests"]["cpu"],
+ worker_cpu_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+ "containers"
+ ][0]["resources"]["limits"]["cpu"],
worker_extended_resources=worker_extended_resources,
namespace=rc["metadata"]["namespace"],
head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
@@ -907,10 +911,11 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
ray = RayCluster(
name=cluster.config.name,
status=cluster.status(print_to_console=False)[0],
- workers=cluster.config.num_workers,
+ num_workers=cluster.config.num_workers,
worker_mem_requests=cluster.config.worker_memory_requests,
worker_mem_limits=cluster.config.worker_memory_limits,
- worker_cpu=cluster.config.worker_cpu_requests,
+ worker_cpu_requests=cluster.config.worker_cpu_requests,
+ worker_cpu_limits=cluster.config.worker_cpu_limits,
worker_extended_resources=cluster.config.worker_extended_resource_requests,
namespace=cluster.config.namespace,
dashboard=cluster.cluster_dashboard_uri(),
diff --git a/src/codeflare_sdk/cluster/model.py b/src/codeflare_sdk/cluster/model.py
index ab7b30ede..44be54567 100644
--- a/src/codeflare_sdk/cluster/model.py
+++ b/src/codeflare_sdk/cluster/model.py
@@ -21,6 +21,7 @@
from dataclasses import dataclass, field
from enum import Enum
import typing
+from typing import Union
class RayClusterStatus(Enum):
@@ -77,10 +78,11 @@ class RayCluster:
head_cpu_limits: int
head_mem_requests: str
head_mem_limits: str
- workers: int
+ num_workers: int
worker_mem_requests: str
worker_mem_limits: str
- worker_cpu: int
+ worker_cpu_requests: Union[int, str]
+ worker_cpu_limits: Union[int, str]
namespace: str
dashboard: str
worker_extended_resources: typing.Dict[str, int] = field(default_factory=dict)
diff --git a/src/codeflare_sdk/cluster/widgets.py b/src/codeflare_sdk/cluster/widgets.py
index 351640e04..53afa28be 100644
--- a/src/codeflare_sdk/cluster/widgets.py
+++ b/src/codeflare_sdk/cluster/widgets.py
@@ -15,10 +15,21 @@
"""
The widgets sub-module contains the ui widgets created using the ipywidgets package.
"""
-import ipywidgets as widgets
-from IPython.display import display
+import contextlib
+import io
import os
+import warnings
+import time
import codeflare_sdk
+from kubernetes import client
+from kubernetes.client.rest import ApiException
+import ipywidgets as widgets
+from IPython.display import display, HTML, Javascript
+import pandas as pd
+from .config import ClusterConfiguration
+from .model import RayClusterStatus
+from ..utils.kube_api_helpers import _kube_api_error_handling
+from .auth import config_check, api_config_handler
def cluster_up_down_buttons(cluster: "codeflare_sdk.cluster.Cluster") -> widgets.Button:
@@ -89,3 +100,393 @@ def is_notebook() -> bool:
return True
else:
return False
+
+
+def view_clusters(namespace: str = None):
+    """
+    Render an interactive overview of the Ray clusters in a namespace.
+
+    Shows a toggle button per cluster, a details table for the selected cluster,
+    and buttons to delete it or open its Ray dashboards. Warns and returns outside
+    a notebook; prints a message and returns if the namespace has no clusters.
+    """
+    if not is_notebook():
+        warnings.warn(
+            "view_clusters can only be used in a Jupyter Notebook environment."
+        )
+        return
+
+    # Function-local import; the current namespace is used when none is given.
+    from .cluster import get_current_namespace
+
+    namespace = namespace or get_current_namespace()
+
+    user_output = widgets.Output()
+    raycluster_data_output = widgets.Output()
+    url_output = widgets.Output()
+
+    clusters_df = _fetch_cluster_data(namespace)
+    if clusters_df.empty:
+        print(f"No clusters found in the {namespace} namespace.")
+        return
+
+    cluster_names = clusters_df["Name"].tolist()
+    selector = widgets.ToggleButtons(
+        options=cluster_names,
+        value=cluster_names[0],
+        description="Select an existing cluster:",
+    )
+
+    def _show_selected(selection_change):
+        _on_cluster_click(
+            selection_change, raycluster_data_output, namespace, selector
+        )
+
+    # Render the details of the pre-selected cluster before any user interaction.
+    _show_selected({"new": selector.value})
+    selector.observe(_show_selected, names="value")
+
+    # Action buttons for the selected cluster.
+    delete_button = widgets.Button(
+        description="Delete Cluster",
+        icon="trash",
+        tooltip="Delete the selected cluster",
+    )
+    list_jobs_button = widgets.Button(
+        description="View Jobs", icon="suitcase", tooltip="Open the Ray Job Dashboard"
+    )
+    ray_dashboard_button = widgets.Button(
+        description="Open Ray Dashboard",
+        icon="dashboard",
+        tooltip="Open the Ray Dashboard in a new tab",
+        layout=widgets.Layout(width="auto"),
+    )
+
+    def _handle_delete(b):
+        _on_delete_button_click(
+            b,
+            selector,
+            clusters_df,
+            raycluster_data_output,
+            user_output,
+            delete_button,
+            list_jobs_button,
+            ray_dashboard_button,
+        )
+
+    def _handle_list_jobs(b):
+        _on_list_jobs_button_click(b, selector, clusters_df, user_output, url_output)
+
+    def _handle_dashboard(b):
+        _on_ray_dashboard_button_click(
+            b, selector, clusters_df, user_output, url_output
+        )
+
+    delete_button.on_click(_handle_delete)
+    list_jobs_button.on_click(_handle_list_jobs)
+    ray_dashboard_button.on_click(_handle_dashboard)
+
+    display(widgets.VBox([selector, raycluster_data_output]))
+    button_row = widgets.HBox([delete_button, list_jobs_button, ray_dashboard_button])
+    display(button_row, url_output, user_output)
+
+
+def _on_cluster_click(
+    selection_change,
+    raycluster_data_output: widgets.Output,
+    namespace: str,
+    classification_widget: widgets.ToggleButtons,
+):
+    """
+    Refresh the cluster list and show the spec table of the newly selected cluster.
+    Only the "new" entry of selection_change (the selected cluster name) is read.
+    """
+    displayed_columns = [
+        "Name",
+        "Namespace",
+        "Num Workers",
+        "Head GPUs",
+        "Head CPU Req~Lim",
+        "Head Memory Req~Lim",
+        "Worker GPUs",
+        "Worker CPU Req~Lim",
+        "Worker Memory Req~Lim",
+        "status",
+    ]
+    selected_name = selection_change["new"]
+    raycluster_data_output.clear_output()
+    clusters_df = _fetch_cluster_data(namespace)
+    classification_widget.options = clusters_df["Name"].tolist()
+    with raycluster_data_output:
+        selected_rows = clusters_df[clusters_df["Name"] == selected_name]
+        table_html = selected_rows[displayed_columns].to_html(
+            escape=False, index=False, border=2
+        )
+        display(HTML(table_html))
+
+
+def _on_delete_button_click(
+    b,
+    classification_widget: widgets.ToggleButtons,
+    ray_clusters_df: pd.DataFrame,
+    raycluster_data_output: widgets.Output,
+    user_output: widgets.Output,
+    delete_button: widgets.Button,
+    list_jobs_button: widgets.Button,
+    ray_dashboard_button: widgets.Button,
+):
+    """
+    Delete the cluster selected in the toggle buttons, then refresh the selector.
+    Closes the selector and all three buttons when no clusters remain afterwards.
+    """
+    cluster_name = classification_widget.value
+    matching_rows = ray_clusters_df[ray_clusters_df["Name"] == cluster_name]
+    namespace = matching_rows["Namespace"].values[0]
+    _delete_cluster(cluster_name, namespace)
+
+    with user_output:
+        user_output.clear_output()
+        print(
+            f"Cluster {cluster_name} in the {namespace} namespace was deleted successfully."
+        )
+
+    # Re-query the namespace so the selector reflects what is left.
+    remaining_df = _fetch_cluster_data(namespace)
+    if not remaining_df.empty:
+        classification_widget.options = remaining_df["Name"].tolist()
+        return
+
+    classification_widget.close()
+    delete_button.close()
+    list_jobs_button.close()
+    ray_dashboard_button.close()
+    with raycluster_data_output:
+        raycluster_data_output.clear_output()
+        print(f"No clusters found in the {namespace} namespace.")
+
+
+def _on_ray_dashboard_button_click(
+    b,
+    classification_widget: widgets.ToggleButtons,
+    ray_clusters_df: pd.DataFrame,
+    user_output: widgets.Output,
+    url_output: widgets.Output,
+):
+    """
+    Open the Ray Dashboard of the selected cluster in a new browser tab.
+    The dashboard URL is also printed to user_output.
+    """
+    from codeflare_sdk.cluster import Cluster
+
+    cluster_name = classification_widget.value
+    matching_rows = ray_clusters_df[ray_clusters_df["Name"] == cluster_name]
+    namespace = matching_rows["Namespace"].values[0]
+
+    # Build the Cluster object with its widgets and stdout/stderr output silenced.
+    with widgets.Output():
+        with contextlib.redirect_stdout(io.StringIO()):
+            with contextlib.redirect_stderr(io.StringIO()):
+                cluster = Cluster(ClusterConfiguration(cluster_name, namespace))
+    dashboard_url = cluster.cluster_dashboard_uri()
+
+    with user_output:
+        user_output.clear_output()
+        print(f"Opening Ray Dashboard for {cluster_name} cluster:\n{dashboard_url}")
+    with url_output:
+        display(Javascript(f'window.open("{dashboard_url}", "_blank");'))
+
+
+def _on_list_jobs_button_click(
+    b,
+    classification_widget: widgets.ToggleButtons,
+    ray_clusters_df: pd.DataFrame,
+    user_output: widgets.Output,
+    url_output: widgets.Output,
+):
+    """
+    Open the Ray Jobs Dashboard of the selected cluster in a new browser tab.
+
+    The jobs page is the cluster dashboard URI with "/#/jobs" appended.
+    """
+    from codeflare_sdk.cluster import Cluster
+
+    cluster_name = classification_widget.value
+    matching_rows = ray_clusters_df[ray_clusters_df["Name"] == cluster_name]
+    namespace = matching_rows["Namespace"].values[0]
+
+    # Build the Cluster object with its widgets and stdout/stderr output silenced.
+    with widgets.Output():
+        with contextlib.redirect_stdout(io.StringIO()):
+            with contextlib.redirect_stderr(io.StringIO()):
+                cluster = Cluster(ClusterConfiguration(cluster_name, namespace))
+    dashboard_url = cluster.cluster_dashboard_uri()
+    jobs_url = f"{dashboard_url}/#/jobs"
+
+    with user_output:
+        user_output.clear_output()
+        print(f"Opening Ray Jobs Dashboard for {cluster_name} cluster:\n{jobs_url}")
+    with url_output:
+        display(Javascript(f'window.open("{jobs_url}", "_blank");'))
+
+
+def _delete_cluster(
+    cluster_name: str,
+    namespace: str,
+    timeout: int = 5,
+    interval: int = 1,
+):
+    """
+    Delete the named cluster and wait until the API server no longer returns it.
+
+    The AppWrapper is deleted when _check_aw_exists reports one for the cluster,
+    otherwise the RayCluster custom resource is deleted directly.
+
+    Args:
+        cluster_name: Name of the cluster resource to delete.
+        namespace: Namespace the resource lives in.
+        timeout: Wait budget; reduced by interval after each unsuccessful poll.
+        interval: Seconds to sleep between two polls.
+
+    Returns:
+        None on success, otherwise the result of _kube_api_error_handling for
+        the caught exception (including the TimeoutError raised on expiry).
+    """
+    from .cluster import _check_aw_exists
+
+    try:
+        config_check()
+        api_instance = client.CustomObjectsApi(api_config_handler())
+
+        if _check_aw_exists(cluster_name, namespace):
+            group, version, plural = "workload.codeflare.dev", "v1beta2", "appwrappers"
+        else:
+            group, version, plural = "ray.io", "v1", "rayclusters"
+        # Same coordinates are used for the delete call and for every poll below.
+        resource = {
+            "group": group,
+            "version": version,
+            "namespace": namespace,
+            "plural": plural,
+            "name": cluster_name,
+        }
+        api_instance.delete_namespaced_custom_object(**resource)
+
+        # Wait for the resource to be deleted
+        while timeout > 0:
+            try:
+                api_instance.get_namespaced_custom_object(**resource)
+            except ApiException as e:
+                if e.status == 404:
+                    # Resource is deleted
+                    break
+                # Any other API failure is not retried: swallowing it here would
+                # spin forever, because nothing in this path consumes the timeout.
+                raise
+            # Resource still exists: wait, then retry until the budget runs out.
+            time.sleep(interval)
+            timeout -= interval
+            if timeout <= 0:
+                raise TimeoutError(
+                    f"Timeout waiting for {cluster_name} to be deleted."
+                )
+    except Exception as e:  # pragma: no cover
+        return _kube_api_error_handling(e)
+
+
+def _fetch_cluster_data(namespace):
+    """
+    Collect the specs of every Ray cluster in a namespace into a DataFrame.
+
+    Args:
+        namespace: Namespace passed on to list_all_clusters.
+
+    Returns:
+        An empty DataFrame (no columns) when no clusters are listed; otherwise
+        one row per cluster. Request/limit pairs are rendered as
+        "requests~limits" with falsy values shown as 0, and extended resources
+        as "name: amount" for the first entry only, or "0" when there are none.
+    """
+    from .cluster import list_all_clusters
+
+    rayclusters = list_all_clusters(namespace, False)
+    if not rayclusters:
+        return pd.DataFrame()
+
+    def _first_resource(extended_resources):
+        # Only the first extended resource of a group is displayed.
+        if not extended_resources:
+            return "0"
+        resource_name, amount = next(iter(extended_resources.items()))
+        return f"{resource_name}: {amount}"
+
+    def _req_lim(requests, limits):
+        # Falsy requests/limits (e.g. None) are displayed as 0.
+        return f"{requests or 0}~{limits or 0}"
+
+    # Column order is part of the contract: callers select and compare by it.
+    data = {
+        "Name": [item.name for item in rayclusters],
+        "Namespace": [item.namespace for item in rayclusters],
+        "Num Workers": [item.num_workers for item in rayclusters],
+        "Head GPUs": [
+            _first_resource(item.head_extended_resources) for item in rayclusters
+        ],
+        "Worker GPUs": [
+            _first_resource(item.worker_extended_resources) for item in rayclusters
+        ],
+        "Head CPU Req~Lim": [
+            _req_lim(item.head_cpu_requests, item.head_cpu_limits)
+            for item in rayclusters
+        ],
+        "Head Memory Req~Lim": [
+            _req_lim(item.head_mem_requests, item.head_mem_limits)
+            for item in rayclusters
+        ],
+        "Worker CPU Req~Lim": [
+            _req_lim(item.worker_cpu_requests, item.worker_cpu_limits)
+            for item in rayclusters
+        ],
+        "Worker Memory Req~Lim": [
+            _req_lim(item.worker_mem_requests, item.worker_mem_limits)
+            for item in rayclusters
+        ],
+        "status": [_format_status(item.status) for item in rayclusters],
+    }
+    return pd.DataFrame(data)
+
+
+def _format_status(status):
+    """
+    Map a RayClusterStatus member to its display label.
+
+    A status without an entry in the table is returned unchanged.
+    """
+    display_labels = {
+        RayClusterStatus.READY: 'Ready ✓',
+        RayClusterStatus.SUSPENDED: 'Suspended ❄️',
+        RayClusterStatus.FAILED: 'Failed ✗',
+        RayClusterStatus.UNHEALTHY: 'Unhealthy',
+        RayClusterStatus.UNKNOWN: 'Unknown',
+    }
+    if status in display_labels:
+        return display_labels[status]
+    return status
diff --git a/src/codeflare_sdk/utils/pretty_print.py b/src/codeflare_sdk/utils/pretty_print.py
index 4842c9cd2..303313199 100644
--- a/src/codeflare_sdk/utils/pretty_print.py
+++ b/src/codeflare_sdk/utils/pretty_print.py
@@ -135,9 +135,9 @@ def print_clusters(clusters: List[RayCluster]):
)
name = cluster.name
dashboard = cluster.dashboard
- workers = str(cluster.workers)
+ workers = str(cluster.num_workers)
memory = f"{cluster.worker_mem_requests}~{cluster.worker_mem_limits}"
- cpu = str(cluster.worker_cpu)
+ cpu = f"{cluster.worker_cpu_requests}~{cluster.worker_cpu_limits}"
gpu = str(cluster.worker_extended_resources.get("nvidia.com/gpu", 0))
#'table0' to display the cluster name, status, url, and dashboard link
diff --git a/tests/unit_test.py b/tests/unit_test.py
index 388723c50..ae2af6591 100644
--- a/tests/unit_test.py
+++ b/tests/unit_test.py
@@ -76,7 +76,9 @@
gen_names,
is_openshift_cluster,
)
-from codeflare_sdk.cluster.widgets import cluster_up_down_buttons
+
+import codeflare_sdk.cluster.widgets as cf_widgets
+import pandas as pd
import openshift
from openshift.selector import Selector
@@ -88,9 +90,6 @@
from ray.job_submission import JobSubmissionClient
from codeflare_sdk.job.ray_jobs import RayJobClient
-import ipywidgets as widgets
-from IPython.display import display
-
# For mocking openshift client results
fake_res = openshift.Result("fake")
@@ -941,10 +940,11 @@ def test_ray_details(mocker, capsys):
ray1 = RayCluster(
name="raytest1",
status=RayClusterStatus.READY,
- workers=1,
+ num_workers=1,
worker_mem_requests="2G",
worker_mem_limits="2G",
- worker_cpu=1,
+ worker_cpu_requests=1,
+ worker_cpu_limits=1,
namespace="ns",
dashboard="fake-uri",
head_cpu_requests=2,
@@ -979,10 +979,11 @@ def test_ray_details(mocker, capsys):
assert details == ray2
assert ray2.name == "raytest2"
assert ray1.namespace == ray2.namespace
- assert ray1.workers == ray2.workers
+ assert ray1.num_workers == ray2.num_workers
assert ray1.worker_mem_requests == ray2.worker_mem_requests
assert ray1.worker_mem_limits == ray2.worker_mem_limits
- assert ray1.worker_cpu == ray2.worker_cpu
+ assert ray1.worker_cpu_requests == ray2.worker_cpu_requests
+ assert ray1.worker_cpu_limits == ray2.worker_cpu_limits
assert ray1.worker_extended_resources == ray2.worker_extended_resources
try:
print_clusters([ray1, ray2])
@@ -1006,7 +1007,7 @@ def test_ray_details(mocker, capsys):
" │ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n"
" │ │ # Workers │ │ Memory CPU GPU │ │ \n"
" │ │ │ │ │ │ \n"
- " │ │ 1 │ │ 2G~2G 1 0 │ │ \n"
+ " │ │ 1 │ │ 2G~2G 1~1 0 │ │ \n"
" │ │ │ │ │ │ \n"
" │ ╰─────────────╯ ╰──────────────────────────────────────╯ │ \n"
" ╰───────────────────────────────────────────────────────────────╯ \n"
@@ -1024,7 +1025,7 @@ def test_ray_details(mocker, capsys):
" │ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n"
" │ │ # Workers │ │ Memory CPU GPU │ │ \n"
" │ │ │ │ │ │ \n"
- " │ │ 1 │ │ 2G~2G 1 0 │ │ \n"
+ " │ │ 1 │ │ 2G~2G 1~1 0 │ │ \n"
" │ │ │ │ │ │ \n"
" │ ╰─────────────╯ ╰──────────────────────────────────────╯ │ \n"
" ╰───────────────────────────────────────────────────────────────╯ \n"
@@ -1040,7 +1041,7 @@ def test_ray_details(mocker, capsys):
"│ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │\n"
"│ │ # Workers │ │ Memory CPU GPU │ │\n"
"│ │ │ │ │ │\n"
- "│ │ 1 │ │ 2G~2G 1 0 │ │\n"
+ "│ │ 1 │ │ 2G~2G 1~1 0 │ │\n"
"│ │ │ │ │ │\n"
"│ ╰─────────────╯ ╰──────────────────────────────────────╯ │\n"
"╰───────────────────────────────────────────────────────────────╯\n"
@@ -2245,7 +2246,7 @@ def test_list_clusters(mocker, capsys):
" │ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n"
" │ │ # Workers │ │ Memory CPU GPU │ │ \n"
" │ │ │ │ │ │ \n"
- " │ │ 1 │ │ 2G~2G 1 0 │ │ \n"
+ " │ │ 1 │ │ 2G~2G 1~1 0 │ │ \n"
" │ │ │ │ │ │ \n"
" │ ╰─────────────╯ ╰──────────────────────────────────────╯ │ \n"
" ╰───────────────────────────────────────────────────────────────╯ \n"
@@ -2261,7 +2262,7 @@ def test_list_clusters(mocker, capsys):
"│ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │\n"
"│ │ # Workers │ │ Memory CPU GPU │ │\n"
"│ │ │ │ │ │\n"
- "│ │ 1 │ │ 2G~2G 1 0 │ │\n"
+ "│ │ 1 │ │ 2G~2G 1~1 0 │ │\n"
"│ │ │ │ │ │\n"
"│ ╰─────────────╯ ╰──────────────────────────────────────╯ │\n"
"╰───────────────────────────────────────────────────────────────╯\n"
@@ -2357,10 +2358,11 @@ def test_cluster_status(mocker):
fake_ray = RayCluster(
name="test",
status=RayClusterStatus.UNKNOWN,
- workers=1,
+ num_workers=1,
worker_mem_requests=2,
worker_mem_limits=2,
- worker_cpu=1,
+ worker_cpu_requests=1,
+ worker_cpu_limits=1,
namespace="ns",
dashboard="fake-uri",
head_cpu_requests=2,
@@ -2922,7 +2924,7 @@ def test_cluster_up_down_buttons(mocker):
MockButton.side_effect = [mock_up_button, mock_down_button]
# Call the method under test
- cluster_up_down_buttons(cluster)
+ cf_widgets.cluster_up_down_buttons(cluster)
# Simulate checkbox being checked or unchecked
mock_wait_ready_check_box.value = True # Simulate checkbox being checked
@@ -2955,6 +2957,247 @@ def test_is_notebook_true():
assert is_notebook() is True
+def test_view_clusters(mocker, capsys):
+    """
+    Exercise view_clusters and its click handlers against mocked widgets.
+
+    Covers the non-notebook warning, the empty-namespace message, and the
+    list-jobs, dashboard and delete handlers with a one-cluster DataFrame.
+    """
+    from kubernetes.client.rest import ApiException
+
+    mocker.patch("codeflare_sdk.cluster.widgets.is_notebook", return_value=False)
+    with pytest.warns(
+        UserWarning,
+        match="view_clusters can only be used in a Jupyter Notebook environment.",
+    ):
+        result = cf_widgets.view_clusters(namespace="default")
+        # Assert the function returns None when not in a notebook environment
+        assert result is None
+
+    mocker.patch("codeflare_sdk.cluster.widgets.is_notebook", return_value=True)
+
+    # Mock Kubernetes API responses
+    mocker.patch("kubernetes.client.ApisApi.get_api_versions")
+    mocker.patch(
+        "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
+        return_value={"items": []},
+    )
+    mocker.patch("codeflare_sdk.cluster.cluster._check_aw_exists", return_value=False)
+
+    # Return empty dataframe when no clusters are found
+    mocker.patch("codeflare_sdk.cluster.cluster.list_all_clusters", return_value=[])
+    mocker.patch(
+        "codeflare_sdk.cluster.cluster.get_current_namespace",
+        return_value="default",
+    )
+    df = cf_widgets._fetch_cluster_data(namespace="default")
+    assert df.empty
+
+    # Capture this call's result so the assertion below checks the empty-namespace
+    # path rather than the value left over from the non-notebook call above.
+    result = cf_widgets.view_clusters()
+    captured = capsys.readouterr()
+    assert "No clusters found in the default namespace." in captured.out
+
+    # Assert the function returns None
+    assert result is None
+
+    test_df = pd.DataFrame(
+        {
+            "Name": ["test-cluster"],
+            "Namespace": ["default"],
+            "Num Workers": ["1"],
+            "Head GPUs": ["0"],
+            "Worker GPUs": ["0"],
+            "Head CPU Req~Lim": ["1~1"],
+            "Head Memory Req~Lim": ["1Gi~1Gi"],
+            "Worker CPU Req~Lim": ["1~1"],
+            "Worker Memory Req~Lim": ["1Gi~1Gi"],
+            "status": ['Ready ✓'],
+        }
+    )
+
+    # Mock the _fetch_cluster_data function to return a test DataFrame
+    mocker.patch(
+        "codeflare_sdk.cluster.widgets._fetch_cluster_data", return_value=test_df
+    )
+
+    # Mock the Cluster class and related methods
+    mocker.patch("codeflare_sdk.cluster.Cluster")
+    mocker.patch("codeflare_sdk.cluster.ClusterConfiguration")
+
+    with patch("ipywidgets.ToggleButtons") as MockToggleButtons, patch(
+        "ipywidgets.Button"
+    ) as MockButton, patch("ipywidgets.Output") as MockOutput, patch(
+        "ipywidgets.HBox"
+    ), patch(
+        "ipywidgets.VBox"
+    ), patch(
+        "IPython.display.display"
+    ) as mock_display, patch(
+        "IPython.display.HTML"
+    ), patch(
+        "codeflare_sdk.cluster.widgets.Javascript"
+    ) as mock_javascript:
+        # Create mock widget instances
+        mock_toggle = MagicMock()
+        mock_delete_button = MagicMock()
+        mock_list_jobs_button = MagicMock()
+        mock_ray_dashboard_button = MagicMock()
+        mock_output = MagicMock()
+
+        # Set the return values for the mocked widgets
+        MockToggleButtons.return_value = mock_toggle
+        MockButton.side_effect = [
+            mock_delete_button,
+            mock_list_jobs_button,
+            mock_ray_dashboard_button,
+        ]
+        MockOutput.return_value = mock_output
+
+        # Call the function under test
+        cf_widgets.view_clusters()
+
+        # Simulate selecting a cluster
+        mock_toggle.value = "test-cluster"
+        selection_change = {"new": "test-cluster"}
+        cf_widgets._on_cluster_click(
+            selection_change, mock_output, "default", mock_toggle
+        )
+
+        # Assert that the selection observer was registered
+        mock_toggle.observe.assert_called()
+
+        # Simulate clicking the list jobs button
+        cf_widgets._on_list_jobs_button_click(
+            None, mock_toggle, test_df, mock_output, mock_output
+        )
+        mock_javascript.assert_called_once()
+
+        # Simulate clicking the Ray dashboard button
+        cf_widgets._on_ray_dashboard_button_click(
+            None, mock_toggle, test_df, mock_output, mock_output
+        )
+        # Must be an assertion: assigning to call_count would only overwrite it.
+        assert mock_javascript.call_count == 2
+
+        mocker.patch(
+            "kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object",
+        )
+        # A 404 from the follow-up GET tells _delete_cluster the resource is gone.
+        mock_response = mocker.MagicMock()
+        mock_response.status = 404
+        mock_exception = ApiException(http_resp=mock_response)
+        mocker.patch(
+            "kubernetes.client.CustomObjectsApi.get_namespaced_custom_object",
+            side_effect=mock_exception,
+        )
+
+        # Simulate clicking the delete button
+        cf_widgets._on_delete_button_click(
+            None,
+            mock_toggle,
+            test_df,
+            mock_output,
+            mock_output,
+            mock_delete_button,
+            mock_list_jobs_button,
+            mock_ray_dashboard_button,
+        )
+        # Only the three buttons built by view_clusters should have been created.
+        assert MockButton.call_count == 3
+
+
+def test_fetch_cluster_data(mocker):
+    """
+    _fetch_cluster_data yields an empty frame without clusters and one formatted
+    row per cluster otherwise.
+    """
+    list_clusters_target = "codeflare_sdk.cluster.cluster.list_all_clusters"
+
+    # Return empty dataframe when no clusters are found
+    mocker.patch(list_clusters_target, return_value=[])
+    assert cf_widgets._fetch_cluster_data(namespace="default").empty
+
+    def _fake_raycluster(**attributes):
+        # Attributes are assigned after construction, one by one, exactly as a
+        # sequence of plain attribute assignments on the mock would do.
+        fake = MagicMock(spec=RayCluster)
+        for attribute, value in attributes.items():
+            setattr(fake, attribute, value)
+        return fake
+
+    # Cluster with every resource field populated
+    populated_cluster = _fake_raycluster(
+        name="test-cluster-1",
+        namespace="default",
+        num_workers=1,
+        head_extended_resources={"nvidia.com/gpu": "1"},
+        worker_extended_resources={"nvidia.com/gpu": "2"},
+        head_cpu_requests="500m",
+        head_cpu_limits="1000m",
+        head_mem_requests="1Gi",
+        head_mem_limits="2Gi",
+        worker_cpu_requests="1000m",
+        worker_cpu_limits="2000m",
+        worker_mem_requests="2Gi",
+        worker_mem_limits="4Gi",
+        status=RayClusterStatus.READY,
+    )
+    # Cluster without extended resources and with all requests/limits unset
+    bare_cluster = _fake_raycluster(
+        name="test-cluster-2",
+        namespace="default",
+        num_workers=2,
+        head_extended_resources={},
+        worker_extended_resources={},
+        head_cpu_requests=None,
+        head_cpu_limits=None,
+        head_mem_requests=None,
+        head_mem_limits=None,
+        worker_cpu_requests=None,
+        worker_cpu_limits=None,
+        worker_mem_requests=None,
+        worker_mem_limits=None,
+        status=RayClusterStatus.SUSPENDED,
+    )
+
+    with patch(list_clusters_target, return_value=[populated_cluster, bare_cluster]):
+        # Call the function under test
+        df = cf_widgets._fetch_cluster_data(namespace="default")
+
+    # Expected DataFrame
+    expected_df = pd.DataFrame(
+        {
+            "Name": ["test-cluster-1", "test-cluster-2"],
+            "Namespace": ["default", "default"],
+            "Num Workers": [1, 2],
+            "Head GPUs": ["nvidia.com/gpu: 1", "0"],
+            "Worker GPUs": ["nvidia.com/gpu: 2", "0"],
+            "Head CPU Req~Lim": ["500m~1000m", "0~0"],
+            "Head Memory Req~Lim": ["1Gi~2Gi", "0~0"],
+            "Worker CPU Req~Lim": ["1000m~2000m", "0~0"],
+            "Worker Memory Req~Lim": ["2Gi~4Gi", "0~0"],
+            "status": ['Ready ✓', 'Suspended ❄️'],
+        }
+    )
+
+    # Assert that the DataFrame matches expected
+    pd.testing.assert_frame_equal(
+        df.reset_index(drop=True), expected_df.reset_index(drop=True)
+    )
+
+
+def test_format_status():
+    """
+    _format_status returns the display label for each mapped status and passes
+    any other value through unchanged.
+    """
+    # Test each possible status
+    expected_labels = {
+        RayClusterStatus.READY: 'Ready ✓',
+        RayClusterStatus.SUSPENDED: 'Suspended ❄️',
+        RayClusterStatus.FAILED: 'Failed ✗',
+        RayClusterStatus.UNHEALTHY: 'Unhealthy',
+        RayClusterStatus.UNKNOWN: 'Unknown',
+    }
+    for status, expected_output in expected_labels.items():
+        formatted = cf_widgets._format_status(status)
+        assert formatted == expected_output, f"Failed for status: {status}"
+
+    # Test an unrecognized status
+    unrecognized_status = "NotAStatus"
+    passthrough = cf_widgets._format_status(unrecognized_status)
+    assert passthrough == "NotAStatus", "Failed for unrecognized status"
+
+
# Make sure to always keep this function last
def test_cleanup():
os.remove(f"{aw_dir}unit-test-no-kueue.yaml")
diff --git a/ui-tests/tests/widget_notebook_example.test.ts b/ui-tests/tests/widget_notebook_example.test.ts
index 798c2eb60..823a73f47 100644
--- a/ui-tests/tests/widget_notebook_example.test.ts
+++ b/ui-tests/tests/widget_notebook_example.test.ts
@@ -30,11 +30,16 @@ test.describe("Visual Regression", () => {
tmpPath,
}) => {
const notebook = "3_widget_example.ipynb";
+ const namespace = 'default';
await page.notebook.openByPath(`${tmpPath}/${notebook}`);
await page.notebook.activate(notebook);
+ // Hide the cell toolbar before capturing the screenshots
+ await page.addStyleTag({ content: '.jp-cell-toolbar { display: none !important; }' });
+
const captures: (Buffer | null)[] = []; // Array to store cell screenshots
const cellCount = await page.notebook.getCellCount();
+ console.log(`Cell count: ${cellCount}`);
// Run all cells and capture their screenshots
await page.notebook.runCellByCell({
@@ -43,7 +48,6 @@ test.describe("Visual Regression", () => {
if (cell && (await cell.isVisible())) {
captures[cellIndex] = await cell.screenshot(); // Save the screenshot by cell index
}
- await page.addStyleTag({ content: '.jp-cell-toolbar { display: none !important; }' });
},
});
@@ -59,25 +63,27 @@ test.describe("Visual Regression", () => {
}
}
- const widgetCellIndex = 3;
+ // At this point, all cells have been run, and their screenshots have been captured.
+ // We now interact with the widgets in the notebook.
+ const upDownWidgetCellIndex = 3; // 4 on OpenShift
- await waitForWidget(page, widgetCellIndex, 'input[type="checkbox"]');
- await waitForWidget(page, widgetCellIndex, 'button:has-text("Cluster Down")');
- await waitForWidget(page, widgetCellIndex, 'button:has-text("Cluster Up")');
+ await waitForWidget(page, upDownWidgetCellIndex, 'input[type="checkbox"]');
+ await waitForWidget(page, upDownWidgetCellIndex, 'button:has-text("Cluster Down")');
+ await waitForWidget(page, upDownWidgetCellIndex, 'button:has-text("Cluster Up")');
- await interactWithWidget(page, widgetCellIndex, 'input[type="checkbox"]', async (checkbox) => {
+ await interactWithWidget(page, upDownWidgetCellIndex, 'input[type="checkbox"]', async (checkbox) => {
await checkbox.click();
const isChecked = await checkbox.isChecked();
expect(isChecked).toBe(true);
});
- await interactWithWidget(page, widgetCellIndex, 'button:has-text("Cluster Down")', async (button) => {
+ await interactWithWidget(page, upDownWidgetCellIndex, 'button:has-text("Cluster Down")', async (button) => {
await button.click();
const clusterDownMessage = await page.waitForSelector('text=No instances found, nothing to be done.', { timeout: 5000 });
expect(clusterDownMessage).not.toBeNull();
});
- await interactWithWidget(page, widgetCellIndex, 'button:has-text("Cluster Up")', async (button) => {
+ await interactWithWidget(page, upDownWidgetCellIndex, 'button:has-text("Cluster Up")', async (button) => {
await button.click();
const successMessage = await page.waitForSelector('text=Ray Cluster: \'raytest\' has successfully been created', { timeout: 10000 });
@@ -95,13 +101,51 @@ test.describe("Visual Regression", () => {
await runPreviousCell(page, cellCount, '(, True)');
- await interactWithWidget(page, widgetCellIndex, 'button:has-text("Cluster Down")', async (button) => {
+ await interactWithWidget(page, upDownWidgetCellIndex, 'button:has-text("Cluster Down")', async (button) => {
await button.click();
const clusterDownMessage = await page.waitForSelector('text=Ray Cluster: \'raytest\' has successfully been deleted', { timeout: 5000 });
expect(clusterDownMessage).not.toBeNull();
});
await runPreviousCell(page, cellCount, '(, False)');
+
+ // view_clusters table with buttons
+ await interactWithWidget(page, upDownWidgetCellIndex, 'input[type="checkbox"]', async (checkbox) => {
+ await checkbox.click();
+ const isChecked = await checkbox.isChecked();
+ expect(isChecked).toBe(false);
+ });
+
+ await interactWithWidget(page, upDownWidgetCellIndex, 'button:has-text("Cluster Up")', async (button) => {
+ await button.click();
+ const successMessage = await page.waitForSelector('text=Ray Cluster: \'raytest\' has successfully been created', { timeout: 10000 });
+ expect(successMessage).not.toBeNull();
+ });
+
+ const viewClustersCellIndex = 4; // 5 on OpenShift
+ await page.notebook.runCell(cellCount - 2, true);
+ await interactWithWidget(page, viewClustersCellIndex, 'button:has-text("Open Ray Dashboard")', async (button) => {
+ await button.click();
+ const successMessage = await page.waitForSelector('text=Opening Ray Dashboard for raytest cluster', { timeout: 5000 });
+ expect(successMessage).not.toBeNull();
+ });
+
+ await interactWithWidget(page, viewClustersCellIndex, 'button:has-text("View Jobs")', async (button) => {
+ await button.click();
+ const successMessage = await page.waitForSelector('text=Opening Ray Jobs Dashboard for raytest cluster', { timeout: 5000 });
+ expect(successMessage).not.toBeNull();
+ });
+
+ await interactWithWidget(page, viewClustersCellIndex, 'button:has-text("Delete Cluster")', async (button) => {
+ await button.click();
+
+ const noClustersMessage = await page.waitForSelector(`text=No clusters found in the ${namespace} namespace.`, { timeout: 5000 });
+ expect(noClustersMessage).not.toBeNull();
+ const successMessage = await page.waitForSelector(`text=Cluster raytest in the ${namespace} namespace was deleted successfully.`, { timeout: 5000 });
+ expect(successMessage).not.toBeNull();
+ });
+
+ await runPreviousCell(page, cellCount, '(, False)');
});
});
diff --git a/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-4-linux.png b/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-4-linux.png
index 9d881da28..691e7124f 100644
Binary files a/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-4-linux.png and b/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-4-linux.png differ
diff --git a/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-5-linux.png b/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-5-linux.png
new file mode 100644
index 000000000..46861fd8a
Binary files /dev/null and b/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-5-linux.png differ