diff --git a/.github/workflows/ui_notebooks_test.yaml b/.github/workflows/ui_notebooks_test.yaml index 864330b9c..5e8d506d1 100644 --- a/.github/workflows/ui_notebooks_test.yaml +++ b/.github/workflows/ui_notebooks_test.yaml @@ -86,7 +86,8 @@ jobs: jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 3_widget_example.ipynb > 3_widget_example.ipynb.tmp && mv 3_widget_example.ipynb.tmp 3_widget_example.ipynb jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 3_widget_example.ipynb > 3_widget_example.ipynb.tmp && mv 3_widget_example.ipynb.tmp 3_widget_example.ipynb # Set explicit namespace as SDK need it (currently) to resolve local queues - sed -i "s/head_memory_limits=2,/head_memory_limits=2, namespace='default',/" 3_widget_example.ipynb + sed -i "s|head_memory_limits=2,|head_memory_limits=2, namespace='default', image='quay.io/modh/ray:2.35.0-py39-cu121',|" 3_widget_example.ipynb + sed -i "s|view_clusters()|view_clusters('default')|" 3_widget_example.ipynb working-directory: demo-notebooks/guided-demos - name: Run UI notebook tests diff --git a/demo-notebooks/guided-demos/3_widget_example.ipynb b/demo-notebooks/guided-demos/3_widget_example.ipynb index 4d3d6ea70..11521ec72 100644 --- a/demo-notebooks/guided-demos/3_widget_example.ipynb +++ b/demo-notebooks/guided-demos/3_widget_example.ipynb @@ -19,7 +19,7 @@ "outputs": [], "source": [ "# Import pieces from codeflare-sdk\n", - "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication" + "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication, view_clusters" ] }, { @@ -61,7 +61,7 @@ "# Create and configure our cluster object\n", "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", "cluster = Cluster(ClusterConfiguration(\n", - " name='raytest', \n", + " name='raytest',\n", 
" head_cpu_requests='500m',\n", " head_cpu_limits='500m',\n", " head_memory_requests=2,\n", @@ -73,12 +73,22 @@ " worker_cpu_limits=1,\n", " worker_memory_requests=2,\n", " worker_memory_limits=2,\n", - " # image=\"\", # Optional Field \n", - " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", + " # image=\"\", # Optional Field\n", + " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources\n", " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", "))" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "3de6403c", + "metadata": {}, + "outputs": [], + "source": [ + "view_clusters()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -106,7 +116,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" + "version": "3.9.18" }, "vscode": { "interpreter": { diff --git a/src/codeflare_sdk.egg-info/SOURCES.txt b/src/codeflare_sdk.egg-info/SOURCES.txt index 42541f1d2..63614a814 100644 --- a/src/codeflare_sdk.egg-info/SOURCES.txt +++ b/src/codeflare_sdk.egg-info/SOURCES.txt @@ -12,9 +12,11 @@ src/codeflare_sdk/cluster/awload.py src/codeflare_sdk/cluster/cluster.py src/codeflare_sdk/cluster/config.py src/codeflare_sdk/cluster/model.py +src/codeflare_sdk/cluster/widgets.py src/codeflare_sdk/job/__init__.py src/codeflare_sdk/job/ray_jobs.py src/codeflare_sdk/utils/__init__.py +src/codeflare_sdk/utils/demos.py src/codeflare_sdk/utils/generate_cert.py src/codeflare_sdk/utils/generate_yaml.py src/codeflare_sdk/utils/kube_api_helpers.py diff --git a/src/codeflare_sdk/__init__.py b/src/codeflare_sdk/__init__.py index 0390a3d2f..29205a36e 100644 --- a/src/codeflare_sdk/__init__.py +++ b/src/codeflare_sdk/__init__.py @@ -14,6 +14,7 @@ get_cluster, list_all_queued, list_all_clusters, + view_clusters, ) from .job import RayJobClient diff --git a/src/codeflare_sdk/cluster/__init__.py 
b/src/codeflare_sdk/cluster/__init__.py index 0b1849e51..6490a2247 100644 --- a/src/codeflare_sdk/cluster/__init__.py +++ b/src/codeflare_sdk/cluster/__init__.py @@ -21,4 +21,8 @@ list_all_clusters, ) +from .widgets import ( + view_clusters, +) + from .awload import AWManager diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 7c652a186..a32d5a4b7 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -19,6 +19,7 @@ """ import re +import subprocess from time import sleep from typing import List, Optional, Tuple, Dict @@ -862,16 +863,19 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]: name=rc["metadata"]["name"], status=status, # for now we are not using autoscaling so same replicas is fine - workers=rc["spec"]["workerGroupSpecs"][0]["replicas"], + num_workers=rc["spec"]["workerGroupSpecs"][0]["replicas"], worker_mem_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["limits"]["memory"], worker_mem_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["requests"]["memory"], - worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ - 0 - ]["resources"]["limits"]["cpu"], + worker_cpu_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["requests"]["cpu"], + worker_cpu_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"]["cpu"], worker_extended_resources=worker_extended_resources, namespace=rc["metadata"]["namespace"], head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][ @@ -907,10 +911,11 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster: ray = RayCluster( name=cluster.config.name, status=cluster.status(print_to_console=False)[0], - workers=cluster.config.num_workers, + num_workers=cluster.config.num_workers, 
worker_mem_requests=cluster.config.worker_memory_requests, worker_mem_limits=cluster.config.worker_memory_limits, - worker_cpu=cluster.config.worker_cpu_requests, + worker_cpu_requests=cluster.config.worker_cpu_requests, + worker_cpu_limits=cluster.config.worker_cpu_limits, worker_extended_resources=cluster.config.worker_extended_resource_requests, namespace=cluster.config.namespace, dashboard=cluster.cluster_dashboard_uri(), diff --git a/src/codeflare_sdk/cluster/model.py b/src/codeflare_sdk/cluster/model.py index ab7b30ede..44be54567 100644 --- a/src/codeflare_sdk/cluster/model.py +++ b/src/codeflare_sdk/cluster/model.py @@ -21,6 +21,7 @@ from dataclasses import dataclass, field from enum import Enum import typing +from typing import Union class RayClusterStatus(Enum): @@ -77,10 +78,11 @@ class RayCluster: head_cpu_limits: int head_mem_requests: str head_mem_limits: str - workers: int + num_workers: int worker_mem_requests: str worker_mem_limits: str - worker_cpu: int + worker_cpu_requests: Union[int, str] + worker_cpu_limits: Union[int, str] namespace: str dashboard: str worker_extended_resources: typing.Dict[str, int] = field(default_factory=dict) diff --git a/src/codeflare_sdk/cluster/widgets.py b/src/codeflare_sdk/cluster/widgets.py index 351640e04..53afa28be 100644 --- a/src/codeflare_sdk/cluster/widgets.py +++ b/src/codeflare_sdk/cluster/widgets.py @@ -15,10 +15,21 @@ """ The widgets sub-module contains the ui widgets created using the ipywidgets package. 
""" -import ipywidgets as widgets -from IPython.display import display +import contextlib +import io import os +import warnings +import time import codeflare_sdk +from kubernetes import client +from kubernetes.client.rest import ApiException +import ipywidgets as widgets +from IPython.display import display, HTML, Javascript +import pandas as pd +from .config import ClusterConfiguration +from .model import RayClusterStatus +from ..utils.kube_api_helpers import _kube_api_error_handling +from .auth import config_check, api_config_handler def cluster_up_down_buttons(cluster: "codeflare_sdk.cluster.Cluster") -> widgets.Button: @@ -89,3 +100,393 @@ def is_notebook() -> bool: return True else: return False + + +def view_clusters(namespace: str = None): + """ + view_clusters function will display existing clusters with their specs, and handle user interactions. + """ + if not is_notebook(): + warnings.warn( + "view_clusters can only be used in a Jupyter Notebook environment." + ) + return # Exit function if not in Jupyter Notebook + + from .cluster import get_current_namespace + + if not namespace: + namespace = get_current_namespace() + + user_output = widgets.Output() + raycluster_data_output = widgets.Output() + url_output = widgets.Output() + + ray_clusters_df = _fetch_cluster_data(namespace) + if ray_clusters_df.empty: + print(f"No clusters found in the {namespace} namespace.") + return + + classification_widget = widgets.ToggleButtons( + options=ray_clusters_df["Name"].tolist(), + value=ray_clusters_df["Name"].tolist()[0], + description="Select an existing cluster:", + ) + # Setting the initial value to trigger the event handler to display the cluster details. 
def _on_cluster_click(
    selection_change,
    raycluster_data_output: widgets.Output,
    namespace: str,
    classification_widget: widgets.ToggleButtons,
):
    """
    _on_cluster_click handles the event when a cluster is selected from the toggle buttons, updating the output with cluster details.

    Args:
        selection_change: Change payload from the toggle buttons; only its "new"
            entry (the name of the newly selected cluster) is read.
        raycluster_data_output: Output widget that is cleared and then filled with
            the details table of the selected cluster.
        namespace: Namespace whose clusters are re-fetched on every selection.
        classification_widget: Toggle buttons whose options are refreshed from the
            freshly fetched cluster names.
    """
    # Columns shown in the details table, in display order.
    displayed_columns = [
        "Name",
        "Namespace",
        "Num Workers",
        "Head GPUs",
        "Head CPU Req~Lim",
        "Head Memory Req~Lim",
        "Worker GPUs",
        "Worker CPU Req~Lim",
        "Worker Memory Req~Lim",
        "status",
    ]

    new_value = selection_change["new"]
    raycluster_data_output.clear_output()
    ray_clusters_df = _fetch_cluster_data(namespace)

    # _fetch_cluster_data returns a column-less DataFrame when no clusters are
    # left, so indexing "Name" below would raise KeyError. Report it instead.
    if ray_clusters_df.empty:
        with raycluster_data_output:
            print(f"No clusters found in the {namespace} namespace.")
        return

    # Keep the toggle buttons in sync with the clusters that currently exist.
    classification_widget.options = ray_clusters_df["Name"].tolist()

    selected_rows = ray_clusters_df[ray_clusters_df["Name"] == new_value]
    with raycluster_data_output:
        display(
            HTML(
                selected_rows[displayed_columns].to_html(
                    escape=False, index=False, border=2
                )
            )
        )
def _on_ray_dashboard_button_click(
    b,
    classification_widget: widgets.ToggleButtons,
    ray_clusters_df: pd.DataFrame,
    user_output: widgets.Output,
    url_output: widgets.Output,
):
    """
    _on_ray_dashboard_button_click handles the event when the Open Ray Dashboard button is clicked, opening the Ray Dashboard in a new tab

    Args:
        b: The clicked button (unused).
        classification_widget: Toggle buttons whose current value is the name of
            the selected cluster.
        ray_clusters_df: DataFrame used to look up the namespace of the selected
            cluster via its "Name" and "Namespace" columns.
        user_output: Output widget that receives the human-readable message.
        url_output: Output widget that receives the Javascript opening the tab.
    """
    import json

    from codeflare_sdk.cluster import Cluster

    cluster_name = classification_widget.value
    matching_namespaces = ray_clusters_df[ray_clusters_df["Name"] == cluster_name][
        "Namespace"
    ].values

    # The DataFrame handed to this handler may be older than the toggle options,
    # so the selected name can be missing; indexing [0] would raise IndexError.
    if len(matching_namespaces) == 0:
        with user_output:
            user_output.clear_output()
            print(
                f"Cluster {cluster_name} was not found in the loaded cluster list. "
                "Re-run view_clusters() to refresh it."
            )
        return
    namespace = matching_namespaces[0]

    # Suppress from Cluster Object initialisation widgets and outputs
    with widgets.Output(), contextlib.redirect_stdout(
        io.StringIO()
    ), contextlib.redirect_stderr(io.StringIO()):
        cluster = Cluster(ClusterConfiguration(cluster_name, namespace))
    dashboard_url = cluster.cluster_dashboard_uri()

    with user_output:
        user_output.clear_output()
        print(f"Opening Ray Dashboard for {cluster_name} cluster:\n{dashboard_url}")
    with url_output:
        # json.dumps yields a correctly quoted/escaped JavaScript string literal,
        # so quotes or backslashes in the URL cannot break out of the call.
        display(
            Javascript(f'window.open({json.dumps(str(dashboard_url))}, "_blank");')
        )
def _delete_cluster(
    cluster_name: str,
    namespace: str,
    timeout: int = 5,
    interval: int = 1,
):
    """
    _delete_cluster function deletes the cluster with the given name and namespace,
    then polls until the resource is gone or the timeout is exhausted.

    The AppWrapper is deleted when one exists for the cluster, otherwise the
    RayCluster custom resource is deleted.

    Args:
        cluster_name: Name of the custom resource to delete.
        namespace: Namespace the resource lives in.
        timeout: Total time budget, in seconds, for the resource to disappear.
        interval: Seconds to sleep between two existence checks.

    Returns:
        None on success. Any exception raised along the way (including the
        TimeoutError raised when the resource is still present after `timeout`)
        is passed to _kube_api_error_handling and its result is returned.
    """
    from .cluster import _check_aw_exists

    try:
        config_check()
        api_instance = client.CustomObjectsApi(api_config_handler())

        # Pick the resource kind once so delete and poll target the same object.
        if _check_aw_exists(cluster_name, namespace):
            group, version, plural = "workload.codeflare.dev", "v1beta2", "appwrappers"
        else:
            group, version, plural = "ray.io", "v1", "rayclusters"

        api_instance.delete_namespaced_custom_object(
            group=group,
            version=version,
            namespace=namespace,
            plural=plural,
            name=cluster_name,
        )

        # Wait for the resource to be deleted
        while timeout > 0:
            try:
                api_instance.get_namespaced_custom_object(
                    group=group,
                    version=version,
                    namespace=namespace,
                    plural=plural,
                    name=cluster_name,
                )
            except ApiException as e:
                # Resource is deleted
                if e.status == 404:
                    break
                # Any other API failure must surface: swallowing it here would
                # spin forever because the timeout is never decremented.
                raise

            # Retry if resource still exists
            time.sleep(interval)
            timeout -= interval
            if timeout <= 0:
                raise TimeoutError(
                    f"Timeout waiting for {cluster_name} to be deleted."
                )
    except Exception as e:  # pragma: no cover
        return _kube_api_error_handling(e)
def _format_status(status):
    """
    _format_status function formats the status enum.

    Known RayClusterStatus members are translated to their display label; any
    other value is handed back unchanged.
    """
    display_labels = {
        RayClusterStatus.READY: 'Ready ✓',
        RayClusterStatus.SUSPENDED: 'Suspended ❄️',
        RayClusterStatus.FAILED: 'Failed ✗',
        RayClusterStatus.UNHEALTHY: 'Unhealthy',
        RayClusterStatus.UNKNOWN: 'Unknown',
    }
    try:
        return display_labels[status]
    except KeyError:
        # Not a status we have a label for: pass it through as-is.
        return status
a/tests/unit_test.py b/tests/unit_test.py index 388723c50..ae2af6591 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -76,7 +76,9 @@ gen_names, is_openshift_cluster, ) -from codeflare_sdk.cluster.widgets import cluster_up_down_buttons + +import codeflare_sdk.cluster.widgets as cf_widgets +import pandas as pd import openshift from openshift.selector import Selector @@ -88,9 +90,6 @@ from ray.job_submission import JobSubmissionClient from codeflare_sdk.job.ray_jobs import RayJobClient -import ipywidgets as widgets -from IPython.display import display - # For mocking openshift client results fake_res = openshift.Result("fake") @@ -941,10 +940,11 @@ def test_ray_details(mocker, capsys): ray1 = RayCluster( name="raytest1", status=RayClusterStatus.READY, - workers=1, + num_workers=1, worker_mem_requests="2G", worker_mem_limits="2G", - worker_cpu=1, + worker_cpu_requests=1, + worker_cpu_limits=1, namespace="ns", dashboard="fake-uri", head_cpu_requests=2, @@ -979,10 +979,11 @@ def test_ray_details(mocker, capsys): assert details == ray2 assert ray2.name == "raytest2" assert ray1.namespace == ray2.namespace - assert ray1.workers == ray2.workers + assert ray1.num_workers == ray2.num_workers assert ray1.worker_mem_requests == ray2.worker_mem_requests assert ray1.worker_mem_limits == ray2.worker_mem_limits - assert ray1.worker_cpu == ray2.worker_cpu + assert ray1.worker_cpu_requests == ray2.worker_cpu_requests + assert ray1.worker_cpu_limits == ray2.worker_cpu_limits assert ray1.worker_extended_resources == ray2.worker_extended_resources try: print_clusters([ray1, ray2]) @@ -1006,7 +1007,7 @@ def test_ray_details(mocker, capsys): " │ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" " │ │ # Workers │ │ Memory CPU GPU │ │ \n" " │ │ │ │ │ │ \n" - " │ │ 1 │ │ 2G~2G 1 0 │ │ \n" + " │ │ 1 │ │ 2G~2G 1~1 0 │ │ \n" " │ │ │ │ │ │ \n" " │ ╰─────────────╯ ╰──────────────────────────────────────╯ │ \n" " 
╰───────────────────────────────────────────────────────────────╯ \n" @@ -1024,7 +1025,7 @@ def test_ray_details(mocker, capsys): " │ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" " │ │ # Workers │ │ Memory CPU GPU │ │ \n" " │ │ │ │ │ │ \n" - " │ │ 1 │ │ 2G~2G 1 0 │ │ \n" + " │ │ 1 │ │ 2G~2G 1~1 0 │ │ \n" " │ │ │ │ │ │ \n" " │ ╰─────────────╯ ╰──────────────────────────────────────╯ │ \n" " ╰───────────────────────────────────────────────────────────────╯ \n" @@ -1040,7 +1041,7 @@ def test_ray_details(mocker, capsys): "│ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │\n" "│ │ # Workers │ │ Memory CPU GPU │ │\n" "│ │ │ │ │ │\n" - "│ │ 1 │ │ 2G~2G 1 0 │ │\n" + "│ │ 1 │ │ 2G~2G 1~1 0 │ │\n" "│ │ │ │ │ │\n" "│ ╰─────────────╯ ╰──────────────────────────────────────╯ │\n" "╰───────────────────────────────────────────────────────────────╯\n" @@ -2245,7 +2246,7 @@ def test_list_clusters(mocker, capsys): " │ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │ \n" " │ │ # Workers │ │ Memory CPU GPU │ │ \n" " │ │ │ │ │ │ \n" - " │ │ 1 │ │ 2G~2G 1 0 │ │ \n" + " │ │ 1 │ │ 2G~2G 1~1 0 │ │ \n" " │ │ │ │ │ │ \n" " │ ╰─────────────╯ ╰──────────────────────────────────────╯ │ \n" " ╰───────────────────────────────────────────────────────────────╯ \n" @@ -2261,7 +2262,7 @@ def test_list_clusters(mocker, capsys): "│ ╭── Workers ──╮ ╭───────── Worker specs(each) ─────────╮ │\n" "│ │ # Workers │ │ Memory CPU GPU │ │\n" "│ │ │ │ │ │\n" - "│ │ 1 │ │ 2G~2G 1 0 │ │\n" + "│ │ 1 │ │ 2G~2G 1~1 0 │ │\n" "│ │ │ │ │ │\n" "│ ╰─────────────╯ ╰──────────────────────────────────────╯ │\n" "╰───────────────────────────────────────────────────────────────╯\n" @@ -2357,10 +2358,11 @@ def test_cluster_status(mocker): fake_ray = RayCluster( name="test", status=RayClusterStatus.UNKNOWN, - workers=1, + num_workers=1, worker_mem_requests=2, worker_mem_limits=2, - worker_cpu=1, + worker_cpu_requests=1, + worker_cpu_limits=1, namespace="ns", dashboard="fake-uri", 
def test_view_clusters(mocker, capsys):
    """Exercise view_clusters and its button/selection handlers with mocked widgets and Kubernetes calls."""
    from kubernetes.client.rest import ApiException

    # Outside a notebook the function only warns and returns None.
    mocker.patch("codeflare_sdk.cluster.widgets.is_notebook", return_value=False)
    with pytest.warns(
        UserWarning,
        match="view_clusters can only be used in a Jupyter Notebook environment.",
    ):
        result = cf_widgets.view_clusters(namespace="default")
        # Assert the function returns None when not in a notebook environment
        assert result is None

    mocker.patch("codeflare_sdk.cluster.widgets.is_notebook", return_value=True)

    # Mock Kubernetes API responses
    mocker.patch("kubernetes.client.ApisApi.get_api_versions")
    mocker.patch(
        "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
        return_value={"items": []},
    )
    mocker.patch("codeflare_sdk.cluster.cluster._check_aw_exists", return_value=False)

    # Return empty dataframe when no clusters are found
    mocker.patch("codeflare_sdk.cluster.cluster.list_all_clusters", return_value=[])
    mocker.patch(
        "codeflare_sdk.cluster.cluster.get_current_namespace",
        return_value="default",
    )
    df = cf_widgets._fetch_cluster_data(namespace="default")
    assert df.empty

    # Capture the return value of THIS call; previously the stale result of the
    # non-notebook call above was re-asserted.
    result = cf_widgets.view_clusters()
    captured = capsys.readouterr()
    assert "No clusters found in the default namespace." in captured.out

    # Assert the function returns None
    assert result is None

    test_df = pd.DataFrame(
        {
            "Name": ["test-cluster"],
            "Namespace": ["default"],
            "Num Workers": ["1"],
            "Head GPUs": ["0"],
            "Worker GPUs": ["0"],
            "Head CPU Req~Lim": ["1~1"],
            "Head Memory Req~Lim": ["1Gi~1Gi"],
            "Worker CPU Req~Lim": ["1~1"],
            "Worker Memory Req~Lim": ["1Gi~1Gi"],
            "status": ['Ready ✓'],
        }
    )

    # Mock the _fetch_cluster_data function to return a test DataFrame
    mocker.patch(
        "codeflare_sdk.cluster.widgets._fetch_cluster_data", return_value=test_df
    )

    # Mock the Cluster class and related methods
    mocker.patch("codeflare_sdk.cluster.Cluster")
    mocker.patch("codeflare_sdk.cluster.ClusterConfiguration")

    with patch("ipywidgets.ToggleButtons") as MockToggleButtons, patch(
        "ipywidgets.Button"
    ) as MockButton, patch("ipywidgets.Output") as MockOutput, patch(
        "ipywidgets.HBox"
    ), patch(
        "ipywidgets.VBox"
    ), patch(
        "IPython.display.display"
    ) as mock_display, patch(
        "IPython.display.HTML"
    ), patch(
        "codeflare_sdk.cluster.widgets.Javascript"
    ) as mock_javascript:
        # Create mock widget instances
        mock_toggle = MagicMock()
        mock_delete_button = MagicMock()
        mock_list_jobs_button = MagicMock()
        mock_ray_dashboard_button = MagicMock()
        mock_output = MagicMock()

        # Set the return values for the mocked widgets
        MockToggleButtons.return_value = mock_toggle
        MockButton.side_effect = [
            mock_delete_button,
            mock_list_jobs_button,
            mock_ray_dashboard_button,
        ]
        MockOutput.return_value = mock_output

        # Call the function under test
        cf_widgets.view_clusters()

        # Simulate selecting a cluster
        mock_toggle.value = "test-cluster"
        selection_change = {"new": "test-cluster"}
        cf_widgets._on_cluster_click(
            selection_change, mock_output, "default", mock_toggle
        )

        # Assert that a selection observer was registered on the toggle buttons
        mock_toggle.observe.assert_called()

        # Simulate clicking the list jobs button
        cf_widgets._on_list_jobs_button_click(
            None, mock_toggle, test_df, mock_output, mock_output
        )
        mock_javascript.assert_called_once()

        # Simulate clicking the Ray dashboard button
        cf_widgets._on_ray_dashboard_button_click(
            None, mock_toggle, test_df, mock_output, mock_output
        )
        # Both click handlers must have emitted Javascript (was an assignment,
        # which silently overwrote the counter instead of checking it).
        assert mock_javascript.call_count == 2

        mocker.patch(
            "kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object",
        )
        # A 404 on the follow-up GET tells _delete_cluster the resource is gone.
        mock_response = mocker.MagicMock()
        mock_response.status = 404
        mock_exception = ApiException(http_resp=mock_response)
        mocker.patch(
            "kubernetes.client.CustomObjectsApi.get_namespaced_custom_object",
            side_effect=mock_exception,
        )

        # Simulate clicking the delete button
        cf_widgets._on_delete_button_click(
            None,
            mock_toggle,
            test_df,
            mock_output,
            mock_output,
            mock_delete_button,
            mock_list_jobs_button,
            mock_ray_dashboard_button,
        )
        # Exactly the three table buttons were created (was an assignment).
        assert MockButton.call_count == 3
def test_format_status():
    """Every known status maps to its display label; unknown values pass through."""
    # Expected label for each possible status
    expected_labels = {
        RayClusterStatus.READY: 'Ready ✓',
        RayClusterStatus.SUSPENDED: 'Suspended ❄️',
        RayClusterStatus.FAILED: 'Failed ✗',
        RayClusterStatus.UNHEALTHY: 'Unhealthy',
        RayClusterStatus.UNKNOWN: 'Unknown',
    }

    for status in expected_labels:
        formatted = cf_widgets._format_status(status)
        assert formatted == expected_labels[status], f"Failed for status: {status}"

    # An unrecognized status is returned unchanged
    unrecognized_status = "NotAStatus"
    passthrough = cf_widgets._format_status(unrecognized_status)
    assert passthrough == "NotAStatus", "Failed for unrecognized status"
+ const upDownWidgetCellIndex = 3; // 4 on OpenShift - await waitForWidget(page, widgetCellIndex, 'input[type="checkbox"]'); - await waitForWidget(page, widgetCellIndex, 'button:has-text("Cluster Down")'); - await waitForWidget(page, widgetCellIndex, 'button:has-text("Cluster Up")'); + await waitForWidget(page, upDownWidgetCellIndex, 'input[type="checkbox"]'); + await waitForWidget(page, upDownWidgetCellIndex, 'button:has-text("Cluster Down")'); + await waitForWidget(page, upDownWidgetCellIndex, 'button:has-text("Cluster Up")'); - await interactWithWidget(page, widgetCellIndex, 'input[type="checkbox"]', async (checkbox) => { + await interactWithWidget(page, upDownWidgetCellIndex, 'input[type="checkbox"]', async (checkbox) => { await checkbox.click(); const isChecked = await checkbox.isChecked(); expect(isChecked).toBe(true); }); - await interactWithWidget(page, widgetCellIndex, 'button:has-text("Cluster Down")', async (button) => { + await interactWithWidget(page, upDownWidgetCellIndex, 'button:has-text("Cluster Down")', async (button) => { await button.click(); const clusterDownMessage = await page.waitForSelector('text=No instances found, nothing to be done.', { timeout: 5000 }); expect(clusterDownMessage).not.toBeNull(); }); - await interactWithWidget(page, widgetCellIndex, 'button:has-text("Cluster Up")', async (button) => { + await interactWithWidget(page, upDownWidgetCellIndex, 'button:has-text("Cluster Up")', async (button) => { await button.click(); const successMessage = await page.waitForSelector('text=Ray Cluster: \'raytest\' has successfully been created', { timeout: 10000 }); @@ -95,13 +101,51 @@ test.describe("Visual Regression", () => { await runPreviousCell(page, cellCount, '(, True)'); - await interactWithWidget(page, widgetCellIndex, 'button:has-text("Cluster Down")', async (button) => { + await interactWithWidget(page, upDownWidgetCellIndex, 'button:has-text("Cluster Down")', async (button) => { await button.click(); const clusterDownMessage = 
await page.waitForSelector('text=Ray Cluster: \'raytest\' has successfully been deleted', { timeout: 5000 }); expect(clusterDownMessage).not.toBeNull(); }); await runPreviousCell(page, cellCount, '(, False)'); + + // view_clusters table with buttons + await interactWithWidget(page, upDownWidgetCellIndex, 'input[type="checkbox"]', async (checkbox) => { + await checkbox.click(); + const isChecked = await checkbox.isChecked(); + expect(isChecked).toBe(false); + }); + + await interactWithWidget(page, upDownWidgetCellIndex, 'button:has-text("Cluster Up")', async (button) => { + await button.click(); + const successMessage = await page.waitForSelector('text=Ray Cluster: \'raytest\' has successfully been created', { timeout: 10000 }); + expect(successMessage).not.toBeNull(); + }); + + const viewClustersCellIndex = 4; // 5 on OpenShift + await page.notebook.runCell(cellCount - 2, true); + await interactWithWidget(page, viewClustersCellIndex, 'button:has-text("Open Ray Dashboard")', async (button) => { + await button.click(); + const successMessage = await page.waitForSelector('text=Opening Ray Dashboard for raytest cluster', { timeout: 5000 }); + expect(successMessage).not.toBeNull(); + }); + + await interactWithWidget(page, viewClustersCellIndex, 'button:has-text("View Jobs")', async (button) => { + await button.click(); + const successMessage = await page.waitForSelector('text=Opening Ray Jobs Dashboard for raytest cluster', { timeout: 5000 }); + expect(successMessage).not.toBeNull(); + }); + + await interactWithWidget(page, viewClustersCellIndex, 'button:has-text("Delete Cluster")', async (button) => { + await button.click(); + + const noClustersMessage = await page.waitForSelector(`text=No clusters found in the ${namespace} namespace.`, { timeout: 5000 }); + expect(noClustersMessage).not.toBeNull(); + const successMessage = await page.waitForSelector(`text=Cluster raytest in the ${namespace} namespace was deleted successfully.`, { timeout: 5000 }); + 
expect(successMessage).not.toBeNull(); + }); + + await runPreviousCell(page, cellCount, '(, False)'); }); }); diff --git a/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-4-linux.png b/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-4-linux.png index 9d881da28..691e7124f 100644 Binary files a/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-4-linux.png and b/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-4-linux.png differ diff --git a/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-5-linux.png b/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-5-linux.png new file mode 100644 index 000000000..46861fd8a Binary files /dev/null and b/ui-tests/tests/widget_notebook_example.test.ts-snapshots/widgets-cell-5-linux.png differ