diff --git a/docs/detailed-documentation/cluster/cluster.html b/docs/detailed-documentation/cluster/cluster.html
index 1e49cf6d6..5c8688eb5 100644
--- a/docs/detailed-documentation/cluster/cluster.html
+++ b/docs/detailed-documentation/cluster/cluster.html
@@ -61,6 +61,7 @@

Module codeflare_sdk.cluster.cluster

from ..utils import pretty_print
from ..utils.generate_yaml import (
    generate_appwrapper,
+    head_worker_gpu_count_from_cluster,
)
from ..utils.kube_api_helpers import _kube_api_error_handling
from ..utils.generate_yaml import is_openshift_cluster
@@ -135,16 +136,6 @@


    )
        return self._job_submission_client

-    def validate_image_config(self):
-        """
-        Validates that the image configuration is not empty.
-
-        :param image: The image string to validate
-        :raises ValueError: If the image is not specified
-        """
-        if self.config.image == "" or self.config.image == None:
-            raise ValueError("Image must be specified in the ClusterConfiguration")
-
    def create_app_wrapper(self):
        """
        Called upon cluster object creation, creates an AppWrapper yaml based on
@@ -160,51 +151,7 @@


f"Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication." ) - # Validate image configuration - self.validate_image_config() - - # Before attempting to create the cluster AW, let's evaluate the ClusterConfig - - name = self.config.name - namespace = self.config.namespace - head_cpus = self.config.head_cpus - head_memory = self.config.head_memory - head_gpus = self.config.head_gpus - min_cpu = self.config.min_cpus - max_cpu = self.config.max_cpus - min_memory = self.config.min_memory - max_memory = self.config.max_memory - gpu = self.config.num_gpus - workers = self.config.num_workers - template = self.config.template - image = self.config.image - appwrapper = self.config.appwrapper - env = self.config.envs - image_pull_secrets = self.config.image_pull_secrets - write_to_file = self.config.write_to_file - local_queue = self.config.local_queue - labels = self.config.labels - return generate_appwrapper( - name=name, - namespace=namespace, - head_cpus=head_cpus, - head_memory=head_memory, - head_gpus=head_gpus, - min_cpu=min_cpu, - max_cpu=max_cpu, - min_memory=min_memory, - max_memory=max_memory, - gpu=gpu, - workers=workers, - template=template, - image=image, - appwrapper=appwrapper, - env=env, - image_pull_secrets=image_pull_secrets, - write_to_file=write_to_file, - local_queue=local_queue, - labels=labels, - ) + return generate_appwrapper(self) # creates a new cluster with the provided or default spec def up(self): @@ -350,7 +297,7 @@


            if print_to_console:
                # overriding the number of gpus with requested
-                cluster.worker_gpu = self.config.num_gpus
+                _, cluster.worker_gpu = head_worker_gpu_count_from_cluster(self)
                pretty_print.print_cluster_status(cluster)
        elif print_to_console:
            if status == CodeFlareClusterStatus.UNKNOWN:
@@ -488,6 +435,29 @@


""" return self.job_client.get_job_logs(job_id) + @staticmethod + def _head_worker_extended_resources_from_rc_dict(rc: Dict) -> Tuple[dict, dict]: + head_extended_resources, worker_extended_resources = {}, {} + for resource in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"].keys(): + if resource in ["memory", "cpu"]: + continue + worker_extended_resources[resource] = rc["spec"]["workerGroupSpecs"][0][ + "template" + ]["spec"]["containers"][0]["resources"]["limits"][resource] + + for resource in rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"].keys(): + if resource in ["memory", "cpu"]: + continue + head_extended_resources[resource] = rc["spec"]["headGroupSpec"]["template"][ + "spec" + ]["containers"][0]["resources"]["limits"][resource] + + return head_extended_resources, worker_extended_resources + def from_k8_cluster_object( rc, appwrapper=True, @@ -501,28 +471,30 @@


            else []
        )

+        (
+            head_extended_resources,
+            worker_extended_resources,
+        ) = Cluster._head_worker_extended_resources_from_rc_dict(rc)
+
        cluster_config = ClusterConfiguration(
            name=rc["metadata"]["name"],
            namespace=rc["metadata"]["namespace"],
            machine_types=machine_types,
            num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"],
-            min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+            worker_cpu_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
                "containers"
            ][0]["resources"]["requests"]["cpu"],
-            max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+            worker_cpu_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
                "containers"
            ][0]["resources"]["limits"]["cpu"],
-            min_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
-                "containers"
-            ][0]["resources"]["requests"]["memory"],
-            max_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+            worker_memory_requests=rc["spec"]["workerGroupSpecs"][0]["template"][
+                "spec"
+            ]["containers"][0]["resources"]["requests"]["memory"],
+            worker_memory_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
                "containers"
            ][0]["resources"]["limits"]["memory"],
-            num_gpus=int(
-                rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
-                    "resources"
-                ]["limits"]["nvidia.com/gpu"]
-            ),
+            worker_extended_resource_requests=worker_extended_resources,
+            head_extended_resource_requests=head_extended_resources,
            image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
                0
            ]["image"],
@@ -903,6 +875,11 @@


protocol = "https" dashboard_url = f"{protocol}://{ingress.spec.rules[0].host}" + ( + head_extended_resources, + worker_extended_resources, + ) = Cluster._head_worker_extended_resources_from_rc_dict(rc) + return RayCluster( name=rc["metadata"]["name"], status=status, @@ -917,7 +894,7 @@


worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ 0 ]["resources"]["limits"]["cpu"], - worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for + worker_extended_resources=worker_extended_resources, namespace=rc["metadata"]["namespace"], head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ "resources" @@ -925,9 +902,7 @@


head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ "resources" ]["limits"]["memory"], - head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ - "resources" - ]["limits"]["nvidia.com/gpu"], + head_extended_resources=head_extended_resources, dashboard=dashboard_url, ) @@ -949,15 +924,15 @@


        name=cluster.config.name,
        status=cluster.status(print_to_console=False)[0],
        workers=cluster.config.num_workers,
-        worker_mem_min=cluster.config.min_memory,
-        worker_mem_max=cluster.config.max_memory,
-        worker_cpu=cluster.config.min_cpus,
-        worker_gpu=cluster.config.num_gpus,
+        worker_mem_min=cluster.config.worker_memory_requests,
+        worker_mem_max=cluster.config.worker_memory_limits,
+        worker_cpu=cluster.config.worker_cpu_requests,
+        worker_extended_resources=cluster.config.worker_extended_resource_requests,
        namespace=cluster.config.namespace,
        dashboard=cluster.cluster_dashboard_uri(),
        head_cpus=cluster.config.head_cpus,
        head_mem=cluster.config.head_memory,
-        head_gpu=cluster.config.head_gpus,
+        head_extended_resources=cluster.config.head_extended_resource_requests,
    )
    if ray.status == CodeFlareClusterStatus.READY:
        ray.status = RayClusterStatus.READY

@@ -2124,28 +2028,6 @@

Methods

    return _kube_api_error_handling(e)

-def validate_image_config(self)
-
-Validates that the image configuration is not empty.
-
-:param image: The image string to validate
-:raises ValueError: If the image is not specified
-
-def validate_image_config(self):
-    """
-    Validates that the image configuration is not empty.
-
-    :param image: The image string to validate
-    :raises ValueError: If the image is not specified
-    """
-    if self.config.image == "" or self.config.image == None:
-        raise ValueError("Image must be specified in the ClusterConfiguration")

def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True)
@@ -2234,7 +2116,6 @@

local_client_url
  • status
  • up
- • validate_image_config
  • wait_ready

diff --git a/docs/detailed-documentation/cluster/config.html b/docs/detailed-documentation/cluster/config.html
index 60b6dc95f..87ad3b77c 100644
--- a/docs/detailed-documentation/cluster/config.html
+++ b/docs/detailed-documentation/cluster/config.html
@@ -50,67 +50,225 @@

    Module codeflare_sdk.cluster.config

    Cluster object. """ -from dataclasses import dataclass, field import pathlib -import typing +import warnings +from dataclasses import dataclass, field, fields +from typing import Dict, List, Optional, Union, get_args, get_origin dir = pathlib.Path(__file__).parent.parent.resolve() +# https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html +DEFAULT_RESOURCE_MAPPING = { + "nvidia.com/gpu": "GPU", + "intel.com/gpu": "GPU", + "amd.com/gpu": "GPU", + "aws.amazon.com/neuroncore": "neuron_cores", + "google.com/tpu": "TPU", + "habana.ai/gaudi": "HPU", + "huawei.com/Ascend910": "NPU", + "huawei.com/Ascend310": "NPU", +} + @dataclass class ClusterConfiguration: """ This dataclass is used to specify resource requirements and other details, and is passed in as an argument when creating a Cluster object. + + Attributes: + - name: The name of the cluster. + - namespace: The namespace in which the cluster should be created. + - head_info: A list of strings containing information about the head node. + - head_cpus: The number of CPUs to allocate to the head node. + - head_memory: The amount of memory to allocate to the head node. + - head_gpus: The number of GPUs to allocate to the head node. (Deprecated, use head_extended_resource_requests) + - head_extended_resource_requests: A dictionary of extended resource requests for the head node. ex: {"nvidia.com/gpu": 1} + - machine_types: A list of machine types to use for the cluster. + - min_cpus: The minimum number of CPUs to allocate to each worker. + - max_cpus: The maximum number of CPUs to allocate to each worker. + - num_workers: The number of workers to create. + - min_memory: The minimum amount of memory to allocate to each worker. + - max_memory: The maximum amount of memory to allocate to each worker. + - num_gpus: The number of GPUs to allocate to each worker. (Deprecated, use worker_extended_resource_requests) + - template: The path to the template file to use for the cluster. + - appwrapper: A boolean indicating whether to use an AppWrapper. + - envs: A dictionary of environment variables to set for the cluster. + - image: The image to use for the cluster. + - image_pull_secrets: A list of image pull secrets to use for the cluster. + - write_to_file: A boolean indicating whether to write the cluster configuration to a file. + - verify_tls: A boolean indicating whether to verify TLS when connecting to the cluster. + - labels: A dictionary of labels to apply to the cluster. + - worker_extended_resource_requests: A dictionary of extended resource requests for each worker. ex: {"nvidia.com/gpu": 1} + - extended_resource_mapping: A dictionary of custom resource mappings to map extended resource requests to RayCluster resource names + - overwrite_default_resource_mapping: A boolean indicating whether to overwrite the default resource mapping. 
""" name: str - namespace: str = None - head_info: list = field(default_factory=list) - head_cpus: typing.Union[int, str] = 2 - head_memory: typing.Union[int, str] = 8 - head_gpus: int = 0 - machine_types: list = field(default_factory=list) # ["m4.xlarge", "g4dn.xlarge"] - min_cpus: typing.Union[int, str] = 1 - max_cpus: typing.Union[int, str] = 1 + namespace: Optional[str] = None + head_info: List[str] = field(default_factory=list) + head_cpus: Union[int, str] = 2 + head_memory: Union[int, str] = 8 + head_gpus: Optional[int] = None # Deprecating + head_extended_resource_requests: Dict[str, int] = field(default_factory=dict) + machine_types: List[str] = field( + default_factory=list + ) # ["m4.xlarge", "g4dn.xlarge"] + worker_cpu_requests: Union[int, str] = 1 + worker_cpu_limits: Union[int, str] = 1 + min_cpus: Optional[Union[int, str]] = None # Deprecating + max_cpus: Optional[Union[int, str]] = None # Deprecating num_workers: int = 1 - min_memory: typing.Union[int, str] = 2 - max_memory: typing.Union[int, str] = 2 - num_gpus: int = 0 + worker_memory_requests: Union[int, str] = 2 + worker_memory_limits: Union[int, str] = 2 + min_memory: Optional[Union[int, str]] = None # Deprecating + max_memory: Optional[Union[int, str]] = None # Deprecating + num_gpus: Optional[int] = None # Deprecating template: str = f"{dir}/templates/base-template.yaml" appwrapper: bool = False - envs: dict = field(default_factory=dict) + envs: Dict[str, str] = field(default_factory=dict) image: str = "" - image_pull_secrets: list = field(default_factory=list) + image_pull_secrets: List[str] = field(default_factory=list) write_to_file: bool = False verify_tls: bool = True - labels: dict = field(default_factory=dict) + labels: Dict[str, str] = field(default_factory=dict) + worker_extended_resource_requests: Dict[str, int] = field(default_factory=dict) + extended_resource_mapping: Dict[str, str] = field(default_factory=dict) + overwrite_default_resource_mapping: bool = False + local_queue: Optional[str] = None def __post_init__(self): if not self.verify_tls: print( "Warning: TLS verification has been disabled - Endpoint checks will be bypassed" ) + + self._validate_types() self._memory_to_string() self._str_mem_no_unit_add_GB() + self._memory_to_resource() + self._cpu_to_resource() + self._gpu_to_resource() + self._combine_extended_resource_mapping() + self._validate_extended_resource_requests(self.head_extended_resource_requests) + self._validate_extended_resource_requests( + self.worker_extended_resource_requests + ) + + def _combine_extended_resource_mapping(self): + if overwritten := set(self.extended_resource_mapping.keys()).intersection( + DEFAULT_RESOURCE_MAPPING.keys() + ): + if self.overwrite_default_resource_mapping: + warnings.warn( + f"Overwriting default resource mapping for {overwritten}", + UserWarning, + ) + else: + raise ValueError( + f"Resource mapping already exists for {overwritten}, set overwrite_default_resource_mapping to True to overwrite" + ) + self.extended_resource_mapping = { + **DEFAULT_RESOURCE_MAPPING, + **self.extended_resource_mapping, + } + + def _validate_extended_resource_requests(self, extended_resources: Dict[str, int]): + for k in extended_resources.keys(): + if k not in self.extended_resource_mapping.keys(): + raise ValueError( + f"extended resource '{k}' not found in extended_resource_mapping, available resources are {list(self.extended_resource_mapping.keys())}, to add more supported resources use extended_resource_mapping. i.e. 
extended_resource_mapping = {{'{k}': 'FOO_BAR'}}" + ) + + def _gpu_to_resource(self): + if self.head_gpus: + warnings.warn( + f"head_gpus is being deprecated, replacing with head_extended_resource_requests['nvidia.com/gpu'] = {self.head_gpus}" + ) + if "nvidia.com/gpu" in self.head_extended_resource_requests: + raise ValueError( + "nvidia.com/gpu already exists in head_extended_resource_requests" + ) + self.head_extended_resource_requests["nvidia.com/gpu"] = self.head_gpus + if self.num_gpus: + warnings.warn( + f"num_gpus is being deprecated, replacing with worker_extended_resource_requests['nvidia.com/gpu'] = {self.num_gpus}" + ) + if "nvidia.com/gpu" in self.worker_extended_resource_requests: + raise ValueError( + "nvidia.com/gpu already exists in worker_extended_resource_requests" + ) + self.worker_extended_resource_requests["nvidia.com/gpu"] = self.num_gpus def _str_mem_no_unit_add_GB(self): if isinstance(self.head_memory, str) and self.head_memory.isdecimal(): self.head_memory = f"{self.head_memory}G" - if isinstance(self.min_memory, str) and self.min_memory.isdecimal(): - self.min_memory = f"{self.min_memory}G" - if isinstance(self.max_memory, str) and self.max_memory.isdecimal(): - self.max_memory = f"{self.max_memory}G" + if ( + isinstance(self.worker_memory_requests, str) + and self.worker_memory_requests.isdecimal() + ): + self.worker_memory_requests = f"{self.worker_memory_requests}G" + if ( + isinstance(self.worker_memory_limits, str) + and self.worker_memory_limits.isdecimal() + ): + self.worker_memory_limits = f"{self.worker_memory_limits}G" def _memory_to_string(self): if isinstance(self.head_memory, int): self.head_memory = f"{self.head_memory}G" - if isinstance(self.min_memory, int): - self.min_memory = f"{self.min_memory}G" - if isinstance(self.max_memory, int): - self.max_memory = f"{self.max_memory}G" + if isinstance(self.worker_memory_requests, int): + self.worker_memory_requests = f"{self.worker_memory_requests}G" + if isinstance(self.worker_memory_limits, int): + self.worker_memory_limits = f"{self.worker_memory_limits}G" + + def _cpu_to_resource(self): + if self.min_cpus: + warnings.warn("min_cpus is being deprecated, use worker_cpu_requests") + self.worker_cpu_requests = self.min_cpus + if self.max_cpus: + warnings.warn("max_cpus is being deprecated, use worker_cpu_limits") + self.worker_cpu_limits = self.max_cpus + + def _memory_to_resource(self): + if self.min_memory: + warnings.warn("min_memory is being deprecated, use worker_memory_requests") + self.worker_memory_requests = f"{self.min_memory}G" + if self.max_memory: + warnings.warn("max_memory is being deprecated, use worker_memory_limits") + self.worker_memory_limits = f"{self.max_memory}G" + + def _validate_types(self): + """Validate the types of all fields in the ClusterConfiguration dataclass.""" + for field_info in fields(self): + value = getattr(self, field_info.name) + expected_type = field_info.type + if not self._is_type(value, expected_type): + raise TypeError( + f"'{field_info.name}' should be of type {expected_type}" + ) + + @staticmethod + def _is_type(value, expected_type): + """Check if the value matches the expected type.""" + + def check_type(value, expected_type): + origin_type = get_origin(expected_type) + args = get_args(expected_type) + if origin_type is Union: + return any(check_type(value, union_type) for union_type in args) + if origin_type is list: + return all(check_type(elem, args[0]) for elem in value) + if origin_type is dict: + return all( + check_type(k, args[0]) and 
check_type(v, args[1]) + for k, v in value.items() + ) + if origin_type is tuple: + return all(check_type(elem, etype) for elem, etype in zip(value, args)) + return isinstance(value, expected_type) - local_queue: str = None + return check_type(value, expected_type)
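For readers tracking the API change above, a minimal sketch of the new configuration surface (names and values are illustrative):

    from codeflare_sdk.cluster.config import ClusterConfiguration

    # New-style fields: explicit requests/limits plus extended resources.
    # "amd.com/gpu" resolves via the built-in DEFAULT_RESOURCE_MAPPING to "GPU".
    config = ClusterConfiguration(
        name="raytest",
        namespace="default",
        num_workers=2,
        worker_cpu_requests=1,
        worker_cpu_limits=2,
        worker_memory_requests=4,   # ints are normalized to "4G"
        worker_memory_limits=8,
        worker_extended_resource_requests={"amd.com/gpu": 1},
    )

    # Deprecated spellings still work: __post_init__ warns and rewrites them,
    # e.g. num_gpus=1 becomes worker_extended_resource_requests["nvidia.com/gpu"] = 1.
    legacy = ClusterConfiguration(name="raytest", namespace="default", num_gpus=1)

Note that _validate_types now rejects wrong field types up front; for example, num_workers="2" raises a TypeError instead of failing later during YAML generation.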
    @@ -124,11 +282,37 @@

    Classes

class ClusterConfiguration
-(name: str, namespace: str = None, head_info: list = <factory>, head_cpus: Union[int, str] = 2, head_memory: Union[int, str] = 8, head_gpus: int = 0, machine_types: list = <factory>, min_cpus: Union[int, str] = 1, max_cpus: Union[int, str] = 1, num_workers: int = 1, min_memory: Union[int, str] = 2, max_memory: Union[int, str] = 2, num_gpus: int = 0, template: str = '/home/runner/work/codeflare-sdk/codeflare-sdk/src/codeflare_sdk/templates/base-template.yaml', appwrapper: bool = False, envs: dict = <factory>, image: str = '', image_pull_secrets: list = <factory>, write_to_file: bool = False, verify_tls: bool = True, labels: dict = <factory>, local_queue: str = None)
+(name: str, namespace: Optional[str] = None, head_info: List[str] = <factory>, head_cpus: Union[int, str] = 2, head_memory: Union[int, str] = 8, head_gpus: Optional[int] = None, head_extended_resource_requests: Dict[str, int] = <factory>, machine_types: List[str] = <factory>, worker_cpu_requests: Union[int, str] = 1, worker_cpu_limits: Union[int, str] = 1, min_cpus: Union[int, str, ForwardRef(None)] = None, max_cpus: Union[int, str, ForwardRef(None)] = None, num_workers: int = 1, worker_memory_requests: Union[int, str] = 2, worker_memory_limits: Union[int, str] = 2, min_memory: Union[int, str, ForwardRef(None)] = None, max_memory: Union[int, str, ForwardRef(None)] = None, num_gpus: Optional[int] = None, template: str = '/home/runner/work/codeflare-sdk/codeflare-sdk/src/codeflare_sdk/templates/base-template.yaml', appwrapper: bool = False, envs: Dict[str, str] = <factory>, image: str = '', image_pull_secrets: List[str] = <factory>, write_to_file: bool = False, verify_tls: bool = True, labels: Dict[str, str] = <factory>, worker_extended_resource_requests: Dict[str, int] = <factory>, extended_resource_mapping: Dict[str, str] = <factory>, overwrite_default_resource_mapping: bool = False, local_queue: Optional[str] = None)

This dataclass is used to specify resource requirements and other details, and
-is passed in as an argument when creating a Cluster object.
+is passed in as an argument when creating a Cluster object.
+
+Attributes:
+- name: The name of the cluster.
+- namespace: The namespace in which the cluster should be created.
+- head_info: A list of strings containing information about the head node.
+- head_cpus: The number of CPUs to allocate to the head node.
+- head_memory: The amount of memory to allocate to the head node.
+- head_gpus: The number of GPUs to allocate to the head node. (Deprecated, use head_extended_resource_requests)
+- head_extended_resource_requests: A dictionary of extended resource requests for the head node. ex: {"nvidia.com/gpu": 1}
+- machine_types: A list of machine types to use for the cluster.
+- min_cpus: The minimum number of CPUs to allocate to each worker.
+- max_cpus: The maximum number of CPUs to allocate to each worker.
+- num_workers: The number of workers to create.
+- min_memory: The minimum amount of memory to allocate to each worker.
+- max_memory: The maximum amount of memory to allocate to each worker.
+- num_gpus: The number of GPUs to allocate to each worker. (Deprecated, use worker_extended_resource_requests)
+- template: The path to the template file to use for the cluster.
+- appwrapper: A boolean indicating whether to use an AppWrapper.
+- envs: A dictionary of environment variables to set for the cluster.
+- image: The image to use for the cluster.
+- image_pull_secrets: A list of image pull secrets to use for the cluster.
+- write_to_file: A boolean indicating whether to write the cluster configuration to a file.
+- verify_tls: A boolean indicating whether to verify TLS when connecting to the cluster.
+- labels: A dictionary of labels to apply to the cluster.
+- worker_extended_resource_requests: A dictionary of extended resource requests for each worker. ex: {"nvidia.com/gpu": 1}
+- extended_resource_mapping: A dictionary of custom resource mappings to map extended resource requests to RayCluster resource names
+- overwrite_default_resource_mapping: A boolean indicating whether to overwrite the default resource mapping.


    Class variables

    @@ -194,7 +523,11 @@


-var envs : dict
+var envs : Dict[str, str]
+
+var extended_resource_mapping : Dict[str, str]
    @@ -202,11 +535,15 @@


-var head_gpus : int
+var head_extended_resource_requests : Dict[str, int]
+
+var head_gpus : Optional[int]

-var head_info : list
+var head_info : List[str]
    @@ -218,35 +555,35 @@


-var image_pull_secrets : list
+var image_pull_secrets : List[str]

-var labels : dict
+var labels : Dict[str, str]

-var local_queue : str
+var local_queue : Optional[str]

-var machine_types : list
+var machine_types : List[str]

-var max_cpus : Union[int, str]
+var max_cpus : Union[int, str, ForwardRef(None)]

-var max_memory : Union[int, str]
+var max_memory : Union[int, str, ForwardRef(None)]

-var min_cpus : Union[int, str]
+var min_cpus : Union[int, str, ForwardRef(None)]

-var min_memory : Union[int, str]
+var min_memory : Union[int, str, ForwardRef(None)]
    @@ -254,11 +591,11 @@


-var namespace : str
+var namespace : Optional[str]

-var num_gpus : int
+var num_gpus : Optional[int]
    @@ -266,6 +603,10 @@


+var overwrite_default_resource_mapping : bool

var template : str
    @@ -274,6 +615,26 @@


+var worker_cpu_limits : Union[int, str]
+
+var worker_cpu_requests : Union[int, str]
+
+var worker_extended_resource_requests : Dict[str, int]
+
+var worker_memory_limits : Union[int, str]
+
+var worker_memory_requests : Union[int, str]
    var write_to_file : bool
    @@ -298,10 +659,12 @@

    Index

class RayCluster
-(name: str, status: RayClusterStatus, head_cpus: int, head_mem: str, head_gpu: int, workers: int, worker_mem_min: str, worker_mem_max: str, worker_cpu: int, worker_gpu: int, namespace: str, dashboard: str)
+(name: str, status: RayClusterStatus, head_cpus: int, head_mem: str, workers: int, worker_mem_min: str, worker_mem_max: str, worker_cpu: int, namespace: str, dashboard: str, worker_extended_resources: Dict[str, int] = <factory>, head_extended_resources: Dict[str, int] = <factory>)

    For storing information about a Ray cluster.

    @@ -310,14 +311,14 @@

    Class variables

    status: RayClusterStatus
    head_cpus: int
    head_mem: str
-    head_gpu: int
    workers: int
    worker_mem_min: str
    worker_mem_max: str
    worker_cpu: int
-    worker_gpu: int
    namespace: str
-    dashboard: str
+    dashboard: str
+    worker_extended_resources: typing.Dict[str, int] = field(default_factory=dict)
+    head_extended_resources: typing.Dict[str, int] = field(default_factory=dict)


    @@ -329,7 +330,7 @@


-var head_gpu : int
+var head_extended_resources : Dict[str, int]
    @@ -353,7 +354,7 @@


-var worker_gpu : int
+var worker_extended_resources : Dict[str, int]
    @@ -471,16 +472,16 @@

    RayCluster

  • dashboard
  • head_cpus
- • head_gpu
+ • head_extended_resources
  • head_mem
  • name
  • namespace
  • status
  • worker_cpu
- • worker_gpu
+ • worker_extended_resources
  • worker_mem_max
  • worker_mem_min
  • workers

diff --git a/docs/detailed-documentation/index.html b/docs/detailed-documentation/index.html
index 21bab3d1c..fd74344f1 100644
--- a/docs/detailed-documentation/index.html
+++ b/docs/detailed-documentation/index.html
@@ -46,7 +46,16 @@

        Package codeflare_sdk

from .job import RayJobClient
-from .utils import generate_cert
+from .utils import generate_cert
+from .utils.demos import copy_demo_nbs
+
+from importlib.metadata import version, PackageNotFoundError
+
+try:
+    __version__ = version("codeflare-sdk")  # use metadata associated with built package
+
+except PackageNotFoundError:
+    __version__ = "v0.0.0"
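With __version__ now set in the package __init__, the installed version is importable directly; a quick check (assuming a pip-installed build):

    import codeflare_sdk
    print(codeflare_sdk.__version__)  # package metadata version, or "v0.0.0" for a source checkout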
diff --git a/docs/detailed-documentation/utils/demos.html b/docs/detailed-documentation/utils/demos.html
new file mode 100644
index 000000000..e0dc5a8e7
--- /dev/null
+++ b/docs/detailed-documentation/utils/demos.html
@@ -0,0 +1,138 @@
+codeflare_sdk.utils.demos API documentation

    Module codeflare_sdk.utils.demos

+import pathlib
    +import shutil
    +
    +package_dir = pathlib.Path(__file__).parent.parent.resolve()
    +demo_dir = f"{package_dir}/demo-notebooks"
    +
    +
    +def copy_demo_nbs(dir: str = "./demo-notebooks", overwrite: bool = False):
    +    """
    +    Copy the demo notebooks from the package to the current working directory
    +
    +    overwrite=True will overwrite any files that exactly match files written by copy_demo_nbs in the target directory.
    +    Any files that exist in the directory that don't match these values will remain untouched.
    +
    +    Args:
+        dir (str): The directory to copy the demo notebooks to. Defaults to "./demo-notebooks".
    +        overwrite (bool): Whether to overwrite files in the directory if it already exists. Defaults to False.
    +    Raises:
+        FileExistsError: If the directory already exists and overwrite is False.
    +    """
    +    # does dir exist already?
    +    if overwrite is False and pathlib.Path(dir).exists():
    +        raise FileExistsError(
    +            f"Directory {dir} already exists. Please remove it or provide a different location."
    +        )
    +
    +    shutil.copytree(demo_dir, dir, dirs_exist_ok=True)

    Functions

+def copy_demo_nbs(dir: str = './demo-notebooks', overwrite: bool = False)

    Copy the demo notebooks from the package to the current working directory

    +

+overwrite=True will overwrite any files that exactly match files written by copy_demo_nbs in the target directory.
+Any files that exist in the directory that don't match these values will remain untouched.

    +

    Args

+dir : str
+    The directory to copy the demo notebooks to. Defaults to "./demo-notebooks".
+overwrite : bool
+    Whether to overwrite files in the directory if it already exists. Defaults to False.

    Raises

+FileExistsError
+    If the directory already exists and overwrite is False.
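A minimal usage sketch (assuming the package is installed; copy_demo_nbs is re-exported from the top-level package by the index.html change earlier in this diff):

    from codeflare_sdk import copy_demo_nbs

    copy_demo_nbs()                                       # copies into ./demo-notebooks, errors if it exists
    copy_demo_nbs(dir="./my-notebooks", overwrite=True)   # refreshes previously copied files in place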
diff --git a/docs/detailed-documentation/utils/generate_yaml.html b/docs/detailed-documentation/utils/generate_yaml.html
index 1e4217629..7b41e3c99 100644
--- a/docs/detailed-documentation/utils/generate_yaml.html
+++ b/docs/detailed-documentation/utils/generate_yaml.html
@@ -48,6 +48,7 @@

    Module codeflare_sdk.utils.generate_yaml

(in the cluster sub-module) for AppWrapper generation.
"""
+import json
from typing import Optional
import typing
import yaml
@@ -61,6 +62,8 @@


from os import urandom
from base64 import b64encode
from urllib3.util import parse_url
+from kubernetes.client.exceptions import ApiException
+import codeflare_sdk


def read_template(template):
@@ -108,16 +111,20 @@


    return False


-def update_names(cluster_yaml, cluster_name, namespace):
-    meta = cluster_yaml.get("metadata")
-    meta["name"] = cluster_name
-    meta["namespace"] = namespace
+def update_names(
+    cluster_yaml: dict,
+    cluster: "codeflare_sdk.cluster.Cluster",
+):
+    metadata = cluster_yaml.get("metadata")
+    metadata["name"] = cluster.config.name
+    metadata["namespace"] = cluster.config.namespace


def update_image(spec, image):
    containers = spec.get("containers")
-    for container in containers:
-        container["image"] = image
+    if image != "":
+        for container in containers:
+            container["image"] = image


def update_image_pull_secrets(spec, image_pull_secrets):
@@ -137,60 +144,118 @@
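This guard replaces the removed validate_image_config check shown earlier in this diff: an empty image string now means "keep whatever image the template ships with" rather than an error. A small sketch of the behavior (hypothetical pod spec fragment):

    spec = {"containers": [{"image": "template-default:latest"}]}
    update_image(spec, "")                        # no-op, template default kept
    update_image(spec, "quay.io/org/custom:tag")  # overrides every container image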


    container["env"] = env -def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu): +def update_resources( + spec, + worker_cpu_requests, + worker_cpu_limits, + worker_memory_requests, + worker_memory_limits, + custom_resources, +): container = spec.get("containers") for resource in container: requests = resource.get("resources").get("requests") if requests is not None: - requests["cpu"] = min_cpu - requests["memory"] = min_memory - requests["nvidia.com/gpu"] = gpu + requests["cpu"] = worker_cpu_requests + requests["memory"] = worker_memory_requests limits = resource.get("resources").get("limits") if limits is not None: - limits["cpu"] = max_cpu - limits["memory"] = max_memory - limits["nvidia.com/gpu"] = gpu + limits["cpu"] = worker_cpu_limits + limits["memory"] = worker_memory_limits + for k in custom_resources.keys(): + limits[k] = custom_resources[k] + requests[k] = custom_resources[k] + + +def head_worker_gpu_count_from_cluster( + cluster: "codeflare_sdk.cluster.Cluster", +) -> typing.Tuple[int, int]: + head_gpus = 0 + worker_gpus = 0 + for k in cluster.config.head_extended_resource_requests.keys(): + resource_type = cluster.config.extended_resource_mapping[k] + if resource_type == "GPU": + head_gpus += int(cluster.config.head_extended_resource_requests[k]) + for k in cluster.config.worker_extended_resource_requests.keys(): + resource_type = cluster.config.extended_resource_mapping[k] + if resource_type == "GPU": + worker_gpus += int(cluster.config.worker_extended_resource_requests[k]) + + return head_gpus, worker_gpus + + +FORBIDDEN_CUSTOM_RESOURCE_TYPES = ["GPU", "CPU", "memory"] + + +def head_worker_resources_from_cluster( + cluster: "codeflare_sdk.cluster.Cluster", +) -> typing.Tuple[dict, dict]: + to_return = {}, {} + for k in cluster.config.head_extended_resource_requests.keys(): + resource_type = cluster.config.extended_resource_mapping[k] + if resource_type in FORBIDDEN_CUSTOM_RESOURCE_TYPES: + continue + to_return[0][resource_type] = cluster.config.head_extended_resource_requests[ + k + ] + to_return[0].get(resource_type, 0) + + for k in cluster.config.worker_extended_resource_requests.keys(): + resource_type = cluster.config.extended_resource_mapping[k] + if resource_type in FORBIDDEN_CUSTOM_RESOURCE_TYPES: + continue + to_return[1][resource_type] = cluster.config.worker_extended_resource_requests[ + k + ] + to_return[1].get(resource_type, 0) + return to_return def update_nodes( - cluster_yaml, - appwrapper_name, - min_cpu, - max_cpu, - min_memory, - max_memory, - gpu, - workers, - image, - env, - image_pull_secrets, - head_cpus, - head_memory, - head_gpus, + ray_cluster_dict: dict, + cluster: "codeflare_sdk.cluster.Cluster", ): - head = cluster_yaml.get("spec").get("headGroupSpec") - head["rayStartParams"]["num-gpus"] = str(int(head_gpus)) + head = ray_cluster_dict.get("spec").get("headGroupSpec") + worker = ray_cluster_dict.get("spec").get("workerGroupSpecs")[0] + head_gpus, worker_gpus = head_worker_gpu_count_from_cluster(cluster) + head_resources, worker_resources = head_worker_resources_from_cluster(cluster) + head_resources = json.dumps(head_resources).replace('"', '\\"') + head_resources = f'"{head_resources}"' + worker_resources = json.dumps(worker_resources).replace('"', '\\"') + worker_resources = f'"{worker_resources}"' + head["rayStartParams"]["num-gpus"] = str(head_gpus) + head["rayStartParams"]["resources"] = head_resources - worker = cluster_yaml.get("spec").get("workerGroupSpecs")[0] # Head counts as first worker - worker["replicas"] = 
workers - worker["minReplicas"] = workers - worker["maxReplicas"] = workers - worker["groupName"] = "small-group-" + appwrapper_name - worker["rayStartParams"]["num-gpus"] = str(int(gpu)) + worker["replicas"] = cluster.config.num_workers + worker["minReplicas"] = cluster.config.num_workers + worker["maxReplicas"] = cluster.config.num_workers + worker["groupName"] = "small-group-" + cluster.config.name + worker["rayStartParams"]["num-gpus"] = str(worker_gpus) + worker["rayStartParams"]["resources"] = worker_resources for comp in [head, worker]: spec = comp.get("template").get("spec") - update_image_pull_secrets(spec, image_pull_secrets) - update_image(spec, image) - update_env(spec, env) + update_image_pull_secrets(spec, cluster.config.image_pull_secrets) + update_image(spec, cluster.config.image) + update_env(spec, cluster.config.envs) if comp == head: # TODO: Eventually add head node configuration outside of template update_resources( - spec, head_cpus, head_cpus, head_memory, head_memory, head_gpus + spec, + cluster.config.head_cpus, + cluster.config.head_cpus, + cluster.config.head_memory, + cluster.config.head_memory, + cluster.config.head_extended_resource_requests, ) else: - update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu) + update_resources( + spec, + cluster.config.worker_cpu_requests, + cluster.config.worker_cpu_limits, + cluster.config.worker_memory_requests, + cluster.config.worker_memory_limits, + cluster.config.worker_extended_resource_requests, + ) def del_from_list_by_name(l: list, target: typing.List[str]) -> list: @@ -208,8 +273,11 @@


            namespace=namespace,
            plural="localqueues",
        )
-    except Exception as e:  # pragma: no cover
-        return _kube_api_error_handling(e)
+    except ApiException as e:  # pragma: no cover
+        if e.status == 404 or e.status == 403:
+            return
+        else:
+            return _kube_api_error_handling(e)
    for lq in local_queues["items"]:
        if (
            "annotations" in lq["metadata"]
@@ -218,9 +286,6 @@


    == "true" ): return lq["metadata"]["name"] - raise ValueError( - "Default Local Queue with kueue.x-k8s.io/default-queue: true annotation not found please create a default Local Queue or provide the local_queue name in Cluster Configuration" - ) def local_queue_exists(namespace: str, local_queue_name: str): @@ -245,7 +310,9 @@


def add_queue_label(item: dict, namespace: str, local_queue: Optional[str]):
    lq_name = local_queue or get_default_kueue_name(namespace)
-    if not local_queue_exists(namespace, lq_name):
+    if lq_name == None:
+        return
+    elif not local_queue_exists(namespace, lq_name):
        raise ValueError(
            "local_queue provided does not exist or is not in this namespace. Please provide the correct local_queue name in Cluster Configuration"
        )
@@ -291,65 +358,32 @@

    Module codeflare_sdk.utils.generate_yaml

    print(f"Written to: {output_file_name}") -def generate_appwrapper( - name: str, - namespace: str, - head_cpus: int, - head_memory: int, - head_gpus: int, - min_cpu: int, - max_cpu: int, - min_memory: int, - max_memory: int, - gpu: int, - workers: int, - template: str, - image: str, - appwrapper: bool, - env, - image_pull_secrets: list, - write_to_file: bool, - local_queue: Optional[str], - labels, -): - cluster_yaml = read_template(template) - appwrapper_name, cluster_name = gen_names(name) - update_names(cluster_yaml, cluster_name, namespace) - update_nodes( +def generate_appwrapper(cluster: "codeflare_sdk.cluster.Cluster"): + cluster_yaml = read_template(cluster.config.template) + appwrapper_name, _ = gen_names(cluster.config.name) + update_names( cluster_yaml, - appwrapper_name, - min_cpu, - max_cpu, - min_memory, - max_memory, - gpu, - workers, - image, - env, - image_pull_secrets, - head_cpus, - head_memory, - head_gpus, + cluster, ) - augment_labels(cluster_yaml, labels) + update_nodes(cluster_yaml, cluster) + augment_labels(cluster_yaml, cluster.config.labels) notebook_annotations(cluster_yaml) - user_yaml = ( - wrap_cluster(cluster_yaml, appwrapper_name, namespace) - if appwrapper + wrap_cluster(cluster_yaml, appwrapper_name, cluster.config.namespace) + if cluster.config.appwrapper else cluster_yaml ) - add_queue_label(user_yaml, namespace, local_queue) + add_queue_label(user_yaml, cluster.config.namespace, cluster.config.local_queue) - if write_to_file: + if cluster.config.write_to_file: directory_path = os.path.expanduser("~/.codeflare/resources/") outfile = os.path.join(directory_path, appwrapper_name + ".yaml") write_user_yaml(user_yaml, outfile) return outfile else: user_yaml = yaml.dump(user_yaml) - print(f"Yaml resources loaded for {name}") + print(f"Yaml resources loaded for {cluster.config.name}") return user_yaml
    @@ -371,7 +405,9 @@

    Functions

    def add_queue_label(item: dict, namespace: str, local_queue: Optional[str]):
         lq_name = local_queue or get_default_kueue_name(namespace)
    -    if not local_queue_exists(namespace, lq_name):
    +    if lq_name == None:
    +        return
    +    elif not local_queue_exists(namespace, lq_name):
             raise ValueError(
                 "local_queue provided does not exist or is not in this namespace. Please provide the correct local_queue name in Cluster Configuration"
             )
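
The practical effect of this change: when no local_queue is given and the namespace has no default-annotated Kueue queue, labeling is now skipped instead of raising. A minimal self-contained sketch of the new control flow; resolve_queue_name and the queue names here are hypothetical stand-ins, not part of the SDK:

    from typing import Optional

    def resolve_queue_name(
        local_queue: Optional[str], default_queue: Optional[str]
    ) -> Optional[str]:
        # An explicit queue wins; otherwise fall back to the namespace default.
        # If neither exists, the caller now skips labeling rather than failing.
        return local_queue or default_queue

    assert resolve_queue_name(None, None) is None              # skip labeling
    assert resolve_queue_name(None, "team-queue") == "team-queue"
    assert resolve_queue_name("my-queue", "team-queue") == "my-queue"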
    @@ -428,7 +464,7 @@ 

    Functions

-def generate_appwrapper(name: str, namespace: str, head_cpus: int, head_memory: int, head_gpus: int, min_cpu: int, max_cpu: int, min_memory: int, max_memory: int, gpu: int, workers: int, template: str, image: str, appwrapper: bool, env, image_pull_secrets: list, write_to_file: bool, local_queue: Optional[str], labels)
+def generate_appwrapper(cluster: codeflare_sdk.cluster.Cluster)
    @@ -436,65 +472,32 @@

    Functions

Expand source code

-def generate_appwrapper(
    -    name: str,
    -    namespace: str,
    -    head_cpus: int,
    -    head_memory: int,
    -    head_gpus: int,
    -    min_cpu: int,
    -    max_cpu: int,
    -    min_memory: int,
    -    max_memory: int,
    -    gpu: int,
    -    workers: int,
    -    template: str,
    -    image: str,
    -    appwrapper: bool,
    -    env,
    -    image_pull_secrets: list,
    -    write_to_file: bool,
    -    local_queue: Optional[str],
    -    labels,
    -):
    -    cluster_yaml = read_template(template)
    -    appwrapper_name, cluster_name = gen_names(name)
    -    update_names(cluster_yaml, cluster_name, namespace)
    -    update_nodes(
+def generate_appwrapper(cluster: "codeflare_sdk.cluster.Cluster"):
    +    cluster_yaml = read_template(cluster.config.template)
    +    appwrapper_name, _ = gen_names(cluster.config.name)
    +    update_names(
             cluster_yaml,
    -        appwrapper_name,
    -        min_cpu,
    -        max_cpu,
    -        min_memory,
    -        max_memory,
    -        gpu,
    -        workers,
    -        image,
    -        env,
    -        image_pull_secrets,
    -        head_cpus,
    -        head_memory,
    -        head_gpus,
    +        cluster,
         )
    -    augment_labels(cluster_yaml, labels)
    +    update_nodes(cluster_yaml, cluster)
    +    augment_labels(cluster_yaml, cluster.config.labels)
         notebook_annotations(cluster_yaml)
    -
         user_yaml = (
    -        wrap_cluster(cluster_yaml, appwrapper_name, namespace)
    -        if appwrapper
    +        wrap_cluster(cluster_yaml, appwrapper_name, cluster.config.namespace)
    +        if cluster.config.appwrapper
             else cluster_yaml
         )
     
    -    add_queue_label(user_yaml, namespace, local_queue)
    +    add_queue_label(user_yaml, cluster.config.namespace, cluster.config.local_queue)
     
    -    if write_to_file:
    +    if cluster.config.write_to_file:
             directory_path = os.path.expanduser("~/.codeflare/resources/")
             outfile = os.path.join(directory_path, appwrapper_name + ".yaml")
             write_user_yaml(user_yaml, outfile)
             return outfile
         else:
             user_yaml = yaml.dump(user_yaml)
    -        print(f"Yaml resources loaded for {name}")
    +        print(f"Yaml resources loaded for {cluster.config.name}")
             return user_yaml
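
For orientation, the single-argument form reads everything it needs from cluster.config instead of nineteen positional parameters. A hedged usage sketch: the configuration values are hypothetical, the field names follow this diff, and an authenticated Kubernetes context is assumed since Cluster creation itself triggers AppWrapper generation:

    from codeflare_sdk.cluster.cluster import Cluster
    from codeflare_sdk.cluster.config import ClusterConfiguration
    from codeflare_sdk.utils.generate_yaml import generate_appwrapper

    # Hypothetical configuration values for illustration only.
    cluster = Cluster(ClusterConfiguration(
        name="demo",
        namespace="default",
        num_workers=2,
        write_to_file=False,  # return the YAML string rather than a file path
    ))
    yaml_str = generate_appwrapper(cluster)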
    @@ -518,8 +521,11 @@

    Functions

         namespace=namespace,
         plural="localqueues",
     )
-    except Exception as e:  # pragma: no cover
-        return _kube_api_error_handling(e)
+    except ApiException as e:  # pragma: no cover
+        if e.status == 404 or e.status == 403:
+            return
+        else:
+            return _kube_api_error_handling(e)
     for lq in local_queues["items"]:
         if (
             "annotations" in lq["metadata"]
@@ -527,10 +533,64 @@

    Functions

    and lq["metadata"]["annotations"]["kueue.x-k8s.io/default-queue"].lower() == "true" ): - return lq["metadata"]["name"] - raise ValueError( - "Default Local Queue with kueue.x-k8s.io/default-queue: true annotation not found please create a default Local Queue or provide the local_queue name in Cluster Configuration" - )
    + return lq["metadata"]["name"] + + +
+def head_worker_gpu_count_from_cluster(cluster: codeflare_sdk.cluster.Cluster) ‑> Tuple[int, int]
+
+
+
+
+Expand source code
+
+def head_worker_gpu_count_from_cluster(
    +    cluster: "codeflare_sdk.cluster.Cluster",
    +) -> typing.Tuple[int, int]:
    +    head_gpus = 0
    +    worker_gpus = 0
    +    for k in cluster.config.head_extended_resource_requests.keys():
    +        resource_type = cluster.config.extended_resource_mapping[k]
    +        if resource_type == "GPU":
    +            head_gpus += int(cluster.config.head_extended_resource_requests[k])
    +    for k in cluster.config.worker_extended_resource_requests.keys():
    +        resource_type = cluster.config.extended_resource_mapping[k]
    +        if resource_type == "GPU":
    +            worker_gpus += int(cluster.config.worker_extended_resource_requests[k])
    +
    +    return head_gpus, worker_gpus
    +
    +
    +
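
To make the counting rule concrete: only resources whose mapped type in extended_resource_mapping is "GPU" contribute to Ray's num-gpus, and multiple GPU-typed keys are summed. A minimal sketch with stand-in config objects; all class names and values below are hypothetical:

    class _Config:
        head_extended_resource_requests = {"nvidia.com/gpu": 1}
        worker_extended_resource_requests = {"nvidia.com/gpu": 2, "intel.com/gpu": 1}
        extended_resource_mapping = {"nvidia.com/gpu": "GPU", "intel.com/gpu": "GPU"}

    class _Cluster:
        config = _Config()

    head_gpus, worker_gpus = head_worker_gpu_count_from_cluster(_Cluster())
    assert (head_gpus, worker_gpus) == (1, 3)  # both GPU-typed keys are summed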
+def head_worker_resources_from_cluster(cluster: codeflare_sdk.cluster.Cluster) ‑> Tuple[dict, dict]
+
+
+
+
+Expand source code
+
+def head_worker_resources_from_cluster(
    +    cluster: "codeflare_sdk.cluster.Cluster",
    +) -> typing.Tuple[dict, dict]:
    +    to_return = {}, {}
    +    for k in cluster.config.head_extended_resource_requests.keys():
    +        resource_type = cluster.config.extended_resource_mapping[k]
    +        if resource_type in FORBIDDEN_CUSTOM_RESOURCE_TYPES:
    +            continue
    +        to_return[0][resource_type] = cluster.config.head_extended_resource_requests[
    +            k
    +        ] + to_return[0].get(resource_type, 0)
    +
    +    for k in cluster.config.worker_extended_resource_requests.keys():
    +        resource_type = cluster.config.extended_resource_mapping[k]
    +        if resource_type in FORBIDDEN_CUSTOM_RESOURCE_TYPES:
    +            continue
    +        to_return[1][resource_type] = cluster.config.worker_extended_resource_requests[
    +            k
    +        ] + to_return[1].get(resource_type, 0)
    +    return to_return
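
By contrast with the GPU counter above, this helper collects everything except the FORBIDDEN_CUSTOM_RESOURCE_TYPES (GPU, CPU, memory), which Ray already tracks natively, and keys the result by mapped type rather than by Kubernetes resource name. A worked sketch with hypothetical stand-in values:

    class _Config:
        head_extended_resource_requests = {"google.com/tpu": 4}
        worker_extended_resource_requests = {"google.com/tpu": 8, "nvidia.com/gpu": 1}
        extended_resource_mapping = {"google.com/tpu": "TPU", "nvidia.com/gpu": "GPU"}

    class _Cluster:
        config = _Config()

    head_res, worker_res = head_worker_resources_from_cluster(_Cluster())
    assert head_res == {"TPU": 4}    # keyed by mapped type, not resource name
    assert worker_res == {"TPU": 8}  # the GPU entry is filtered out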
    @@ -672,8 +732,9 @@

    Functions

    def update_image(spec, image):
         containers = spec.get("containers")
    -    for container in containers:
    -        container["image"] = image
    + if image != "": + for container in containers: + container["image"] = image
    @@ -693,7 +754,7 @@

    Functions

-def update_names(cluster_yaml, cluster_name, namespace)
+def update_names(cluster_yaml: dict, cluster: codeflare_sdk.cluster.Cluster)
    @@ -701,14 +762,17 @@

    Functions

Expand source code

-def update_names(cluster_yaml, cluster_name, namespace):
    -    meta = cluster_yaml.get("metadata")
    -    meta["name"] = cluster_name
    -    meta["namespace"] = namespace
+def update_names(
    +    cluster_yaml: dict,
    +    cluster: "codeflare_sdk.cluster.Cluster",
    +):
    +    metadata = cluster_yaml.get("metadata")
    +    metadata["name"] = cluster.config.name
    +    metadata["namespace"] = cluster.config.namespace
-def update_nodes(cluster_yaml, appwrapper_name, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, image, env, image_pull_secrets, head_cpus, head_memory, head_gpus)
+def update_nodes(ray_cluster_dict: dict, cluster: codeflare_sdk.cluster.Cluster)
    @@ -717,48 +781,56 @@

    Functions

    Expand source code
    def update_nodes(
    -    cluster_yaml,
    -    appwrapper_name,
    -    min_cpu,
    -    max_cpu,
    -    min_memory,
    -    max_memory,
    -    gpu,
    -    workers,
    -    image,
    -    env,
    -    image_pull_secrets,
    -    head_cpus,
    -    head_memory,
    -    head_gpus,
    +    ray_cluster_dict: dict,
    +    cluster: "codeflare_sdk.cluster.Cluster",
     ):
    -    head = cluster_yaml.get("spec").get("headGroupSpec")
    -    head["rayStartParams"]["num-gpus"] = str(int(head_gpus))
    +    head = ray_cluster_dict.get("spec").get("headGroupSpec")
    +    worker = ray_cluster_dict.get("spec").get("workerGroupSpecs")[0]
    +    head_gpus, worker_gpus = head_worker_gpu_count_from_cluster(cluster)
    +    head_resources, worker_resources = head_worker_resources_from_cluster(cluster)
    +    head_resources = json.dumps(head_resources).replace('"', '\\"')
    +    head_resources = f'"{head_resources}"'
    +    worker_resources = json.dumps(worker_resources).replace('"', '\\"')
    +    worker_resources = f'"{worker_resources}"'
    +    head["rayStartParams"]["num-gpus"] = str(head_gpus)
    +    head["rayStartParams"]["resources"] = head_resources
     
    -    worker = cluster_yaml.get("spec").get("workerGroupSpecs")[0]
         # Head counts as first worker
    -    worker["replicas"] = workers
    -    worker["minReplicas"] = workers
    -    worker["maxReplicas"] = workers
    -    worker["groupName"] = "small-group-" + appwrapper_name
    -    worker["rayStartParams"]["num-gpus"] = str(int(gpu))
    +    worker["replicas"] = cluster.config.num_workers
    +    worker["minReplicas"] = cluster.config.num_workers
    +    worker["maxReplicas"] = cluster.config.num_workers
    +    worker["groupName"] = "small-group-" + cluster.config.name
    +    worker["rayStartParams"]["num-gpus"] = str(worker_gpus)
    +    worker["rayStartParams"]["resources"] = worker_resources
     
         for comp in [head, worker]:
             spec = comp.get("template").get("spec")
    -        update_image_pull_secrets(spec, image_pull_secrets)
    -        update_image(spec, image)
    -        update_env(spec, env)
    +        update_image_pull_secrets(spec, cluster.config.image_pull_secrets)
    +        update_image(spec, cluster.config.image)
    +        update_env(spec, cluster.config.envs)
             if comp == head:
                 # TODO: Eventually add head node configuration outside of template
                 update_resources(
    -                spec, head_cpus, head_cpus, head_memory, head_memory, head_gpus
    +                spec,
    +                cluster.config.head_cpus,
    +                cluster.config.head_cpus,
    +                cluster.config.head_memory,
    +                cluster.config.head_memory,
    +                cluster.config.head_extended_resource_requests,
                 )
             else:
    -            update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)
+            update_resources(
+                spec,
+                cluster.config.worker_cpu_requests,
+                cluster.config.worker_cpu_limits,
+                cluster.config.worker_memory_requests,
+                cluster.config.worker_memory_limits,
+                cluster.config.worker_extended_resource_requests,
+            )
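
The double layer of quoting in head_resources/worker_resources above exists because Ray expects --resources as a JSON string, and the value passes through another level of quoting when the operator renders rayStartParams. A small sketch of the encoding; the resource dict is a hypothetical example:

    import json

    worker_resources = {"TPU": 8}
    encoded = json.dumps(worker_resources).replace('"', '\\"')
    encoded = f'"{encoded}"'
    print(encoded)  # "{\"TPU\": 8}" -- inner quotes escaped, whole value quoted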
-def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)
+def update_resources(spec, worker_cpu_requests, worker_cpu_limits, worker_memory_requests, worker_memory_limits, custom_resources)
    @@ -766,19 +838,27 @@

    Functions

Expand source code

-def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
+def update_resources(
    +    spec,
    +    worker_cpu_requests,
    +    worker_cpu_limits,
    +    worker_memory_requests,
    +    worker_memory_limits,
    +    custom_resources,
    +):
         container = spec.get("containers")
         for resource in container:
             requests = resource.get("resources").get("requests")
             if requests is not None:
    -            requests["cpu"] = min_cpu
    -            requests["memory"] = min_memory
    -            requests["nvidia.com/gpu"] = gpu
    +            requests["cpu"] = worker_cpu_requests
    +            requests["memory"] = worker_memory_requests
             limits = resource.get("resources").get("limits")
             if limits is not None:
    -            limits["cpu"] = max_cpu
    -            limits["memory"] = max_memory
    -            limits["nvidia.com/gpu"] = gpu
    + limits["cpu"] = worker_cpu_limits + limits["memory"] = worker_memory_limits + for k in custom_resources.keys(): + limits[k] = custom_resources[k] + requests[k] = custom_resources[k]
    @@ -844,6 +924,8 @@

    Index

  • gen_names
  • generate_appwrapper
  • get_default_kueue_name
  • +
  • head_worker_gpu_count_from_cluster
  • +
  • head_worker_resources_from_cluster
  • is_kind_cluster
  • is_openshift_cluster
  • local_queue_exists
diff --git a/docs/detailed-documentation/utils/index.html b/docs/detailed-documentation/utils/index.html
index 1eb081d2b..4a65cc393 100644
--- a/docs/detailed-documentation/utils/index.html
+++ b/docs/detailed-documentation/utils/index.html
@@ -26,6 +26,10 @@

    Module codeflare_sdk.utils

    Sub-modules

+ codeflare_sdk.utils.demos
+
+
+
    codeflare_sdk.utils.generate_cert
    @@ -67,6 +71,7 @@

    Index

  • Sub-modules

  • + codeflare_sdk.utils.demos
    • codeflare_sdk.utils.generate_cert
    • codeflare_sdk.utils.generate_yaml
    • codeflare_sdk.utils.kube_api_helpers
diff --git a/docs/detailed-documentation/utils/pretty_print.html b/docs/detailed-documentation/utils/pretty_print.html
index 2e7a69b81..cbffd1223 100644
--- a/docs/detailed-documentation/utils/pretty_print.html
+++ b/docs/detailed-documentation/utils/pretty_print.html
@@ -169,7 +169,7 @@

      Module codeflare_sdk.utils.pretty_print

     workers = str(cluster.workers)
     memory = f"{cluster.worker_mem_min}~{cluster.worker_mem_max}"
     cpu = str(cluster.worker_cpu)
-    gpu = str(cluster.worker_gpu)
+    gpu = str(cluster.worker_extended_resources.get("nvidia.com/gpu", 0))

     #'table0' to display the cluster name, status, url, and dashboard link
     table0 = Table(box=None, show_header=False)
@@ -347,7 +347,7 @@

      Functions

     workers = str(cluster.workers)
     memory = f"{cluster.worker_mem_min}~{cluster.worker_mem_max}"
     cpu = str(cluster.worker_cpu)
-    gpu = str(cluster.worker_gpu)
+    gpu = str(cluster.worker_extended_resources.get("nvidia.com/gpu", 0))

     #'table0' to display the cluster name, status, url, and dashboard link
     table0 = Table(box=None, show_header=False)
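
The display path follows the same model as the generators: the GPU column is now read out of the worker_extended_resources dict, defaulting to 0 when no NVIDIA GPUs were requested. A one-line sketch; the dict value is hypothetical:

    worker_extended_resources = {"nvidia.com/gpu": 2}
    gpu = str(worker_extended_resources.get("nvidia.com/gpu", 0))  # "2"; "0" if absent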