diff --git a/demo-notebooks/guided-demos/3_basic_interactive.ipynb b/demo-notebooks/guided-demos/3_basic_interactive.ipynb index 4d5031984..4a1b0f6f7 100644 --- a/demo-notebooks/guided-demos/3_basic_interactive.ipynb +++ b/demo-notebooks/guided-demos/3_basic_interactive.ipynb @@ -139,7 +139,7 @@ "# establish connection to ray cluster\n", "\n", "#install additionall libraries that will be required for model training\n", - "runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\"]}\n", + "runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\", \"accelerate\"]}\n", "\n", "ray.init(address=f'{ray_cluster_uri}', runtime_env=runtime_env)\n", "\n", diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/3_basic_interactive.ipynb b/demo-notebooks/guided-demos/notebook-ex-outputs/3_basic_interactive.ipynb index 896a63743..df4c3a944 100644 --- a/demo-notebooks/guided-demos/notebook-ex-outputs/3_basic_interactive.ipynb +++ b/demo-notebooks/guided-demos/notebook-ex-outputs/3_basic_interactive.ipynb @@ -231,7 +231,7 @@ "# establish connection to ray cluster\n", "\n", "#install additionall libraries that will be required for model training\n", - "runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\"]}\n", + "runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\", \"accelerate\"]}\n", "\n", "ray.init(address=f'{ray_cluster_uri}', runtime_env=runtime_env)\n", "\n", diff --git a/src/codeflare_sdk/cluster/awload.py b/src/codeflare_sdk/cluster/awload.py new file mode 100644 index 000000000..5621d6734 --- /dev/null +++ b/src/codeflare_sdk/cluster/awload.py @@ -0,0 +1,106 @@ +# Copyright 2022 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
class AWManager:
    """
    An object for submitting and removing existing AppWrapper yamls
    to be added to the MCAD queue.

    Attributes set by the constructor:
        filename:  path of the AppWrapper yaml handed to the manager.
        name:      value of ``metadata.name`` read from the yaml.
        namespace: value of ``metadata.namespace`` read from the yaml.
        submitted: True only after this manager's ``submit()`` succeeded
                   and until ``remove()`` clears it.
    """

    def __init__(self, filename: str) -> None:
        """
        Create the AppWrapper Manager object by passing in an
        AppWrapper yaml file.

        Raises:
            FileNotFoundError: ``filename`` is not an existing regular file.
            ValueError: the file cannot be read/parsed, its ``kind`` is not
                ``AppWrapper``, or ``metadata.name`` / ``metadata.namespace``
                are missing. The underlying error is chained as ``__cause__``.
        """
        if not isfile(filename):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filename)
        self.filename = filename
        try:
            with open(self.filename) as f:
                # NOTE(review): FullLoader is kept as in the original; if the
                # yaml can come from an untrusted source, consider
                # yaml.safe_load instead.
                awyaml = yaml.load(f, Loader=yaml.FullLoader)
            # Explicit check instead of `assert`, which is stripped under -O.
            if awyaml["kind"] != "AppWrapper":
                raise ValueError("kind is not AppWrapper")
            self.name = awyaml["metadata"]["name"]
            self.namespace = awyaml["metadata"]["namespace"]
        except (OSError, yaml.YAMLError, KeyError, TypeError, ValueError) as err:
            # KeyError: a required key is absent; TypeError: the document (or
            # its metadata) is not a mapping; ValueError also covers decoding
            # errors and the kind check above.
            raise ValueError(
                f"{filename} is not a correctly formatted AppWrapper yaml"
            ) from err
        self.submitted = False

    def submit(self) -> None:
        """
        Attempts to create the AppWrapper custom resource using the yaml file.

        Runs ``create -f <filename>`` inside the AppWrapper's namespace and,
        on success, marks this manager as having submitted it.

        Raises:
            PermissionError: the error output mentions Unauthorized/Forbidden.
            FileExistsError: the error output mentions AlreadyExists.
            oc.OpenShiftPythonException: any other failure, re-raised as is.
        """
        try:
            with oc.project(self.namespace):
                oc.invoke("create", ["-f", self.filename])
        except oc.OpenShiftPythonException as osp:  # pragma: no cover
            error_msg = osp.result.err()
            if "Unauthorized" in error_msg or "Forbidden" in error_msg:
                raise PermissionError(
                    "Action not permitted, have you put in correct/up-to-date auth credentials?"
                ) from osp
            if "AlreadyExists" in error_msg:
                raise FileExistsError(
                    f"An AppWrapper of the name {self.name} already exists in namespace {self.namespace}"
                ) from osp
            raise osp

        self.submitted = True
        print(f"AppWrapper {self.filename} submitted!")

    def remove(self) -> None:
        """
        Attempts to delete the AppWrapper custom resource matching the name in the yaml,
        if submitted by this manager.

        Does nothing (beyond printing a notice) when this manager has not
        submitted the AppWrapper. A "not found" reply is treated as already
        deleted: the submitted flag is cleared and the method returns.

        Raises:
            PermissionError: the error output indicates missing/invalid
                credentials or configuration.
            oc.OpenShiftPythonException: any other failure, re-raised as is.
        """
        if not self.submitted:
            print("AppWrapper not submitted by this manager yet, nothing to remove")
            return

        try:
            with oc.project(self.namespace):
                oc.invoke("delete", ["AppWrapper", self.name])
        except oc.OpenShiftPythonException as osp:  # pragma: no cover
            error_msg = osp.result.err()
            # Substrings of the error output that are reported to the user
            # as a credentials problem.
            auth_markers = (
                'the server doesn\'t have a resource type "AppWrapper"',
                "forbidden",
                "Unauthorized",
                "Missing or incomplete configuration",
            )
            if any(marker in error_msg for marker in auth_markers):
                raise PermissionError(
                    "Action not permitted, have you put in correct/up-to-date auth credentials?"
                ) from osp
            if "not found" in error_msg:
                self.submitted = False
                print("AppWrapper not found, was deleted in another manner")
                return
            raise osp

        self.submitted = False
        print(f"AppWrapper {self.name} removed!")
) elif "not found" in error_msg: print("Cluster not found, have you run cluster.up() yet?") diff --git a/tests/test-case-bad.yaml b/tests/test-case-bad.yaml new file mode 100644 index 000000000..1d273ca60 --- /dev/null +++ b/tests/test-case-bad.yaml @@ -0,0 +1,175 @@ +apiVersion: mcad.ibm.com/v1beta1 +kind: AppsWrapper +metadata: + labels: + orderedinstance: cpu.small_gpu.large + nam: unit-test-cluster + namspace: ns +spec: + priority: 9 + resources: + GenericItems: + - custompodresources: + - limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + replicas: 1 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + - limits: + cpu: 4 + memory: 6G + nvidia.com/gpu: 7 + replicas: 2 + requests: + cpu: 3 + memory: 5G + nvidia.com/gpu: 7 + generictemplate: + apiVersion: ray.io/v1alpha1 + kind: RayCluster + metadata: + labels: + appwrapper.mcad.ibm.com: unit-test-cluster + controller-tools.k8s.io: '1.0' + name: unit-test-cluster + namespace: ns + spec: + autoscalerOptions: + idleTimeoutSeconds: 60 + imagePullPolicy: Always + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + upscalingMode: Default + enableInTreeAutoscaling: false + headGroupSpec: + rayStartParams: + block: 'true' + dashboard-host: 0.0.0.0 + num-gpus: '0' + serviceType: ClusterIP + template: + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: unit-test-cluster + operator: In + values: + - unit-test-cluster + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103 + imagePullPolicy: Always + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: ray-head + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + resources: + limits: + cpu: 2 + memory: 8G + 
nvidia.com/gpu: 0 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + rayVersion: 1.12.0 + workerGroupSpecs: + - groupName: small-group-unit-test-cluster + maxReplicas: 2 + minReplicas: 2 + rayStartParams: + block: 'true' + num-gpus: '7' + replicas: 2 + template: + metadata: + annotations: + key: value + labels: + key: value + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: unit-test-cluster + operator: In + values: + - unit-test-cluster + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103 + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: machine-learning + resources: + limits: + cpu: 4 + memory: 6G + nvidia.com/gpu: 7 + requests: + cpu: 3 + memory: 5G + nvidia.com/gpu: 7 + initContainers: + - command: + - sh + - -c + - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; + do echo waiting for myservice; sleep 2; done + image: busybox:1.28 + name: init-myservice + replicas: 1 + - generictemplate: + apiVersion: route.openshift.io/v1 + kind: Route + metadata: + labels: + odh-ray-cluster-service: unit-test-cluster-head-svc + name: ray-dashboard-unit-test-cluster + namespace: ns + spec: + port: + targetPort: dashboard + to: + kind: Service + name: unit-test-cluster-head-svc + replica: 1 + Items: [] diff --git a/tests/unit_test.py b/tests/unit_test.py index 1d858e3f4..f1255dc45 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -21,6 +21,7 @@ parent = Path(__file__).resolve().parents[1] sys.path.append(str(parent) + "/src") +from codeflare_sdk.cluster.awload import AWManager from codeflare_sdk.cluster.cluster import ( Cluster, ClusterConfiguration, @@ -1932,7 +1933,54 @@ def parse_j(cmd): return f"{max_worker}x{gpu}" -# Make sure to keep this function 
def test_AWManager_creation():
    """
    Check AWManager construction: a valid yaml populates name/namespace,
    a missing file raises FileNotFoundError, and a malformed AppWrapper
    yaml raises ValueError with the expected message.
    """
    testaw = AWManager("test.yaml")
    assert testaw.name == "test"
    assert testaw.namespace == "ns"
    assert testaw.submitted is False

    # The `else` branches make the test fail when no exception is raised;
    # without them a constructor that wrongly succeeds would pass silently.
    try:
        AWManager("fake")
    except FileNotFoundError as e:
        assert str(e) == "[Errno 2] No such file or directory: 'fake'"
    else:
        raise AssertionError("AWManager('fake') did not raise FileNotFoundError")

    try:
        AWManager("tests/test-case-bad.yaml")
    except ValueError as e:
        assert (
            str(e)
            == "tests/test-case-bad.yaml is not a correctly formatted AppWrapper yaml"
        )
    else:
        raise AssertionError(
            "AWManager('tests/test-case-bad.yaml') did not raise ValueError"
        )


def arg_check_aw_create_effect(*args):
    """Side effect for the patched invoke: verify the create arguments."""
    assert args[0] == "create"
    assert args[1] == ["-f", "test.yaml"]


def arg_check_aw_delete_effect(*args):
    """Side effect for the patched invoke: verify the delete arguments."""
    assert args[0] == "delete"
    assert args[1] == ["AppWrapper", "test"]


def test_AWManager_submit_remove(mocker, capsys):
    """
    Walk an AWManager through remove-before-submit (a no-op with a notice),
    submit, and remove, checking the `submitted` flag after each step.
    """
    testaw = AWManager("test.yaml")

    # Removing before submitting only prints a notice.
    testaw.remove()
    captured = capsys.readouterr()
    assert (
        captured.out
        == "AppWrapper not submitted by this manager yet, nothing to remove\n"
    )
    assert testaw.submitted is False

    mocker.patch("openshift.invoke", side_effect=arg_check_aw_create_effect)
    testaw.submit()
    assert testaw.submitted is True

    mocker.patch("openshift.invoke", side_effect=arg_check_aw_delete_effect)
    testaw.remove()
    assert testaw.submitted is False