Skip to content

Adding AppWrapper from File System Management #120

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion demo-notebooks/guided-demos/3_basic_interactive.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@
"# establish connection to ray cluster\n",
"\n",
"#install additional libraries that will be required for model training\n",
"runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\"]}\n",
"runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\", \"accelerate\"]}\n",
"\n",
"ray.init(address=f'{ray_cluster_uri}', runtime_env=runtime_env)\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@
"# establish connection to ray cluster\n",
"\n",
"#install additional libraries that will be required for model training\n",
"runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\"]}\n",
"runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\", \"accelerate\"]}\n",
"\n",
"ray.init(address=f'{ray_cluster_uri}', runtime_env=runtime_env)\n",
"\n",
Expand Down
106 changes: 106 additions & 0 deletions src/codeflare_sdk/cluster/awload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Copyright 2022 IBM, Red Hat
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
The awload sub-module contains the definition of the AWManager object, which handles
submission and deletion of existing AppWrappers from a user's file system.
"""

from os.path import isfile
import errno
import os
import openshift as oc
import yaml


class AWManager:
    """
    An object for submitting and removing existing AppWrapper yamls
    to be added to the MCAD queue.

    Instance attributes (all set by ``__init__``):

    * ``filename``  -- path of the AppWrapper yaml handed to the constructor
    * ``name``      -- ``metadata.name`` read from that yaml
    * ``namespace`` -- ``metadata.namespace`` read from that yaml
    * ``submitted`` -- True only between a successful ``submit()`` and the
      following ``remove()`` performed through this same manager
    """

    def __init__(self, filename: str) -> None:
        """
        Create the AppWrapper Manager object by passing in an
        AppWrapper yaml file

        Raises:
            FileNotFoundError: ``filename`` is not an existing regular file.
            ValueError: the file cannot be read/parsed, lacks
                ``metadata.name`` / ``metadata.namespace``, or its ``kind``
                is not ``"AppWrapper"``.
        """
        if not isfile(filename):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filename)
        self.filename = filename

        format_error = f"{filename} is not a correctly formatted AppWrapper yaml"
        try:
            with open(self.filename) as f:
                # NOTE(review): FullLoader can still construct some non-plain
                # objects; consider yaml.safe_load if these files may come
                # from an untrusted source.
                awyaml = yaml.load(f, Loader=yaml.FullLoader)
            kind = awyaml["kind"]
            name = awyaml["metadata"]["name"]
            namespace = awyaml["metadata"]["namespace"]
        except (
            OSError,
            UnicodeDecodeError,
            yaml.YAMLError,
            LookupError,
            TypeError,
        ) as err:
            # Unreadable file, invalid yaml, missing keys, or a document that
            # is not a mapping all mean the same thing to the caller.
            raise ValueError(format_error) from err

        # Explicit check instead of `assert`, which is stripped under -O.
        if kind != "AppWrapper":
            raise ValueError(format_error)

        self.name = name
        self.namespace = namespace
        self.submitted = False

    def submit(self) -> None:
        """
        Attempts to create the AppWrapper custom resource using the yaml file

        Raises:
            PermissionError: the error output mentions "Unauthorized" or
                "Forbidden".
            FileExistsError: the error output mentions "AlreadyExists".
            oc.OpenShiftPythonException: any other failure, re-raised as is.
        """
        try:
            with oc.project(self.namespace):
                oc.invoke("create", ["-f", self.filename])
        except oc.OpenShiftPythonException as osp:  # pragma: no cover
            error_msg = osp.result.err()
            if "Unauthorized" in error_msg or "Forbidden" in error_msg:
                raise PermissionError(
                    "Action not permitted, have you put in correct/up-to-date auth credentials?"
                ) from osp
            if "AlreadyExists" in error_msg:
                raise FileExistsError(
                    f"An AppWrapper of the name {self.name} already exists in namespace {self.namespace}"
                ) from osp
            raise

        self.submitted = True
        print(f"AppWrapper {self.filename} submitted!")

    def remove(self) -> None:
        """
        Attempts to delete the AppWrapper custom resource matching the name in the yaml,
        if submitted by this manager.

        Raises:
            PermissionError: the error output indicates an auth/config
                problem (see the substrings checked below).
            oc.OpenShiftPythonException: any other failure, re-raised as is.
        """
        if not self.submitted:
            print("AppWrapper not submitted by this manager yet, nothing to remove")
            return

        try:
            with oc.project(self.namespace):
                oc.invoke("delete", ["AppWrapper", self.name])
        except oc.OpenShiftPythonException as osp:  # pragma: no cover
            error_msg = osp.result.err()
            if (
                'the server doesn\'t have a resource type "AppWrapper"' in error_msg
                or "forbidden" in error_msg
                or "Unauthorized" in error_msg
                or "Missing or incomplete configuration" in error_msg
            ):
                raise PermissionError(
                    "Action not permitted, have you put in correct/up-to-date auth credentials?"
                ) from osp
            if "not found" in error_msg:
                # Already gone: treat as removed so the manager can be reused.
                self.submitted = False
                print("AppWrapper not found, was deleted in another manner")
                return
            raise

        self.submitted = False
        print(f"AppWrapper {self.name} removed!")
2 changes: 1 addition & 1 deletion src/codeflare_sdk/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def down(self):
or "Missing or incomplete configuration" in error_msg
):
raise PermissionError(
"Action not permitted, have you run cluster.up() yet?"
"Action not permitted, have you run auth.login()/cluster.up() yet?"
)
elif "not found" in error_msg:
print("Cluster not found, have you run cluster.up() yet?")
Expand Down
175 changes: 175 additions & 0 deletions tests/test-case-bad.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
apiVersion: mcad.ibm.com/v1beta1
kind: AppsWrapper
metadata:
labels:
orderedinstance: cpu.small_gpu.large
nam: unit-test-cluster
namspace: ns
spec:
priority: 9
resources:
GenericItems:
- custompodresources:
- limits:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
replicas: 1
requests:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
- limits:
cpu: 4
memory: 6G
nvidia.com/gpu: 7
replicas: 2
requests:
cpu: 3
memory: 5G
nvidia.com/gpu: 7
generictemplate:
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
labels:
appwrapper.mcad.ibm.com: unit-test-cluster
controller-tools.k8s.io: '1.0'
name: unit-test-cluster
namespace: ns
spec:
autoscalerOptions:
idleTimeoutSeconds: 60
imagePullPolicy: Always
resources:
limits:
cpu: 500m
memory: 512Mi
requests:
cpu: 500m
memory: 512Mi
upscalingMode: Default
enableInTreeAutoscaling: false
headGroupSpec:
rayStartParams:
block: 'true'
dashboard-host: 0.0.0.0
num-gpus: '0'
serviceType: ClusterIP
template:
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: unit-test-cluster
operator: In
values:
- unit-test-cluster
containers:
- env:
- name: MY_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
imagePullPolicy: Always
lifecycle:
preStop:
exec:
command:
- /bin/sh
- -c
- ray stop
name: ray-head
ports:
- containerPort: 6379
name: gcs
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
resources:
limits:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
requests:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
rayVersion: 1.12.0
workerGroupSpecs:
- groupName: small-group-unit-test-cluster
maxReplicas: 2
minReplicas: 2
rayStartParams:
block: 'true'
num-gpus: '7'
replicas: 2
template:
metadata:
annotations:
key: value
labels:
key: value
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: unit-test-cluster
operator: In
values:
- unit-test-cluster
containers:
- env:
- name: MY_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
image: ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103
lifecycle:
preStop:
exec:
command:
- /bin/sh
- -c
- ray stop
name: machine-learning
resources:
limits:
cpu: 4
memory: 6G
nvidia.com/gpu: 7
requests:
cpu: 3
memory: 5G
nvidia.com/gpu: 7
initContainers:
- command:
- sh
- -c
- until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local;
do echo waiting for myservice; sleep 2; done
image: busybox:1.28
name: init-myservice
replicas: 1
- generictemplate:
apiVersion: route.openshift.io/v1
kind: Route
metadata:
labels:
odh-ray-cluster-service: unit-test-cluster-head-svc
name: ray-dashboard-unit-test-cluster
namespace: ns
spec:
port:
targetPort: dashboard
to:
kind: Service
name: unit-test-cluster-head-svc
replica: 1
Items: []
50 changes: 49 additions & 1 deletion tests/unit_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
parent = Path(__file__).resolve().parents[1]
sys.path.append(str(parent) + "/src")

from codeflare_sdk.cluster.awload import AWManager
from codeflare_sdk.cluster.cluster import (
Cluster,
ClusterConfiguration,
Expand Down Expand Up @@ -1932,7 +1933,54 @@ def parse_j(cmd):
return f"{max_worker}x{gpu}"


# Make sure to keep this function and the efollowing function at the end of the file
def test_AWManager_creation():
    """Check AWManager construction for a valid, a missing and a malformed yaml.

    "test.yaml" is presumably produced earlier in this test module -- confirm
    before running this test in isolation.
    """
    testaw = AWManager("test.yaml")
    assert testaw.name == "test"
    assert testaw.namespace == "ns"
    assert testaw.submitted is False

    # A path that does not exist must raise; the `else` branch makes the test
    # fail instead of passing silently when nothing is raised.
    try:
        AWManager("fake")
    except FileNotFoundError as e:
        assert type(e) is FileNotFoundError
        assert str(e) == "[Errno 2] No such file or directory: 'fake'"
    else:
        raise AssertionError("AWManager('fake') did not raise FileNotFoundError")

    # A yaml that is not a valid AppWrapper must be rejected with ValueError.
    try:
        AWManager("tests/test-case-bad.yaml")
    except ValueError as e:
        assert type(e) is ValueError
        assert (
            str(e)
            == "tests/test-case-bad.yaml is not a correctly formatted AppWrapper yaml"
        )
    else:
        raise AssertionError(
            "AWManager('tests/test-case-bad.yaml') did not raise ValueError"
        )


def arg_check_aw_create_effect(*args):
    """Mock side effect: verify the positional args of the "create" invocation."""
    expected = ("create", ["-f", "test.yaml"])
    # Compare position by position, in order, so the first mismatch fails first.
    for position, wanted in enumerate(expected):
        assert args[position] == wanted


def arg_check_aw_delete_effect(*args):
    """Mock side effect: verify the positional args of the "delete" invocation."""
    verb_ok = args[0] == "delete"
    assert verb_ok
    target_ok = args[1] == ["AppWrapper", "test"]
    assert target_ok


def test_AWManager_submit_remove(mocker, capsys):
    """Exercise remove-before-submit, then submit and remove with oc.invoke mocked."""
    testaw = AWManager("test.yaml")

    # Removing before any submit must be a no-op that only prints a notice.
    testaw.remove()
    captured = capsys.readouterr()
    assert (
        captured.out
        == "AppWrapper not submitted by this manager yet, nothing to remove\n"
    )
    assert testaw.submitted is False

    # The side effects only check arguments when invoked, so also assert that
    # the mocked call really happened.
    create_mock = mocker.patch(
        "openshift.invoke", side_effect=arg_check_aw_create_effect
    )
    testaw.submit()
    create_mock.assert_called_once()
    assert testaw.submitted is True

    delete_mock = mocker.patch(
        "openshift.invoke", side_effect=arg_check_aw_delete_effect
    )
    testaw.remove()
    delete_mock.assert_called_once()
    assert testaw.submitted is False


# Make sure to keep this function and the following function at the end of the file
def test_cmd_line_generation():
os.system(
f"python3 {parent}/src/codeflare_sdk/utils/generate_yaml.py --name=unit-cmd-cluster --min-cpu=1 --max-cpu=1 --min-memory=2 --max-memory=2 --gpu=1 --workers=2 --template=src/codeflare_sdk/templates/new-template.yaml"
Expand Down