From db38aa9751a6609ffc13612a67c92503df877e15 Mon Sep 17 00:00:00 2001 From: Karel Suta Date: Mon, 30 Oct 2023 15:53:44 +0100 Subject: [PATCH] Setup existing DW tests to run in CFO --- test/odh/environment.go | 49 +++++++ test/odh/mcad_ray_test.go | 98 ++++++++++++++ test/odh/notebook.go | 99 ++++++++++++++ test/odh/pytorch_mcad_test.go | 77 +++++++++++ test/odh/resources/custom-nb-small.yaml | 165 +++++++++++++++++++++++ test/odh/resources/mnist.py | 160 ++++++++++++++++++++++ test/odh/resources/mnist_mcad_mini.ipynb | 93 +++++++++++++ test/odh/resources/mnist_ray_mini.ipynb | 145 ++++++++++++++++++++ test/odh/resources/requirements.txt | 4 + test/odh/support.go | 34 +++++ 10 files changed, 924 insertions(+) create mode 100644 test/odh/environment.go create mode 100644 test/odh/mcad_ray_test.go create mode 100644 test/odh/notebook.go create mode 100644 test/odh/pytorch_mcad_test.go create mode 100644 test/odh/resources/custom-nb-small.yaml create mode 100644 test/odh/resources/mnist.py create mode 100644 test/odh/resources/mnist_mcad_mini.ipynb create mode 100644 test/odh/resources/mnist_ray_mini.ipynb create mode 100644 test/odh/resources/requirements.txt create mode 100644 test/odh/support.go diff --git a/test/odh/environment.go b/test/odh/environment.go new file mode 100644 index 000000000..0a087c9d5 --- /dev/null +++ b/test/odh/environment.go @@ -0,0 +1,49 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package odh + +import ( + "os" + + . "github.com/project-codeflare/codeflare-common/support" +) + +const ( + // The environment variable for namespace where ODH is installed to. + odhNamespaceEnvVar = "ODH_NAMESPACE" + // The environment variable for ODH Notebook ImageStream name + notebookImageStreamName = "NOTEBOOK_IMAGE_STREAM_NAME" +) + +func GetOpenDataHubNamespace() string { + return lookupEnvOrDefault(odhNamespaceEnvVar, "opendatahub") +} + +func GetNotebookImageStreamName(t Test) string { + isName, ok := os.LookupEnv(notebookImageStreamName) + if !ok { + t.T().Fatalf("Expected environment variable %s not found, please use this environment variable to specify what ImageStream to use for Notebook.", notebookImageStreamName) + } + return isName +} + +func lookupEnvOrDefault(key, value string) string { + if v, ok := os.LookupEnv(key); ok { + return v + } + return value +} diff --git a/test/odh/mcad_ray_test.go b/test/odh/mcad_ray_test.go new file mode 100644 index 000000000..770b64d9d --- /dev/null +++ b/test/odh/mcad_ray_test.go @@ -0,0 +1,98 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package odh + +import ( + "testing" + + . "github.com/onsi/gomega" + . "github.com/project-codeflare/codeflare-common/support" + mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" + rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" + + rbacv1 "k8s.io/api/rbac/v1" +) + +func TestMCADRay(t *testing.T) { + test := With(t) + + // Create a namespace + namespace := test.NewTestNamespace() + + // Test configuration + jupyterNotebookConfigMapFileName := "mnist_ray_mini.ipynb" + config := CreateConfigMap(test, namespace.Name, map[string][]byte{ + // MNIST Ray Notebook + jupyterNotebookConfigMapFileName: ReadFile(test, "resources/mnist_ray_mini.ipynb"), + "mnist.py": ReadFile(test, "resources/mnist.py"), + "requirements.txt": ReadFile(test, "resources/requirements.txt"), + }) + + // Create RBAC, retrieve token for user with limited rights + policyRules := []rbacv1.PolicyRule{ + { + Verbs: []string{"get", "create", "delete", "list", "patch", "update"}, + APIGroups: []string{mcadv1beta1.GroupName}, + Resources: []string{"appwrappers"}, + }, + { + Verbs: []string{"get", "list"}, + APIGroups: []string{rayv1alpha1.GroupVersion.Group}, + Resources: []string{"rayclusters", "rayclusters/status"}, + }, + { + Verbs: []string{"get", "list"}, + APIGroups: []string{"route.openshift.io"}, + Resources: []string{"routes"}, + }, + } + + // Create cluster wide RBAC, required for SDK OpenShift check + // TODO reevaluate once SDK change OpenShift detection logic + clusterPolicyRules := []rbacv1.PolicyRule{ + { + Verbs: []string{"get", "list"}, + APIGroups: []string{"config.openshift.io"}, + Resources: []string{"ingresses"}, + ResourceNames: []string{"cluster"}, + }, + } + + sa := CreateServiceAccount(test, namespace.Name) + role := CreateRole(test, namespace.Name, policyRules) + CreateRoleBinding(test, namespace.Name, sa, role) + clusterRole := CreateClusterRole(test, clusterPolicyRules) + CreateClusterRoleBinding(test, sa, clusterRole) + token := CreateToken(test, namespace.Name, sa) + + // Create Notebook CR + createNotebook(test, namespace, token, config.Name, jupyterNotebookConfigMapFileName) + + // Make sure the AppWrapper is created and running + test.Eventually(AppWrappers(test, namespace), TestTimeoutLong). + Should( + And( + HaveLen(1), + ContainElement(WithTransform(AppWrapperName, HavePrefix("mnisttest"))), + ContainElement(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))), + ), + ) + + // Make sure the AppWrapper finishes and is deleted + test.Eventually(AppWrappers(test, namespace), TestTimeoutLong). + Should(HaveLen(0)) +} diff --git a/test/odh/notebook.go b/test/odh/notebook.go new file mode 100644 index 000000000..8c7b28275 --- /dev/null +++ b/test/odh/notebook.go @@ -0,0 +1,99 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package odh + +import ( + "bytes" + "html/template" + + gomega "github.com/onsi/gomega" + . "github.com/project-codeflare/codeflare-common/support" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/yaml" + + imagev1 "github.com/openshift/api/image/v1" +) + +const recommendedTagAnnotation = "opendatahub.io/workbench-image-recommended" + +var notebookResource = schema.GroupVersionResource{Group: "kubeflow.org", Version: "v1", Resource: "notebooks"} + +type NotebookProps struct { + IngressDomain string + OpenShiftApiUrl string + KubernetesBearerToken string + Namespace string + OpenDataHubNamespace string + ImageStreamName string + ImageStreamTag string + NotebookConfigMapName string + NotebookConfigMapFileName string + NotebookPVC string +} + +func createNotebook(test Test, namespace *corev1.Namespace, notebookToken, jupyterNotebookConfigMapName, jupyterNotebookConfigMapFileName string) { + // Create PVC for Notebook + notebookPVC := CreatePersistentVolumeClaim(test, namespace.Name, "10Gi", corev1.ReadWriteOnce) + + // Retrieve ImageStream tag for + is := GetImageStream(test, GetOpenDataHubNamespace(), GetNotebookImageStreamName(test)) + recommendedTagName := getRecommendedImageStreamTag(test, is) + + // Read the Notebook CR from resources and perform replacements for custom values using go template + notebookProps := NotebookProps{ + IngressDomain: GetOpenShiftIngressDomain(test), + OpenShiftApiUrl: GetOpenShiftApiUrl(test), + KubernetesBearerToken: notebookToken, + Namespace: namespace.Name, + OpenDataHubNamespace: GetOpenDataHubNamespace(), + ImageStreamName: GetNotebookImageStreamName(test), + ImageStreamTag: recommendedTagName, + NotebookConfigMapName: jupyterNotebookConfigMapName, + NotebookConfigMapFileName: jupyterNotebookConfigMapFileName, + NotebookPVC: notebookPVC.Name, + } + notebookTemplate, err := files.ReadFile("resources/custom-nb-small.yaml") + test.Expect(err).NotTo(gomega.HaveOccurred()) + parsedNotebookTemplate, err := template.New("notebook").Parse(string(notebookTemplate)) + test.Expect(err).NotTo(gomega.HaveOccurred()) + + // Filter template and store results to the buffer + notebookBuffer := new(bytes.Buffer) + err = parsedNotebookTemplate.Execute(notebookBuffer, notebookProps) + test.Expect(err).NotTo(gomega.HaveOccurred()) + + // Create Notebook CR + notebookCR := &unstructured.Unstructured{} + err = yaml.NewYAMLOrJSONDecoder(notebookBuffer, 8192).Decode(notebookCR) + test.Expect(err).NotTo(gomega.HaveOccurred()) + _, err = test.Client().Dynamic().Resource(notebookResource).Namespace(namespace.Name).Create(test.Ctx(), notebookCR, metav1.CreateOptions{}) + test.Expect(err).NotTo(gomega.HaveOccurred()) +} + +func getRecommendedImageStreamTag(test Test, is *imagev1.ImageStream) (tagName string) { + for _, tag := range is.Spec.Tags { + if tag.Annotations[recommendedTagAnnotation] == "true" { + return tag.Name + } + } + test.T().Fatalf("tag with annotation '%s' not found in ImageStream %s", recommendedTagAnnotation, is.Name) + return +} diff --git a/test/odh/pytorch_mcad_test.go b/test/odh/pytorch_mcad_test.go new file mode 100644 index 000000000..0dd33a363 --- /dev/null +++ b/test/odh/pytorch_mcad_test.go @@ -0,0 +1,77 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package odh + +import ( + "testing" + + . "github.com/onsi/gomega" + . "github.com/project-codeflare/codeflare-common/support" + mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" + + rbacv1 "k8s.io/api/rbac/v1" +) + +func TestMnistPyTorchMCAD(t *testing.T) { + test := With(t) + + // Create a namespace + namespace := test.NewTestNamespace() + + // Test configuration + jupyterNotebookConfigMapFileName := "mnist_mcad_mini.ipynb" + config := CreateConfigMap(test, namespace.Name, map[string][]byte{ + // MNIST MCAD Notebook + jupyterNotebookConfigMapFileName: ReadFile(test, "resources/mnist_mcad_mini.ipynb"), + }) + + // Create RBAC, retrieve token for user with limited rights + policyRules := []rbacv1.PolicyRule{ + { + Verbs: []string{"get", "create", "delete", "list", "patch", "update"}, + APIGroups: []string{mcadv1beta1.GroupName}, + Resources: []string{"appwrappers"}, + }, + // Needed for job.logs() + { + Verbs: []string{"get"}, + APIGroups: []string{""}, + Resources: []string{"pods/log"}, + }, + } + sa := CreateServiceAccount(test, namespace.Name) + role := CreateRole(test, namespace.Name, policyRules) + CreateRoleBinding(test, namespace.Name, sa, role) + token := CreateToken(test, namespace.Name, sa) + + // Create Notebook CR + createNotebook(test, namespace, token, config.Name, jupyterNotebookConfigMapFileName) + + // Make sure the AppWrapper is created and running + test.Eventually(AppWrappers(test, namespace), TestTimeoutLong). + Should( + And( + HaveLen(1), + ContainElement(WithTransform(AppWrapperName, HavePrefix("mnistjob"))), + ContainElement(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))), + ), + ) + + // Make sure the AppWrapper finishes and is deleted + test.Eventually(AppWrappers(test, namespace), TestTimeoutLong). + Should(HaveLen(0)) +} diff --git a/test/odh/resources/custom-nb-small.yaml b/test/odh/resources/custom-nb-small.yaml new file mode 100644 index 000000000..95aaaf106 --- /dev/null +++ b/test/odh/resources/custom-nb-small.yaml @@ -0,0 +1,165 @@ +# This template maybe used to spin up a custom notebook image +# i.e.: sed s/{{.IngressDomain}}/$(oc get ingresses.config/cluster -o jsonpath={.spec.domain})/g tests/resources/custom-nb.template | oc apply -f - +# resources generated: +# pod/jupyter-nb-kube-3aadmin-0 +# service/jupyter-nb-kube-3aadmin +# route.route.openshift.io/jupyter-nb-kube-3aadmin (jupyter-nb-kube-3aadmin-opendatahub.apps.tedbig412.cp.fyre.ibm.com) +# service/jupyter-nb-kube-3aadmin-tls +apiVersion: kubeflow.org/v1 +kind: Notebook +metadata: + annotations: + notebooks.opendatahub.io/inject-oauth: "true" + notebooks.opendatahub.io/last-image-selection: codeflare-notebook:latest + notebooks.opendatahub.io/last-size-selection: Small + notebooks.opendatahub.io/oauth-logout-url: https://odh-dashboard-{{.OpenDataHubNamespace}}.{{.IngressDomain}}/notebookController/kube-3aadmin/home + opendatahub.io/link: https://jupyter-nb-kube-3aadmin-{{.Namespace}}.{{.IngressDomain}}/notebook/{{.Namespace}}/jupyter-nb-kube-3aadmin + opendatahub.io/username: kube:admin + generation: 1 + labels: + app: jupyter-nb-kube-3aadmin + opendatahub.io/dashboard: "true" + opendatahub.io/odh-managed: "true" + opendatahub.io/user: kube-3aadmin + name: jupyter-nb-kube-3aadmin + namespace: {{.Namespace}} +spec: + template: + spec: + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - preference: + matchExpressions: + - key: nvidia.com/gpu.present + operator: NotIn + values: + - "true" + weight: 1 + containers: + - env: + - name: NOTEBOOK_ARGS + value: |- + --ServerApp.port=8888 + --ServerApp.token='' + --ServerApp.password='' + --ServerApp.base_url=/notebook/{{.Namespace}}/jupyter-nb-kube-3aadmin + --ServerApp.quit_button=False + --ServerApp.tornado_settings={"user":"kube-3aadmin","hub_host":"https://odh-dashboard-{{.OpenDataHubNamespace}}.{{.IngressDomain}}","hub_prefix":"/notebookController/kube-3aadmin"} + - name: JUPYTER_IMAGE + value: image-registry.openshift-image-registry.svc:5000/{{.OpenDataHubNamespace}}/{{.ImageStreamName}}:{{.ImageStreamTag}} + - name: JUPYTER_NOTEBOOK_PORT + value: "8888" + - name: OCP_SERVER + value: {{.OpenShiftApiUrl}} + - name: OCP_TOKEN + value: {{.KubernetesBearerToken}} + image: image-registry.openshift-image-registry.svc:5000/{{.OpenDataHubNamespace}}/{{.ImageStreamName}}:{{.ImageStreamTag}} + command: ["/bin/sh", "-c", "pip install papermill && oc login --token=${OCP_TOKEN} --server=${OCP_SERVER} --insecure-skip-tls-verify=true && papermill /opt/app-root/notebooks/{{.NotebookConfigMapFileName}} /opt/app-root/src/mcad-out.ipynb -p namespace {{.Namespace}} && sleep infinity"] + # args: ["pip install papermill && oc login --token=${OCP_TOKEN} --server=${OCP_SERVER} --insecure-skip-tls-verify=true && papermill /opt/app-root/notebooks/mcad.ipynb /opt/app-root/src/mcad-out.ipynb" ] + imagePullPolicy: Always + # livenessProbe: + # failureThreshold: 3 + # httpGet: + # path: /notebook/{{.Namespace}}/jupyter-nb-kube-3aadmin/api + # port: notebook-port + # scheme: HTTP + # initialDelaySeconds: 10 + # periodSeconds: 5 + # successThreshold: 1 + # timeoutSeconds: 1 + name: jupyter-nb-kube-3aadmin + ports: + - containerPort: 8888 + name: notebook-port + protocol: TCP + resources: + limits: + cpu: "2" + memory: 3Gi + requests: + cpu: "1" + memory: 3Gi + volumeMounts: + - mountPath: /opt/app-root/src + name: jupyterhub-nb-kube-3aadmin-pvc + - mountPath: /opt/app-root/notebooks + name: {{.NotebookConfigMapName}} + workingDir: /opt/app-root/src + - args: + - --provider=openshift + - --https-address=:8443 + - --http-address= + - --openshift-service-account=jupyter-nb-kube-3aadmin + - --cookie-secret-file=/etc/oauth/config/cookie_secret + - --cookie-expire=24h0m0s + - --tls-cert=/etc/tls/private/tls.crt + - --tls-key=/etc/tls/private/tls.key + - --upstream=http://localhost:8888 + - --upstream-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + - --skip-auth-regex=^(?:/notebook/$(NAMESPACE)/jupyter-nb-kube-3aadmin)?/api$ + - --email-domain=* + - --skip-provider-button + - --openshift-sar={"verb":"get","resource":"notebooks","resourceAPIGroup":"kubeflow.org","resourceName":"jupyter-nb-kube-3aadmin","namespace":"$(NAMESPACE)"} + - --logout-url=https://odh-dashboard-{{.OpenDataHubNamespace}}.{{.IngressDomain}}/notebookController/kube-3aadmin/home + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + image: registry.redhat.io/openshift4/ose-oauth-proxy:v4.10 + imagePullPolicy: Always + livenessProbe: + failureThreshold: 3 + httpGet: + path: /oauth/healthz + port: oauth-proxy + scheme: HTTPS + initialDelaySeconds: 30 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + name: oauth-proxy + ports: + - containerPort: 8443 + name: oauth-proxy + protocol: TCP + readinessProbe: + failureThreshold: 3 + httpGet: + path: /oauth/healthz + port: oauth-proxy + scheme: HTTPS + initialDelaySeconds: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + resources: + limits: + cpu: 100m + memory: 64Mi + requests: + cpu: 100m + memory: 64Mi + volumeMounts: + - mountPath: /etc/oauth/config + name: oauth-config + - mountPath: /etc/tls/private + name: tls-certificates + enableServiceLinks: false + serviceAccountName: jupyter-nb-kube-3aadmin + volumes: + - name: jupyterhub-nb-kube-3aadmin-pvc + persistentVolumeClaim: + claimName: {{.NotebookPVC}} + - name: oauth-config + secret: + defaultMode: 420 + secretName: jupyter-nb-kube-3aadmin-oauth-config + - name: tls-certificates + secret: + defaultMode: 420 + secretName: jupyter-nb-kube-3aadmin-tls + - name: {{.NotebookConfigMapName}} + configMap: + name: {{.NotebookConfigMapName}} diff --git a/test/odh/resources/mnist.py b/test/odh/resources/mnist.py new file mode 100644 index 000000000..d6a211944 --- /dev/null +++ b/test/odh/resources/mnist.py @@ -0,0 +1,160 @@ +# Copyright 2022 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch +from pytorch_lightning import LightningModule, Trainer +from pytorch_lightning.callbacks.progress import TQDMProgressBar +from torch import nn +from torch.nn import functional as F +from torch.utils.data import DataLoader, random_split +from torchmetrics import Accuracy +from torchvision import transforms +from torchvision.datasets import MNIST + +PATH_DATASETS = os.environ.get("PATH_DATASETS", ".") +BATCH_SIZE = 256 if torch.cuda.is_available() else 64 +# %% + +print("prior to running the trainer") +print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR")) +print("MASTER_PORT: is ", os.getenv("MASTER_PORT")) + + +class LitMNIST(LightningModule): + def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4): + + super().__init__() + + # Set our init args as class attributes + self.data_dir = data_dir + self.hidden_size = hidden_size + self.learning_rate = learning_rate + + # Hardcode some dataset specific attributes + self.num_classes = 10 + self.dims = (1, 28, 28) + channels, width, height = self.dims + self.transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)), + ] + ) + + # Define PyTorch model + self.model = nn.Sequential( + nn.Flatten(), + nn.Linear(channels * width * height, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, self.num_classes), + ) + + self.val_accuracy = Accuracy() + self.test_accuracy = Accuracy() + + def forward(self, x): + x = self.model(x) + return F.log_softmax(x, dim=1) + + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + preds = torch.argmax(logits, dim=1) + self.val_accuracy.update(preds, y) + + # Calling self.log will surface up scalars for you in TensorBoard + self.log("val_loss", loss, prog_bar=True) + self.log("val_acc", self.val_accuracy, prog_bar=True) + + def test_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + preds = torch.argmax(logits, dim=1) + self.test_accuracy.update(preds, y) + + # Calling self.log will surface up scalars for you in TensorBoard + self.log("test_loss", loss, prog_bar=True) + self.log("test_acc", self.test_accuracy, prog_bar=True) + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) + return optimizer + + #################### + # DATA RELATED HOOKS + #################### + + def prepare_data(self): + # download + print("Downloading MNIST dataset...") + MNIST(self.data_dir, train=True, download=True) + MNIST(self.data_dir, train=False, download=True) + + def setup(self, stage=None): + + # Assign train/val datasets for use in dataloaders + if stage == "fit" or stage is None: + mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) + self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) + + # Assign test dataset for use in dataloader(s) + if stage == "test" or stage is None: + self.mnist_test = MNIST( + self.data_dir, train=False, transform=self.transform + ) + + def train_dataloader(self): + return DataLoader(self.mnist_train, batch_size=BATCH_SIZE) + + def val_dataloader(self): + return DataLoader(self.mnist_val, batch_size=BATCH_SIZE) + + def test_dataloader(self): + return DataLoader(self.mnist_test, batch_size=BATCH_SIZE) + + +# Init DataLoader from MNIST Dataset + +model = LitMNIST() + +print("GROUP: ", int(os.environ.get("GROUP_WORLD_SIZE", 1))) +print("LOCAL: ", int(os.environ.get("LOCAL_WORLD_SIZE", 1))) + +# Initialize a trainer +trainer = Trainer( + accelerator="auto", + # devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs + max_epochs=2, + callbacks=[TQDMProgressBar(refresh_rate=20)], + num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)), + devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)), + strategy="ddp", +) + +# Train the model ⚡ +trainer.fit(model) diff --git a/test/odh/resources/mnist_mcad_mini.ipynb b/test/odh/resources/mnist_mcad_mini.ipynb new file mode 100644 index 000000000..0b53324ab --- /dev/null +++ b/test/odh/resources/mnist_mcad_mini.ipynb @@ -0,0 +1,93 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Import pieces from codeflare-sdk\n", + "from codeflare_sdk.job.jobs import DDPJobDefinition\n", + "from time import sleep" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47ca5c15", + "metadata": { + "tags": ["parameters"] + }, + "outputs": [], + "source": [ + "#parameters\n", + "namespace = \"default\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26b21373", + "metadata": {}, + "outputs": [], + "source": [ + "job = DDPJobDefinition(name=\"mnistjob\", script=\"mnist.py\", scheduler_args={\"namespace\": namespace}, j=\"1x1\", gpu=0, cpu=1, memMB=2000, image=\"quay.io/project-codeflare/mnist-job-test:v0.0.1\").submit()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d24e9f95", + "metadata": {}, + "outputs": [], + "source": [ + "finished = False\n", + "while not finished:\n", + " sleep(1)\n", + " try:\n", + " finished = (\"Epoch 4: 100%\" in job.logs())\n", + " except:\n", + " finished = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f078b7cd", + "metadata": {}, + "outputs": [], + "source": [ + "job.cancel()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "vscode": { + "interpreter": { + "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/test/odh/resources/mnist_ray_mini.ipynb b/test/odh/resources/mnist_ray_mini.ipynb new file mode 100644 index 000000000..38992cc7d --- /dev/null +++ b/test/odh/resources/mnist_ray_mini.ipynb @@ -0,0 +1,145 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Import pieces from codeflare-sdk\n", + "from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration\n", + "from codeflare_sdk.job.jobs import DDPJobDefinition\n", + "from time import sleep" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30888aed", + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "#parameters\n", + "namespace = \"default\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f4bc870-091f-4e11-9642-cba145710159", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Create our cluster and submit appwrapper\n", + "cluster = Cluster(ClusterConfiguration(namespace=namespace, name='mnisttest', head_cpus=1, head_memory=2, num_workers=1, min_cpus=1, max_cpus=1, min_memory=1, max_memory=2, num_gpus=0, instascale=False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Bring up the cluster\n", + "cluster.up()\n", + "cluster.wait_ready()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df71c1ed", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cluster.status()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fd45bc5-03c0-4ae5-9ec5-dd1c30f1a084", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cluster.details()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47ca5c15", + "metadata": {}, + "outputs": [], + "source": [ + "job = DDPJobDefinition(name=\"mnisttest\", script=\"mnist.py\", workspace=\"file:///opt/app-root/notebooks/..data\", scheduler_args={\"requirements\": \"/opt/app-root/notebooks/requirements.txt\"}).submit(cluster)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f63a178a", + "metadata": {}, + "outputs": [], + "source": [ + "finished = False\n", + "while not finished:\n", + " sleep(1)\n", + " status = job.status()\n", + " finished = (str(status.state) == \"SUCCEEDED\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b099777", + "metadata": {}, + "outputs": [], + "source": [ + "cluster.down()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "vscode": { + "interpreter": { + "hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/test/odh/resources/requirements.txt b/test/odh/resources/requirements.txt new file mode 100644 index 000000000..7266b064a --- /dev/null +++ b/test/odh/resources/requirements.txt @@ -0,0 +1,4 @@ +pytorch_lightning==1.5.10 +ray_lightning +torchmetrics==0.9.1 +torchvision==0.12.0 diff --git a/test/odh/support.go b/test/odh/support.go new file mode 100644 index 000000000..d828ed950 --- /dev/null +++ b/test/odh/support.go @@ -0,0 +1,34 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package odh + +import ( + "embed" + + "github.com/onsi/gomega" + "github.com/project-codeflare/codeflare-common/support" +) + +//go:embed resources/* +var files embed.FS + +func ReadFile(t support.Test, fileName string) []byte { + t.T().Helper() + file, err := files.ReadFile(fileName) + t.Expect(err).NotTo(gomega.HaveOccurred()) + return file +}