diff --git a/README.md b/README.md index 9488b3b46..de350d042 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,21 @@ The e2e tests can be executed locally by running the following commands: Alternatively, You can run the e2e test(s) from your IDE / debugger. +#### Testing on a disconnected cluster + +To run the e2e tests on a disconnected cluster, the user has to provide additional environment variables to properly configure the testing environment: + +- `CODEFLARE_TEST_PYTORCH_IMAGE` - image tag of the image used to run the training job using MCAD +- `CODEFLARE_TEST_RAY_IMAGE` - image tag of the Ray cluster image +- `MNIST_DATASET_URL` - URL where the MNIST dataset is available +- `PIP_INDEX_URL` - URL of the PyPI server providing the needed dependencies +- `PIP_TRUSTED_HOST` - hostname of the PyPI server + +For ODH tests, additional environment variables are needed: + +- `NOTEBOOK_IMAGE_STREAM_NAME` - name of the ODH Notebook ImageStream to be used +- `ODH_NAMESPACE` - namespace where ODH is installed + ## Release 1. 
Invoke [project-codeflare-release.yaml](https://github.com/project-codeflare/codeflare-operator/actions/workflows/project-codeflare-release.yml) diff --git a/go.mod b/go.mod index c8cca86f1..252653db6 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.20 require ( github.com/onsi/gomega v1.27.10 github.com/openshift/api v0.0.0-20230213134911-7ba313770556 - github.com/project-codeflare/codeflare-common v0.0.0-20240111082724-8f0684651717 + github.com/project-codeflare/codeflare-common v0.0.0-20240201153809-2e7292120303 github.com/project-codeflare/instascale v0.4.0 github.com/project-codeflare/multi-cluster-app-dispatcher v1.39.0 github.com/ray-project/kuberay/ray-operator v1.0.0 diff --git a/go.sum b/go.sum index a1ad6ba03..b36099d6c 100644 --- a/go.sum +++ b/go.sum @@ -384,8 +384,8 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/project-codeflare/codeflare-common v0.0.0-20240111082724-8f0684651717 h1:knUKEKvfEzVuSwQ4NAe2+I/Oxo4WztU5rYR8d/F66Lw= -github.com/project-codeflare/codeflare-common v0.0.0-20240111082724-8f0684651717/go.mod h1:2Ck9LC+6Xi4jTDSlCJoP00tCzSrxek0roLsjvUgL2gY= +github.com/project-codeflare/codeflare-common v0.0.0-20240201153809-2e7292120303 h1:30LG8751WElZmWA3mVS8l23l2oZnUCqbDkLCyy0U/p0= +github.com/project-codeflare/codeflare-common v0.0.0-20240201153809-2e7292120303/go.mod h1:2Ck9LC+6Xi4jTDSlCJoP00tCzSrxek0roLsjvUgL2gY= github.com/project-codeflare/instascale v0.4.0 h1:l/cb+x4FrJ2bN9wXjv1mCngy77tVw0CLMiqJovTAflo= github.com/project-codeflare/instascale v0.4.0/go.mod h1:CpduFXKeuqYW4Ph1CPOJV6dpAdpebOxhbU4CmccZWSo= github.com/project-codeflare/multi-cluster-app-dispatcher v1.39.0 h1:zoS7pEAWK6eGELPCIIHB3W8Zb/a27Rf55ChYso7EV3o= diff --git 
a/test/e2e/mnist_pytorch_mcad_job_test.go b/test/e2e/mnist_pytorch_mcad_job_test.go index 883457e95..a642d31ca 100644 --- a/test/e2e/mnist_pytorch_mcad_job_test.go +++ b/test/e2e/mnist_pytorch_mcad_job_test.go @@ -81,6 +81,8 @@ func TestMNISTPyTorchMCAD(t *testing.T) { Env: []corev1.EnvVar{ {Name: "PYTHONUSERBASE", Value: "/workdir"}, {Name: "MNIST_DATASET_URL", Value: GetMnistDatasetURL()}, + {Name: "PIP_INDEX_URL", Value: GetPipIndexURL()}, + {Name: "PIP_TRUSTED_HOST", Value: GetPipTrustedHost()}, }, Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"}, VolumeMounts: []corev1.VolumeMount{ diff --git a/test/e2e/mnist_rayjob_mcad_raycluster_test.go b/test/e2e/mnist_rayjob_mcad_raycluster_test.go index 21bd98ad8..1118079ef 100644 --- a/test/e2e/mnist_rayjob_mcad_raycluster_test.go +++ b/test/e2e/mnist_rayjob_mcad_raycluster_test.go @@ -229,6 +229,8 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { - torchvision==0.12.0 env_vars: MNIST_DATASET_URL: "` + GetMnistDatasetURL() + `" + PIP_INDEX_URL: "` + GetPipIndexURL() + `" + PIP_TRUSTED_HOST: "` + GetPipTrustedHost() + `" `, ClusterSelector: map[string]string{ RayJobDefaultClusterSelectorKey: rayCluster.Name, diff --git a/test/odh/mcad_ray_test.go b/test/odh/mcad_ray_test.go index 770b64d9d..b72456c59 100644 --- a/test/odh/mcad_ray_test.go +++ b/test/odh/mcad_ray_test.go @@ -38,8 +38,8 @@ func TestMCADRay(t *testing.T) { config := CreateConfigMap(test, namespace.Name, map[string][]byte{ // MNIST Ray Notebook jupyterNotebookConfigMapFileName: ReadFile(test, "resources/mnist_ray_mini.ipynb"), - "mnist.py": ReadFile(test, "resources/mnist.py"), - "requirements.txt": ReadFile(test, "resources/requirements.txt"), + "mnist.py": readMnistPy(test), + "requirements.txt": readRequirementsTxt(test), }) // Create RBAC, retrieve token for user with limited rights @@ -59,6 +59,11 @@ func TestMCADRay(t *testing.T) { APIGroups: []string{"route.openshift.io"}, Resources: 
[]string{"routes"}, }, + { + Verbs: []string{"get", "list"}, + APIGroups: []string{"networking.k8s.io"}, + Resources: []string{"ingresses"}, + }, } // Create cluster wide RBAC, required for SDK OpenShift check @@ -96,3 +101,40 @@ func TestMCADRay(t *testing.T) { test.Eventually(AppWrappers(test, namespace), TestTimeoutLong). Should(HaveLen(0)) } + +// readRequirementsTxt reads resources/requirements.txt and renders it as a Go +// template, filling in the pip index URL and trusted host options. +func readRequirementsTxt(test Test) []byte { + props := struct { + PipIndexUrl string + PipTrustedHost string + }{} + + // Emit each pip option only when a value is configured, so that no option + // name is left without its argument in the rendered requirements file. + if indexURL := GetPipIndexURL(); len(indexURL) > 0 { + props.PipIndexUrl = "--index " + indexURL + } + if trustedHost := GetPipTrustedHost(); len(trustedHost) > 0 { + props.PipTrustedHost = "--trusted-host " + trustedHost + } + + template, err := files.ReadFile("resources/requirements.txt") + test.Expect(err).NotTo(HaveOccurred()) + + return ParseTemplate(test, template, props) +} + +// readMnistPy reads resources/mnist.py and renders it as a Go template, filling +// in the MNIST dataset URL. +func readMnistPy(test Test) []byte { + props := struct { + MnistDatasetURL string + }{ + MnistDatasetURL: GetMnistDatasetURL(), + } + template, err := files.ReadFile("resources/mnist.py") + test.Expect(err).NotTo(HaveOccurred()) + + return ParseTemplate(test, template, props) +} diff --git a/test/odh/notebook.go b/test/odh/notebook.go index 8c7b28275..70bdda82c 100644 --- a/test/odh/notebook.go +++ b/test/odh/notebook.go @@ -18,7 +18,6 @@ package odh import ( "bytes" - "html/template" gomega "github.com/onsi/gomega" . 
"github.com/project-codeflare/codeflare-common/support" @@ -44,6 +43,7 @@ type NotebookProps struct { OpenDataHubNamespace string ImageStreamName string ImageStreamTag string + RayImage string NotebookConfigMapName string NotebookConfigMapFileName string NotebookPVC string @@ -66,23 +66,19 @@ func createNotebook(test Test, namespace *corev1.Namespace, notebookToken, jupyt OpenDataHubNamespace: GetOpenDataHubNamespace(), ImageStreamName: GetNotebookImageStreamName(test), ImageStreamTag: recommendedTagName, + RayImage: GetRayImage(), NotebookConfigMapName: jupyterNotebookConfigMapName, NotebookConfigMapFileName: jupyterNotebookConfigMapFileName, NotebookPVC: notebookPVC.Name, } notebookTemplate, err := files.ReadFile("resources/custom-nb-small.yaml") test.Expect(err).NotTo(gomega.HaveOccurred()) - parsedNotebookTemplate, err := template.New("notebook").Parse(string(notebookTemplate)) - test.Expect(err).NotTo(gomega.HaveOccurred()) - // Filter template and store results to the buffer - notebookBuffer := new(bytes.Buffer) - err = parsedNotebookTemplate.Execute(notebookBuffer, notebookProps) - test.Expect(err).NotTo(gomega.HaveOccurred()) + parsedNotebookTemplate := ParseTemplate(test, notebookTemplate, notebookProps) // Create Notebook CR notebookCR := &unstructured.Unstructured{} - err = yaml.NewYAMLOrJSONDecoder(notebookBuffer, 8192).Decode(notebookCR) + err = yaml.NewYAMLOrJSONDecoder(bytes.NewBuffer(parsedNotebookTemplate), 8192).Decode(notebookCR) test.Expect(err).NotTo(gomega.HaveOccurred()) _, err = test.Client().Dynamic().Resource(notebookResource).Namespace(namespace.Name).Create(test.Ctx(), notebookCR, metav1.CreateOptions{}) test.Expect(err).NotTo(gomega.HaveOccurred()) diff --git a/test/odh/resources/custom-nb-small.yaml b/test/odh/resources/custom-nb-small.yaml index 95aaaf106..791a2d98a 100644 --- a/test/odh/resources/custom-nb-small.yaml +++ b/test/odh/resources/custom-nb-small.yaml @@ -55,7 +55,7 @@ spec: - name: OCP_TOKEN value: 
{{.KubernetesBearerToken}} image: image-registry.openshift-image-registry.svc:5000/{{.OpenDataHubNamespace}}/{{.ImageStreamName}}:{{.ImageStreamTag}} - command: ["/bin/sh", "-c", "pip install papermill && oc login --token=${OCP_TOKEN} --server=${OCP_SERVER} --insecure-skip-tls-verify=true && papermill /opt/app-root/notebooks/{{.NotebookConfigMapFileName}} /opt/app-root/src/mcad-out.ipynb -p namespace {{.Namespace}} && sleep infinity"] + command: ["/bin/sh", "-c", "pip install papermill && oc login --token=${OCP_TOKEN} --server=${OCP_SERVER} --insecure-skip-tls-verify=true && papermill /opt/app-root/notebooks/{{.NotebookConfigMapFileName}} /opt/app-root/src/mcad-out.ipynb -p namespace {{.Namespace}} -p ray_image {{.RayImage}} && sleep infinity"] # args: ["pip install papermill && oc login --token=${OCP_TOKEN} --server=${OCP_SERVER} --insecure-skip-tls-verify=true && papermill /opt/app-root/notebooks/mcad.ipynb /opt/app-root/src/mcad-out.ipynb" ] imagePullPolicy: Always # livenessProbe: diff --git a/test/odh/resources/mnist.py b/test/odh/resources/mnist.py index d6a211944..e88e8fc9a 100644 --- a/test/odh/resources/mnist.py +++ b/test/odh/resources/mnist.py @@ -15,6 +15,7 @@ import os import torch +import requests from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.callbacks.progress import TQDMProgressBar from torch import nn @@ -32,6 +33,8 @@ print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR")) print("MASTER_PORT: is ", os.getenv("MASTER_PORT")) +MNIST_DATASET_URL = "{{.MnistDatasetURL}}" +print("MNIST_DATASET_URL: is ", MNIST_DATASET_URL) class LitMNIST(LightningModule): def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4): @@ -110,8 +113,34 @@ def configure_optimizers(self): #################### def prepare_data(self): - # download - print("Downloading MNIST dataset...") + datasetFiles = [ + "t10k-images-idx3-ubyte", + "t10k-labels-idx1-ubyte", + "train-images-idx3-ubyte", + "train-labels-idx1-ubyte" + ] + + # 
Create required folder structure + downloadLocation = os.path.join(PATH_DATASETS, "MNIST", "raw") + os.makedirs(downloadLocation, exist_ok=True) + print(f"{downloadLocation} folder_path created!") + + for file in datasetFiles: + print(f"Downloading MNIST dataset {file}... to path : {downloadLocation}") + response = requests.get(f"{MNIST_DATASET_URL}{file}", stream=True) + filePath = os.path.join(downloadLocation, file) + + #to download dataset file + try: + if response.status_code == 200: + open(filePath, 'wb').write(response.content) + print(f"{file}: Downloaded and saved zipped file to path - {filePath}") + else: + print(f"Failed to download file {file}") + except Exception as e: + print(e) + print(f"Downloaded MNIST dataset to... {downloadLocation}") + MNIST(self.data_dir, train=True, download=True) MNIST(self.data_dir, train=False, download=True) diff --git a/test/odh/resources/mnist_ray_mini.ipynb b/test/odh/resources/mnist_ray_mini.ipynb index 38992cc7d..0d8fcc53a 100644 --- a/test/odh/resources/mnist_ray_mini.ipynb +++ b/test/odh/resources/mnist_ray_mini.ipynb @@ -27,7 +27,8 @@ "outputs": [], "source": [ "#parameters\n", - "namespace = \"default\"" + "namespace = \"default\"\n", + "ray_image = \"has to be specified\"" ] }, { @@ -40,7 +41,7 @@ "outputs": [], "source": [ "# Create our cluster and submit appwrapper\n", - "cluster = Cluster(ClusterConfiguration(namespace=namespace, name='mnisttest', head_cpus=1, head_memory=2, num_workers=1, min_cpus=1, max_cpus=1, min_memory=1, max_memory=2, num_gpus=0, instascale=False))" + "cluster = Cluster(ClusterConfiguration(namespace=namespace, name='mnisttest', head_cpus=1, head_memory=2, num_workers=1, min_cpus=1, max_cpus=1, min_memory=1, max_memory=2, num_gpus=0, instascale=False, image=ray_image))" ] }, { diff --git a/test/odh/resources/requirements.txt b/test/odh/resources/requirements.txt index 7266b064a..cf9fb5889 100644 --- a/test/odh/resources/requirements.txt +++ b/test/odh/resources/requirements.txt @@ -1,3 
+1,5 @@ +{{.PipIndexUrl}} +{{.PipTrustedHost}} pytorch_lightning==1.5.10 ray_lightning torchmetrics==0.9.1 diff --git a/test/odh/template.go b/test/odh/template.go new file mode 100644 index 000000000..3ff4da17f --- /dev/null +++ b/test/odh/template.go @@ -0,0 +1,45 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package odh + +import ( + "bytes" + "text/template" + + "github.com/onsi/gomega" + "github.com/project-codeflare/codeflare-common/support" +) + +// ParseTemplate renders inputTemplate as a Go text template, using props as the +// template data, and returns the rendered bytes. Parsing and execution errors +// fail the test through t.Expect. text/template is used rather than +// html/template because the rendered files are not HTML, so substituted values +// (URLs, image references) must be inserted verbatim instead of HTML-escaped. +func ParseTemplate(t support.Test, inputTemplate []byte, props interface{}) []byte { + t.T().Helper() + + // Parse input template + parsedTemplate, err := template.New("template").Parse(string(inputTemplate)) + t.Expect(err).NotTo(gomega.HaveOccurred()) + + // Render the template with props and store the result in the buffer + buffer := new(bytes.Buffer) + err = parsedTemplate.Execute(buffer, props) + t.Expect(err).NotTo(gomega.HaveOccurred()) + + return buffer.Bytes() +}