From 011cc01502c4e1ce765a0770234ec26773b781ff Mon Sep 17 00:00:00 2001 From: Srihari Date: Thu, 14 Sep 2023 13:22:33 +0530 Subject: [PATCH 1/4] Create OLM upgrade e2e scenario using codeflare SDK --- .github/workflows/olm_tests.yaml | 17 ++ test/e2e/mnist_rayjob.py | 44 +++++ test/e2e/olm_upgrade_test.go | 322 +++++++++++++++++++++++++++++++ test/e2e/raycluster_sdk.py | 29 +++ 4 files changed, 412 insertions(+) create mode 100644 test/e2e/mnist_rayjob.py create mode 100644 test/e2e/olm_upgrade_test.go create mode 100644 test/e2e/raycluster_sdk.py diff --git a/.github/workflows/olm_tests.yaml b/.github/workflows/olm_tests.yaml index 83175503a..de1de85bb 100644 --- a/.github/workflows/olm_tests.yaml +++ b/.github/workflows/olm_tests.yaml @@ -121,6 +121,14 @@ jobs: OPM_BUNDLE_OPT: "--use-http" BUNDLE_PUSH_OPT: "--tls-verify=false" CATALOG_PUSH_OPT: "--tls-verify=false" + - name: Run OLM Upgrade e2e Ray cluster Up test + run: | + export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }} + echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV + set -euo pipefail + go test -timeout 30m -v ./test/e2e -run TestMNISTRayClusterUp -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt + env: + RUN_OLM_TESTS: true - name: Update Operator to the built version run: | @@ -151,6 +159,15 @@ jobs: SUBSCRIPTION_NAME: "codeflare-operator" SUBSCRIPTION_NAMESPACE: "openshift-operators" + - name: Run OLM Upgrade e2e Ray cluster Job submit test + run: | + export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }} + echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV + set -euo pipefail + go test -timeout 30m -v ./test/e2e -run TestMnistJobSubmit -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt + env: + RUN_OLM_TESTS: true + - name: Run e2e tests against built operator run: | export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }} diff --git a/test/e2e/mnist_rayjob.py b/test/e2e/mnist_rayjob.py new file mode 100644 index 000000000..99cfcb1a7 --- /dev/null +++ b/test/e2e/mnist_rayjob.py @@ -0,0 +1,44 @@ +import sys + +from time import sleep + +from torchx.specs.api import AppState, is_terminal + +from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration +from codeflare_sdk.job.jobs import DDPJobDefinition + +namespace = sys.argv[1] + +cluster = Cluster(ClusterConfiguration(name='mnist')) + +jobdef = DDPJobDefinition( + name="mnist", + script="mnist.py", + scheduler_args={"requirements": "requirements.txt"}, +) +job = jobdef.submit(cluster) + +done = False +time = 0 +timeout = 300 +while not done: + status = job.status() + if is_terminal(status.state): + break + if not done: + print(status) + if timeout and time >= timeout: + raise TimeoutError(f"job has timed out after waiting {timeout}s") + sleep(5) + time += 5 + +print(f"Job has completed: {status.state}") + +print(job.logs()) + +cluster.down() + +if not status.state == AppState.SUCCEEDED: + exit(1) +else: + exit(0) diff --git a/test/e2e/olm_upgrade_test.go b/test/e2e/olm_upgrade_test.go new file mode 100644 index 000000000..3dde139f9 --- /dev/null +++ b/test/e2e/olm_upgrade_test.go @@ -0,0 +1,322 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "os" + "testing" + + . "github.com/onsi/gomega" + mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" + rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + . "github.com/project-codeflare/codeflare-operator/test/support" +) + +var namespaceName string = "test-ns-olmupgrade" +var namespace *corev1.Namespace + +// Creates a Ray cluster +func TestMNISTRayClusterUp(t *testing.T) { + + test := With(t) + test.T().Parallel() + if os.Getenv("RUN_OLM_TESTS") == "true" { + // Create a namespace + namespace := CreateTestNamespaceWithName(test, namespaceName) + test.T().Logf("Created namespace %s successfully", namespace.Name) + + // Test configuration + config := &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist-raycluster-sdk", + Namespace: namespace.Name, + }, + BinaryData: map[string][]byte{ + // SDK script + "raycluster_sdk.py": ReadFile(test, "raycluster_sdk.py"), + }, + Immutable: Ptr(true), + } + config, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), config, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name) + + // SDK client RBAC + serviceAccount := &corev1.ServiceAccount{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ServiceAccount", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "sdk-user", + Namespace: namespace.Name, + }, + } + serviceAccount, err = test.Client().Core().CoreV1().ServiceAccounts(namespace.Name).Create(test.Ctx(), serviceAccount, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + + role := &rbacv1.Role{ + TypeMeta: metav1.TypeMeta{ + APIVersion: rbacv1.SchemeGroupVersion.String(), + Kind: "Role", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "sdk", + Namespace: namespace.Name, + }, + Rules: []rbacv1.PolicyRule{ + { + Verbs: []string{"get", "create", "delete", "list", "patch", "update"}, + APIGroups: []string{mcadv1beta1.GroupName}, + Resources: []string{"appwrappers"}, + }, + { + Verbs: []string{"get", "list"}, + APIGroups: []string{rayv1alpha1.GroupVersion.Group}, + Resources: []string{"rayclusters", "rayclusters/status"}, + }, + { + Verbs: []string{"get", "list"}, + APIGroups: []string{"route.openshift.io"}, + Resources: []string{"routes"}, + }, + }, + } + role, err = test.Client().Core().RbacV1().Roles(namespace.Name).Create(test.Ctx(), role, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + + roleBinding := &rbacv1.RoleBinding{ + TypeMeta: metav1.TypeMeta{ + APIVersion: rbacv1.SchemeGroupVersion.String(), + Kind: "RoleBinding", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "sdk", + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: rbacv1.SchemeGroupVersion.Group, + Kind: "Role", + Name: role.Name, + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + APIGroup: corev1.SchemeGroupVersion.Group, + Name: serviceAccount.Name, + Namespace: serviceAccount.Namespace, + }, + }, + } + _, err = test.Client().Core().RbacV1().RoleBindings(namespace.Name).Create(test.Ctx(), roleBinding, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + + job := &batchv1.Job{ + TypeMeta: metav1.TypeMeta{ + APIVersion: batchv1.SchemeGroupVersion.String(), + Kind: "Job", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "sdk", + Namespace: namespace.Name, + }, + Spec: batchv1.JobSpec{ + Completions: Ptr(int32(1)), + Parallelism: Ptr(int32(1)), + BackoffLimit: Ptr(int32(0)), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test", + // FIXME: switch to base Python image once the dependency on OpenShift CLI is removed + // See https://github.com/project-codeflare/codeflare-sdk/pull/146 + Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e", + Command: []string{"/bin/sh", "-c", "pip install codeflare-sdk==" + GetCodeFlareSDKVersion() + " && cp /test/* . && python raycluster_sdk.py" + " " + namespace.Name}, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "test", + MountPath: "/test", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "test", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: config.Name, + }, + }, + }, + }, + }, + RestartPolicy: corev1.RestartPolicyNever, + ServiceAccountName: serviceAccount.Name, + }, + }, + }, + } + job, err = test.Client().Core().BatchV1().Jobs(namespace.Name).Create(test.Ctx(), job, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name) + + // Retrieving the job logs once it has completed or timed out + defer WriteJobLogs(test, job.Namespace, job.Name) + + test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) + test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( + Or( + WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)), + WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), + )) + + // Assert the job has completed successfully + test.Expect(GetJob(test, job.Namespace, job.Name)). + To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) + + } else { + test.T().Skip("Skipping OLM upgarde test because RUN_OLM_TESTS is not set") + } +} + +// Submit a Job to the Ray cluster and trains the MNIST dataset using the CodeFlare SDK. +func TestMnistJobSubmit(t *testing.T) { + + test := With(t) + test.T().Parallel() + if os.Getenv("RUN_OLM_TESTS") == "true" { + + // Test configuration + config := &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist-ray-job", + Namespace: namespaceName, + }, + BinaryData: map[string][]byte{ + // SDK script + "mnist_rayjob.py": ReadFile(test, "mnist_rayjob.py"), + // pip requirements + "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), + // MNIST training script + "mnist.py": ReadFile(test, "mnist.py"), + }, + Immutable: Ptr(true), + } + config, err := test.Client().Core().CoreV1().ConfigMaps(namespaceName).Create(test.Ctx(), config, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name) + + // SDK client RBAC + serviceAccount := &corev1.ServiceAccount{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ServiceAccount", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "ray-user", + Namespace: namespaceName, + }, + } + serviceAccount, err = test.Client().Core().CoreV1().ServiceAccounts(namespaceName).Create(test.Ctx(), serviceAccount, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + + job := &batchv1.Job{ + TypeMeta: metav1.TypeMeta{ + APIVersion: batchv1.SchemeGroupVersion.String(), + Kind: "Job", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "rayjob", + Namespace: namespaceName, + }, + Spec: batchv1.JobSpec{ + Completions: Ptr(int32(1)), + Parallelism: Ptr(int32(1)), + BackoffLimit: Ptr(int32(0)), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test", + // FIXME: switch to base Python image once the dependency on OpenShift CLI is removed + // See https://github.com/project-codeflare/codeflare-sdk/pull/146 + Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e", + Command: []string{"/bin/sh", "-c", "pip install codeflare-sdk==" + GetCodeFlareSDKVersion() + " && cp /test/* . && python mnist_rayjob.py" + " " + namespaceName}, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "test", + MountPath: "/test", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "test", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: config.Name, + }, + }, + }, + }, + }, + RestartPolicy: corev1.RestartPolicyNever, + ServiceAccountName: serviceAccount.Name, + }, + }, + }, + } + job, err = test.Client().Core().BatchV1().Jobs(namespaceName).Create(test.Ctx(), job, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name) + + // Retrieving the job logs once it has completed or timed out + defer WriteJobLogs(test, job.Namespace, job.Name) + + test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) + test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( + Or( + WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)), + WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), + )) + + // Assert the job has completed successfully + test.Expect(GetJob(test, job.Namespace, job.Name)). + To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) + } else { + test.T().Skip("Skipping OLM upgarde test because RUN_OLM_TESTS is not set") + } + +} diff --git a/test/e2e/raycluster_sdk.py b/test/e2e/raycluster_sdk.py new file mode 100644 index 000000000..5a8abf5cd --- /dev/null +++ b/test/e2e/raycluster_sdk.py @@ -0,0 +1,29 @@ +import sys + +from time import sleep + +from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration + +namespace = sys.argv[1] + +cluster = Cluster(ClusterConfiguration( + name='mnist', + namespace=namespace, + num_workers=1, + min_cpus='500m', + max_cpus=1, + min_memory=0.5, + max_memory=1, + num_gpus=0, + instascale=False, +)) + +cluster.up() + +cluster.status() + +cluster.wait_ready() + +cluster.status() + +cluster.details() From c70f2570cb971c04d41a45a9583454f0c6376c89 Mon Sep 17 00:00:00 2001 From: Srihari Date: Thu, 14 Sep 2023 13:32:41 +0530 Subject: [PATCH 2/4] Create OLM upgrade e2e scenario using codeflare SDK --- test/e2e/olm_upgrade_test.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/test/e2e/olm_upgrade_test.go b/test/e2e/olm_upgrade_test.go index 3dde139f9..acb25a4b0 100644 --- a/test/e2e/olm_upgrade_test.go +++ b/test/e2e/olm_upgrade_test.go @@ -42,9 +42,16 @@ func TestMNISTRayClusterUp(t *testing.T) { test.T().Parallel() if os.Getenv("RUN_OLM_TESTS") == "true" { // Create a namespace - namespace := CreateTestNamespaceWithName(test, namespaceName) + namespace = CreateTestNamespaceWithName(test, namespaceName) test.T().Logf("Created namespace %s successfully", namespace.Name) + // Delete namespace only if test failed + defer func() { + if t.Failed() { + DeleteTestNamespace(test, namespaceName) + } + }() + // Test configuration config := &corev1.ConfigMap{ TypeMeta: metav1.TypeMeta{ @@ -212,6 +219,9 @@ func TestMnistJobSubmit(t *testing.T) { test.T().Parallel() if os.Getenv("RUN_OLM_TESTS") == "true" { + //delete the namespace after test complete + defer DeleteTestNamespace(test, namespaceName) + // Test configuration config := &corev1.ConfigMap{ TypeMeta: metav1.TypeMeta{ @@ -315,6 +325,7 @@ func TestMnistJobSubmit(t *testing.T) { // Assert the job has completed successfully test.Expect(GetJob(test, job.Namespace, job.Name)). To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) + } else { test.T().Skip("Skipping OLM upgarde test because RUN_OLM_TESTS is not set") } From 25d6fa20e3a82a828b0ead0f1760253f891e4905 Mon Sep 17 00:00:00 2001 From: Srihari Date: Thu, 23 Nov 2023 12:15:37 +0530 Subject: [PATCH 3/4] rebase and resolving conflicts --- .github/workflows/olm_tests.yaml | 9 +- go.mod | 1 - test/e2e/mnist_rayjob.py | 2 +- test/e2e/olm_upgrade_test.go | 333 ------------------------- test/e2e/raycluster_sdk.py | 29 --- test/e2e/start_ray_cluster.py | 51 ++++ test/upgrade/olm_upgrade_test.go | 403 +++++++++++++++++++++++++++++++ 7 files changed, 458 insertions(+), 370 deletions(-) delete mode 100644 test/e2e/olm_upgrade_test.go delete mode 100644 test/e2e/raycluster_sdk.py create mode 100644 test/e2e/start_ray_cluster.py create mode 100644 test/upgrade/olm_upgrade_test.go diff --git a/.github/workflows/olm_tests.yaml b/.github/workflows/olm_tests.yaml index de1de85bb..927dced70 100644 --- a/.github/workflows/olm_tests.yaml +++ b/.github/workflows/olm_tests.yaml @@ -121,14 +121,13 @@ jobs: OPM_BUNDLE_OPT: "--use-http" BUNDLE_PUSH_OPT: "--tls-verify=false" CATALOG_PUSH_OPT: "--tls-verify=false" + - name: Run OLM Upgrade e2e Ray cluster Up test run: | export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }} echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV set -euo pipefail - go test -timeout 30m -v ./test/e2e -run TestMNISTRayClusterUp -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt - env: - RUN_OLM_TESTS: true + go test -timeout 30m -v ./test/upgrade -run TestMNISTRayClusterUp -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt - name: Update Operator to the built version run: | @@ -164,9 +163,7 @@ jobs: export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }} echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV set -euo pipefail - go test -timeout 30m -v ./test/e2e -run TestMnistJobSubmit -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt - env: - RUN_OLM_TESTS: true + go test -timeout 30m -v ./test/upgrade -run TestMnistJobSubmit -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt - name: Run e2e tests against built operator run: | diff --git a/go.mod b/go.mod index e9f358d48..c7405e30e 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,6 @@ require ( github.com/onsi/gomega v1.27.10 github.com/openshift/api v0.0.0-20230213134911-7ba313770556 github.com/project-codeflare/codeflare-common v0.0.0-20231129165224-988ba1da9069 - github.com/project-codeflare/instascale v0.3.1 github.com/project-codeflare/multi-cluster-app-dispatcher v1.38.1 github.com/ray-project/kuberay/ray-operator v1.0.0-rc.1 go.uber.org/zap v1.26.0 diff --git a/test/e2e/mnist_rayjob.py b/test/e2e/mnist_rayjob.py index 99cfcb1a7..7ffb39271 100644 --- a/test/e2e/mnist_rayjob.py +++ b/test/e2e/mnist_rayjob.py @@ -9,7 +9,7 @@ namespace = sys.argv[1] -cluster = Cluster(ClusterConfiguration(name='mnist')) +cluster = Cluster(ClusterConfiguration(name='mnist',namespace=namespace)) jobdef = DDPJobDefinition( name="mnist", diff --git a/test/e2e/olm_upgrade_test.go b/test/e2e/olm_upgrade_test.go deleted file mode 100644 index acb25a4b0..000000000 --- a/test/e2e/olm_upgrade_test.go +++ /dev/null @@ -1,333 +0,0 @@ -/* -Copyright 2023. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package e2e - -import ( - "os" - "testing" - - . "github.com/onsi/gomega" - mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" - - batchv1 "k8s.io/api/batch/v1" - corev1 "k8s.io/api/core/v1" - rbacv1 "k8s.io/api/rbac/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - . "github.com/project-codeflare/codeflare-operator/test/support" -) - -var namespaceName string = "test-ns-olmupgrade" -var namespace *corev1.Namespace - -// Creates a Ray cluster -func TestMNISTRayClusterUp(t *testing.T) { - - test := With(t) - test.T().Parallel() - if os.Getenv("RUN_OLM_TESTS") == "true" { - // Create a namespace - namespace = CreateTestNamespaceWithName(test, namespaceName) - test.T().Logf("Created namespace %s successfully", namespace.Name) - - // Delete namespace only if test failed - defer func() { - if t.Failed() { - DeleteTestNamespace(test, namespaceName) - } - }() - - // Test configuration - config := &corev1.ConfigMap{ - TypeMeta: metav1.TypeMeta{ - APIVersion: corev1.SchemeGroupVersion.String(), - Kind: "ConfigMap", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "mnist-raycluster-sdk", - Namespace: namespace.Name, - }, - BinaryData: map[string][]byte{ - // SDK script - "raycluster_sdk.py": ReadFile(test, "raycluster_sdk.py"), - }, - Immutable: Ptr(true), - } - config, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), config, metav1.CreateOptions{}) - test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name) - - // SDK client RBAC - serviceAccount := &corev1.ServiceAccount{ - TypeMeta: metav1.TypeMeta{ - APIVersion: corev1.SchemeGroupVersion.String(), - Kind: "ServiceAccount", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "sdk-user", - Namespace: namespace.Name, - }, - } - serviceAccount, err = test.Client().Core().CoreV1().ServiceAccounts(namespace.Name).Create(test.Ctx(), serviceAccount, metav1.CreateOptions{}) - test.Expect(err).NotTo(HaveOccurred()) - - role := &rbacv1.Role{ - TypeMeta: metav1.TypeMeta{ - APIVersion: rbacv1.SchemeGroupVersion.String(), - Kind: "Role", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "sdk", - Namespace: namespace.Name, - }, - Rules: []rbacv1.PolicyRule{ - { - Verbs: []string{"get", "create", "delete", "list", "patch", "update"}, - APIGroups: []string{mcadv1beta1.GroupName}, - Resources: []string{"appwrappers"}, - }, - { - Verbs: []string{"get", "list"}, - APIGroups: []string{rayv1alpha1.GroupVersion.Group}, - Resources: []string{"rayclusters", "rayclusters/status"}, - }, - { - Verbs: []string{"get", "list"}, - APIGroups: []string{"route.openshift.io"}, - Resources: []string{"routes"}, - }, - }, - } - role, err = test.Client().Core().RbacV1().Roles(namespace.Name).Create(test.Ctx(), role, metav1.CreateOptions{}) - test.Expect(err).NotTo(HaveOccurred()) - - roleBinding := &rbacv1.RoleBinding{ - TypeMeta: metav1.TypeMeta{ - APIVersion: rbacv1.SchemeGroupVersion.String(), - Kind: "RoleBinding", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "sdk", - }, - RoleRef: rbacv1.RoleRef{ - APIGroup: rbacv1.SchemeGroupVersion.Group, - Kind: "Role", - Name: role.Name, - }, - Subjects: []rbacv1.Subject{ - { - Kind: "ServiceAccount", - APIGroup: corev1.SchemeGroupVersion.Group, - Name: serviceAccount.Name, - Namespace: serviceAccount.Namespace, - }, - }, - } - _, err = test.Client().Core().RbacV1().RoleBindings(namespace.Name).Create(test.Ctx(), roleBinding, metav1.CreateOptions{}) - test.Expect(err).NotTo(HaveOccurred()) - - job := &batchv1.Job{ - TypeMeta: metav1.TypeMeta{ - APIVersion: batchv1.SchemeGroupVersion.String(), - Kind: "Job", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "sdk", - Namespace: namespace.Name, - }, - Spec: batchv1.JobSpec{ - Completions: Ptr(int32(1)), - Parallelism: Ptr(int32(1)), - BackoffLimit: Ptr(int32(0)), - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{ - { - Name: "test", - // FIXME: switch to base Python image once the dependency on OpenShift CLI is removed - // See https://github.com/project-codeflare/codeflare-sdk/pull/146 - Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e", - Command: []string{"/bin/sh", "-c", "pip install codeflare-sdk==" + GetCodeFlareSDKVersion() + " && cp /test/* . && python raycluster_sdk.py" + " " + namespace.Name}, - VolumeMounts: []corev1.VolumeMount{ - { - Name: "test", - MountPath: "/test", - }, - }, - }, - }, - Volumes: []corev1.Volume{ - { - Name: "test", - VolumeSource: corev1.VolumeSource{ - ConfigMap: &corev1.ConfigMapVolumeSource{ - LocalObjectReference: corev1.LocalObjectReference{ - Name: config.Name, - }, - }, - }, - }, - }, - RestartPolicy: corev1.RestartPolicyNever, - ServiceAccountName: serviceAccount.Name, - }, - }, - }, - } - job, err = test.Client().Core().BatchV1().Jobs(namespace.Name).Create(test.Ctx(), job, metav1.CreateOptions{}) - test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name) - - // Retrieving the job logs once it has completed or timed out - defer WriteJobLogs(test, job.Namespace, job.Name) - - test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) - test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( - Or( - WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)), - WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), - )) - - // Assert the job has completed successfully - test.Expect(GetJob(test, job.Namespace, job.Name)). - To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) - - } else { - test.T().Skip("Skipping OLM upgarde test because RUN_OLM_TESTS is not set") - } -} - -// Submit a Job to the Ray cluster and trains the MNIST dataset using the CodeFlare SDK. -func TestMnistJobSubmit(t *testing.T) { - - test := With(t) - test.T().Parallel() - if os.Getenv("RUN_OLM_TESTS") == "true" { - - //delete the namespace after test complete - defer DeleteTestNamespace(test, namespaceName) - - // Test configuration - config := &corev1.ConfigMap{ - TypeMeta: metav1.TypeMeta{ - APIVersion: corev1.SchemeGroupVersion.String(), - Kind: "ConfigMap", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "mnist-ray-job", - Namespace: namespaceName, - }, - BinaryData: map[string][]byte{ - // SDK script - "mnist_rayjob.py": ReadFile(test, "mnist_rayjob.py"), - // pip requirements - "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), - // MNIST training script - "mnist.py": ReadFile(test, "mnist.py"), - }, - Immutable: Ptr(true), - } - config, err := test.Client().Core().CoreV1().ConfigMaps(namespaceName).Create(test.Ctx(), config, metav1.CreateOptions{}) - test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name) - - // SDK client RBAC - serviceAccount := &corev1.ServiceAccount{ - TypeMeta: metav1.TypeMeta{ - APIVersion: corev1.SchemeGroupVersion.String(), - Kind: "ServiceAccount", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "ray-user", - Namespace: namespaceName, - }, - } - serviceAccount, err = test.Client().Core().CoreV1().ServiceAccounts(namespaceName).Create(test.Ctx(), serviceAccount, metav1.CreateOptions{}) - test.Expect(err).NotTo(HaveOccurred()) - - job := &batchv1.Job{ - TypeMeta: metav1.TypeMeta{ - APIVersion: batchv1.SchemeGroupVersion.String(), - Kind: "Job", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "rayjob", - Namespace: namespaceName, - }, - Spec: batchv1.JobSpec{ - Completions: Ptr(int32(1)), - Parallelism: Ptr(int32(1)), - BackoffLimit: Ptr(int32(0)), - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{ - { - Name: "test", - // FIXME: switch to base Python image once the dependency on OpenShift CLI is removed - // See https://github.com/project-codeflare/codeflare-sdk/pull/146 - Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e", - Command: []string{"/bin/sh", "-c", "pip install codeflare-sdk==" + GetCodeFlareSDKVersion() + " && cp /test/* . && python mnist_rayjob.py" + " " + namespaceName}, - VolumeMounts: []corev1.VolumeMount{ - { - Name: "test", - MountPath: "/test", - }, - }, - }, - }, - Volumes: []corev1.Volume{ - { - Name: "test", - VolumeSource: corev1.VolumeSource{ - ConfigMap: &corev1.ConfigMapVolumeSource{ - LocalObjectReference: corev1.LocalObjectReference{ - Name: config.Name, - }, - }, - }, - }, - }, - RestartPolicy: corev1.RestartPolicyNever, - ServiceAccountName: serviceAccount.Name, - }, - }, - }, - } - job, err = test.Client().Core().BatchV1().Jobs(namespaceName).Create(test.Ctx(), job, metav1.CreateOptions{}) - test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name) - - // Retrieving the job logs once it has completed or timed out - defer WriteJobLogs(test, job.Namespace, job.Name) - - test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) - test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( - Or( - WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)), - WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), - )) - - // Assert the job has completed successfully - test.Expect(GetJob(test, job.Namespace, job.Name)). - To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) - - } else { - test.T().Skip("Skipping OLM upgarde test because RUN_OLM_TESTS is not set") - } - -} diff --git a/test/e2e/raycluster_sdk.py b/test/e2e/raycluster_sdk.py deleted file mode 100644 index 5a8abf5cd..000000000 --- a/test/e2e/raycluster_sdk.py +++ /dev/null @@ -1,29 +0,0 @@ -import sys - -from time import sleep - -from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration - -namespace = sys.argv[1] - -cluster = Cluster(ClusterConfiguration( - name='mnist', - namespace=namespace, - num_workers=1, - min_cpus='500m', - max_cpus=1, - min_memory=0.5, - max_memory=1, - num_gpus=0, - instascale=False, -)) - -cluster.up() - -cluster.status() - -cluster.wait_ready() - -cluster.status() - -cluster.details() diff --git a/test/e2e/start_ray_cluster.py b/test/e2e/start_ray_cluster.py new file mode 100644 index 000000000..238bccdf2 --- /dev/null +++ b/test/e2e/start_ray_cluster.py @@ -0,0 +1,51 @@ +import sys +import os + +from time import sleep + +from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration + +namespace = sys.argv[1] +ray_image = os.getenv('RAY_IMAGE') +host = os.getenv('CLUSTER_HOSTNAME') + +ingress_options = {} +if host is not None: + ingress_options = { + "ingresses": [ + { + "ingressName": "ray-dashboard", + "port": 8265, + "pathType": "Prefix", + "path": "/", + "host": host, + }, + ] + } + + +cluster = Cluster(ClusterConfiguration( + name='mnist', + namespace=namespace, + num_workers=1, + head_cpus='500m', + head_memory=2, + min_cpus='500m', + max_cpus=1, + min_memory=0.5, + max_memory=2, + num_gpus=0, + instascale=False, + image=ray_image, + ingress_options=ingress_options, +)) + +cluster.up() + +cluster.status() + +cluster.wait_ready() + +cluster.status() + +cluster.details() diff --git a/test/upgrade/olm_upgrade_test.go b/test/upgrade/olm_upgrade_test.go new file mode 100644 index 000000000..0d27c0649 --- /dev/null +++ b/test/upgrade/olm_upgrade_test.go @@ -0,0 +1,403 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package upgrade + +import ( + "testing" + + . "github.com/onsi/gomega" + . "github.com/project-codeflare/codeflare-common/support" + mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + . "github.com/project-codeflare/codeflare-operator/test/e2e" +) + +var ( + namespaceName = "test-ns-olmupgrade" +) + +// Creates a Ray cluster +func TestMNISTRayClusterUp(t *testing.T) { + + test := With(t) + test.T().Parallel() + + // Create a namespace + namespace := CreateTestNamespaceWithName(test, namespaceName) + test.T().Logf("Created namespace %s successfully", namespace.Name) + + // Delete namespace only if test failed + defer func() { + if t.Failed() { + DeleteTestNamespace(test, namespace) + } else { + StoreNamespaceLogs(test, namespace) + } + }() + + // Test configuration + config := CreateConfigMap(test, namespace.Name, map[string][]byte{ + // SDK script + "start_ray_cluster.py": ReadFile(test, "start_ray_cluster.py"), + // codeflare-sdk installation script + "install-codeflare-sdk.sh": ReadFile(test, "install-codeflare-sdk.sh"), + }) + + // Create RBAC, retrieve token for user with limited rights + policyRules := []rbacv1.PolicyRule{ + { + Verbs: []string{"get", "create", "delete", "list", "patch", "update"}, + APIGroups: []string{mcadv1beta1.GroupName}, + Resources: []string{"appwrappers"}, + }, + { + Verbs: []string{"get", "list"}, + APIGroups: []string{rayv1.GroupVersion.Group}, + Resources: []string{"rayclusters", "rayclusters/status"}, + }, + { + Verbs: []string{"get", "list"}, + APIGroups: []string{"route.openshift.io"}, + Resources: []string{"routes"}, + }, + { + Verbs: []string{"get", "list"}, + APIGroups: []string{"networking.k8s.io"}, + Resources: []string{"ingresses"}, + }, + } + + // Create cluster wide RBAC, required for SDK OpenShift check + // TODO reevaluate once SDK change OpenShift detection logic + clusterPolicyRules := []rbacv1.PolicyRule{ + { + Verbs: []string{"get", "list"}, + APIGroups: []string{"config.openshift.io"}, + Resources: []string{"ingresses"}, + ResourceNames: []string{"cluster"}, + }, + } + + sa := CreateServiceAccount(test, namespace.Name) + role := CreateRole(test, namespace.Name, policyRules) + CreateRoleBinding(test, namespace.Name, sa, role) + clusterRole := CreateClusterRole(test, clusterPolicyRules) + CreateClusterRoleBinding(test, sa, clusterRole) + + job := &batchv1.Job{ + TypeMeta: metav1.TypeMeta{ + APIVersion: batchv1.SchemeGroupVersion.String(), + Kind: "Job", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "sdk", + Namespace: namespace.Name, + }, + Spec: batchv1.JobSpec{ + Completions: Ptr(int32(1)), + Parallelism: Ptr(int32(1)), + BackoffLimit: Ptr(int32(0)), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test", + // FIXME: switch to base Python image once the dependency on OpenShift CLI is removed + // See https://github.com/project-codeflare/codeflare-sdk/pull/146 + Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e", + Env: []corev1.EnvVar{ + {Name: "PYTHONUSERBASE", Value: "/workdir"}, + {Name: "RAY_IMAGE", Value: GetRayImage()}, + }, + Command: []string{"/bin/sh", "-c", "cp /test/* . && chmod +x install-codeflare-sdk.sh && ./install-codeflare-sdk.sh && python start_ray_cluster.py" + " " + namespace.Name}, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "test", + MountPath: "/test", + }, + { + Name: "codeflare-sdk", + MountPath: "/codeflare-sdk", + }, + { + Name: "workdir", + MountPath: "/workdir", + }, + }, + WorkingDir: "/workdir", + SecurityContext: &corev1.SecurityContext{ + AllowPrivilegeEscalation: Ptr(false), + SeccompProfile: &corev1.SeccompProfile{ + Type: "RuntimeDefault", + }, + Capabilities: &corev1.Capabilities{ + Drop: []corev1.Capability{"ALL"}, + }, + RunAsNonRoot: Ptr(true), + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "test", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: config.Name, + }, + }, + }, + }, + { + Name: "codeflare-sdk", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + { + Name: "workdir", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + }, + RestartPolicy: corev1.RestartPolicyNever, + ServiceAccountName: sa.Name, + }, + }, + }, + } + if GetClusterType(test) == KindCluster { + // Take first KinD node and redirect pod hostname requests there + node := GetNodes(test)[0] + hostname := GetClusterHostname(test) + IP := GetNodeInternalIP(test, node) + + test.T().Logf("Setting KinD cluster hostname '%s' to node IP '%s' for SDK pod", hostname, IP) + job.Spec.Template.Spec.HostAliases = []corev1.HostAlias{ + { + IP: IP, + Hostnames: []string{hostname}, + }, + } + + // Propagate hostname into Python code as env variable + hostnameEnvVar := corev1.EnvVar{Name: "CLUSTER_HOSTNAME", Value: hostname} + job.Spec.Template.Spec.Containers[0].Env = append(job.Spec.Template.Spec.Containers[0].Env, hostnameEnvVar) + } + + job, err := test.Client().Core().BatchV1().Jobs(namespace.Name).Create(test.Ctx(), job, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name) + + test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) + test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( + Or( + WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)), + WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), + )) + + // Assert the job has completed successfully + test.Expect(GetJob(test, job.Namespace, job.Name)). + To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) +} + +// Submit a Job to the Ray cluster and trains the MNIST dataset using the CodeFlare SDK. +func TestMnistJobSubmit(t *testing.T) { + + test := With(t) + test.T().Parallel() + + namespace := GetNamespaceWithName(test, namespaceName) + + //delete the namespace after test complete + defer DeleteTestNamespace(test, namespace) + + // Test configuration + config := CreateConfigMap(test, namespace.Name, map[string][]byte{ + // SDK script + "mnist_rayjob.py": ReadFile(test, "mnist_rayjob.py"), + // pip requirements + "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), + // MNIST training script + "mnist.py": ReadFile(test, "mnist.py"), + // codeflare-sdk installation script + "install-codeflare-sdk.sh": ReadFile(test, "install-codeflare-sdk.sh"), + }) + + // Create RBAC, retrieve token for user with limited rights + policyRules := []rbacv1.PolicyRule{ + { + Verbs: []string{"get", "create", "delete", "list", "patch", "update"}, + APIGroups: []string{mcadv1beta1.GroupName}, + Resources: []string{"appwrappers"}, + }, + { + Verbs: []string{"get", "list"}, + APIGroups: []string{rayv1.GroupVersion.Group}, + Resources: []string{"rayclusters", "rayclusters/status"}, + }, + { + Verbs: []string{"get", "list"}, + APIGroups: []string{"route.openshift.io"}, + Resources: []string{"routes"}, + }, + { + Verbs: []string{"get", "list"}, + APIGroups: []string{"networking.k8s.io"}, + Resources: []string{"ingresses"}, + }, + } + + // Create cluster wide RBAC, required for SDK OpenShift check + // TODO reevaluate once SDK change OpenShift detection logic + clusterPolicyRules := []rbacv1.PolicyRule{ + { + Verbs: []string{"get", "list"}, + APIGroups: []string{"config.openshift.io"}, + Resources: []string{"ingresses"}, + ResourceNames: []string{"cluster"}, + }, + } + + serviceAccount := CreateServiceAccount(test, namespace.Name) + role := CreateRole(test, namespace.Name, policyRules) + CreateRoleBinding(test, namespace.Name, serviceAccount, role) + clusterRole := CreateClusterRole(test, clusterPolicyRules) + CreateClusterRoleBinding(test, serviceAccount, clusterRole) + + job := &batchv1.Job{ + TypeMeta: metav1.TypeMeta{ + APIVersion: batchv1.SchemeGroupVersion.String(), + Kind: "Job", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "rayjob", + Namespace: namespaceName, + }, + Spec: batchv1.JobSpec{ + Completions: Ptr(int32(1)), + Parallelism: Ptr(int32(1)), + BackoffLimit: Ptr(int32(0)), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test", + // FIXME: switch to base Python image once the dependency on OpenShift CLI is removed + // See https://github.com/project-codeflare/codeflare-sdk/pull/146 + Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e", + Command: []string{"/bin/sh", "-c", "cp /test/* . && chmod +x install-codeflare-sdk.sh && ./install-codeflare-sdk.sh && python mnist_rayjob.py" + " " + namespaceName}, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "test", + MountPath: "/test", + }, + { + Name: "codeflare-sdk", + MountPath: "/codeflare-sdk", + }, + { + Name: "workdir", + MountPath: "/workdir", + }, + }, + WorkingDir: "/workdir", + SecurityContext: &corev1.SecurityContext{ + AllowPrivilegeEscalation: Ptr(false), + SeccompProfile: &corev1.SeccompProfile{ + Type: "RuntimeDefault", + }, + Capabilities: &corev1.Capabilities{ + Drop: []corev1.Capability{"ALL"}, + }, + RunAsNonRoot: Ptr(true), + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "test", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: config.Name, + }, + }, + }, + }, + { + Name: "codeflare-sdk", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + { + Name: "workdir", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + }, + RestartPolicy: corev1.RestartPolicyNever, + ServiceAccountName: serviceAccount.Name, + }, + }, + }, + } + + if GetClusterType(test) == KindCluster { + // Take first KinD node and redirect pod hostname requests there + node := GetNodes(test)[0] + hostname := GetClusterHostname(test) + IP := GetNodeInternalIP(test, node) + + test.T().Logf("Setting KinD cluster hostname '%s' to node IP '%s' for SDK pod", hostname, IP) + job.Spec.Template.Spec.HostAliases = []corev1.HostAlias{ + { + IP: IP, + Hostnames: []string{hostname}, + }, + } + + // Propagate hostname into Python code as env variable + hostnameEnvVar := corev1.EnvVar{Name: "CLUSTER_HOSTNAME", Value: hostname} + job.Spec.Template.Spec.Containers[0].Env = append(job.Spec.Template.Spec.Containers[0].Env, hostnameEnvVar) + } + + job, err := test.Client().Core().BatchV1().Jobs(namespaceName).Create(test.Ctx(), job, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name) + + test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) + test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( + Or( + WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)), + WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), + )) + + // Assert the job has completed successfully + test.Expect(GetJob(test, job.Namespace, job.Name)). + To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) + +} From f96772937dffec7e16efb8e3ccdda17395019ee4 Mon Sep 17 00:00:00 2001 From: Srihari Date: Mon, 4 Dec 2023 16:42:56 +0530 Subject: [PATCH 4/4] Create e2e scenario covering upgrade during training --- .github/workflows/olm_tests.yaml | 8 +- go.mod | 1 + test/e2e/mnist_rayjob.py | 44 ---- test/e2e/start_ray_cluster.py | 51 ----- test/upgrade/olm_upgrade_test.go | 345 +++++++------------------------ 5 files changed, 79 insertions(+), 370 deletions(-) delete mode 100644 test/e2e/mnist_rayjob.py delete mode 100644 test/e2e/start_ray_cluster.py diff --git a/.github/workflows/olm_tests.yaml b/.github/workflows/olm_tests.yaml index 927dced70..1dcb6f2d8 100644 --- a/.github/workflows/olm_tests.yaml +++ b/.github/workflows/olm_tests.yaml @@ -122,12 +122,12 @@ jobs: BUNDLE_PUSH_OPT: "--tls-verify=false" CATALOG_PUSH_OPT: "--tls-verify=false" - - name: Run OLM Upgrade e2e Ray cluster Up test + - name: Run OLM Upgrade e2e AppWrapper creation test run: | export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }} echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV set -euo pipefail - go test -timeout 30m -v ./test/upgrade -run TestMNISTRayClusterUp -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt + go test -timeout 30m -v ./test/upgrade -run TestMNISTCreateAppWrapper -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt - name: Update Operator to the built version run: | @@ -158,12 +158,12 @@ jobs: SUBSCRIPTION_NAME: "codeflare-operator" SUBSCRIPTION_NAMESPACE: "openshift-operators" - - name: Run OLM Upgrade e2e Ray cluster Job submit test + - name: Run OLM Upgrade e2e Appwrapper Job status test to monitor training run: | export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }} echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV set -euo pipefail - go test -timeout 30m -v ./test/upgrade -run TestMnistJobSubmit -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt + go test -timeout 30m -v ./test/upgrade -run TestMNISTCheckAppWrapperStatus -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt - name: Run e2e tests against built operator run: | diff --git a/go.mod b/go.mod index c7405e30e..e9f358d48 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( github.com/onsi/gomega v1.27.10 github.com/openshift/api v0.0.0-20230213134911-7ba313770556 github.com/project-codeflare/codeflare-common v0.0.0-20231129165224-988ba1da9069 + github.com/project-codeflare/instascale v0.3.1 github.com/project-codeflare/multi-cluster-app-dispatcher v1.38.1 github.com/ray-project/kuberay/ray-operator v1.0.0-rc.1 go.uber.org/zap v1.26.0 diff --git a/test/e2e/mnist_rayjob.py b/test/e2e/mnist_rayjob.py deleted file mode 100644 index 7ffb39271..000000000 --- a/test/e2e/mnist_rayjob.py +++ /dev/null @@ -1,44 +0,0 @@ -import sys - -from time import sleep - -from torchx.specs.api import AppState, is_terminal - -from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration -from codeflare_sdk.job.jobs import DDPJobDefinition - -namespace = sys.argv[1] - -cluster = Cluster(ClusterConfiguration(name='mnist',namespace=namespace)) - -jobdef = DDPJobDefinition( - name="mnist", - script="mnist.py", - scheduler_args={"requirements": "requirements.txt"}, -) -job = jobdef.submit(cluster) - -done = False -time = 0 -timeout = 300 -while not done: - status = job.status() - if is_terminal(status.state): - break - if not done: - print(status) - if timeout and time >= timeout: - raise TimeoutError(f"job has timed out after waiting {timeout}s") - sleep(5) - time += 5 - -print(f"Job has completed: {status.state}") - -print(job.logs()) - -cluster.down() - -if not status.state == AppState.SUCCEEDED: - exit(1) -else: - exit(0) diff --git a/test/e2e/start_ray_cluster.py b/test/e2e/start_ray_cluster.py deleted file mode 100644 index 238bccdf2..000000000 --- a/test/e2e/start_ray_cluster.py +++ /dev/null @@ -1,51 +0,0 @@ -import sys -import os - -from time import sleep - -from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration - -namespace = sys.argv[1] -ray_image = os.getenv('RAY_IMAGE') -host = os.getenv('CLUSTER_HOSTNAME') - -ingress_options = {} -if host is not None: - ingress_options = { - "ingresses": [ - { - "ingressName": "ray-dashboard", - "port": 8265, - "pathType": "Prefix", - "path": "/", - "host": host, - }, - ] - } - - -cluster = Cluster(ClusterConfiguration( - name='mnist', - namespace=namespace, - num_workers=1, - head_cpus='500m', - head_memory=2, - min_cpus='500m', - max_cpus=1, - min_memory=0.5, - max_memory=2, - num_gpus=0, - instascale=False, - image=ray_image, - ingress_options=ingress_options, -)) - -cluster.up() - -cluster.status() - -cluster.wait_ready() - -cluster.status() - -cluster.details() diff --git a/test/upgrade/olm_upgrade_test.go b/test/upgrade/olm_upgrade_test.go index 0d27c0649..2abd086ea 100644 --- a/test/upgrade/olm_upgrade_test.go +++ b/test/upgrade/olm_upgrade_test.go @@ -22,29 +22,27 @@ import ( . "github.com/onsi/gomega" . "github.com/project-codeflare/codeflare-common/support" mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" - rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" . "github.com/project-codeflare/codeflare-operator/test/e2e" ) var ( - namespaceName = "test-ns-olmupgrade" + namespaceName = "test-ns-olmupgrade" + appwrapperName = "mnist" + jobName = "mnist-job" ) -// Creates a Ray cluster -func TestMNISTRayClusterUp(t *testing.T) { - +func TestMNISTCreateAppWrapper(t *testing.T) { test := With(t) test.T().Parallel() // Create a namespace namespace := CreateTestNamespaceWithName(test, namespaceName) - test.T().Logf("Created namespace %s successfully", namespace.Name) // Delete namespace only if test failed defer func() { @@ -56,105 +54,61 @@ func TestMNISTRayClusterUp(t *testing.T) { }() // Test configuration - config := CreateConfigMap(test, namespace.Name, map[string][]byte{ - // SDK script - "start_ray_cluster.py": ReadFile(test, "start_ray_cluster.py"), - // codeflare-sdk installation script - "install-codeflare-sdk.sh": ReadFile(test, "install-codeflare-sdk.sh"), - }) - - // Create RBAC, retrieve token for user with limited rights - policyRules := []rbacv1.PolicyRule{ - { - Verbs: []string{"get", "create", "delete", "list", "patch", "update"}, - APIGroups: []string{mcadv1beta1.GroupName}, - Resources: []string{"appwrappers"}, - }, - { - Verbs: []string{"get", "list"}, - APIGroups: []string{rayv1.GroupVersion.Group}, - Resources: []string{"rayclusters", "rayclusters/status"}, + config := &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ConfigMap", }, - { - Verbs: []string{"get", "list"}, - APIGroups: []string{"route.openshift.io"}, - Resources: []string{"routes"}, + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist-mcad", + Namespace: namespace.Name, }, - { - Verbs: []string{"get", "list"}, - APIGroups: []string{"networking.k8s.io"}, - Resources: []string{"ingresses"}, + BinaryData: map[string][]byte{ + // pip requirements + "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), + // MNIST training script + "mnist.py": ReadFile(test, "mnist.py"), }, + Immutable: Ptr(true), } + config, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), config, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name) - // Create cluster wide RBAC, required for SDK OpenShift check - // TODO reevaluate once SDK change OpenShift detection logic - clusterPolicyRules := []rbacv1.PolicyRule{ - { - Verbs: []string{"get", "list"}, - APIGroups: []string{"config.openshift.io"}, - Resources: []string{"ingresses"}, - ResourceNames: []string{"cluster"}, - }, - } - - sa := CreateServiceAccount(test, namespace.Name) - role := CreateRole(test, namespace.Name, policyRules) - CreateRoleBinding(test, namespace.Name, sa, role) - clusterRole := CreateClusterRole(test, clusterPolicyRules) - CreateClusterRoleBinding(test, sa, clusterRole) - + // Batch Job job := &batchv1.Job{ TypeMeta: metav1.TypeMeta{ APIVersion: batchv1.SchemeGroupVersion.String(), Kind: "Job", }, ObjectMeta: metav1.ObjectMeta{ - Name: "sdk", + Name: jobName, Namespace: namespace.Name, }, Spec: batchv1.JobSpec{ - Completions: Ptr(int32(1)), - Parallelism: Ptr(int32(1)), - BackoffLimit: Ptr(int32(0)), + Completions: Ptr(int32(1)), + Parallelism: Ptr(int32(1)), Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ Containers: []corev1.Container{ { - Name: "test", - // FIXME: switch to base Python image once the dependency on OpenShift CLI is removed - // See https://github.com/project-codeflare/codeflare-sdk/pull/146 - Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e", + Name: "job", + Image: GetPyTorchImage(), Env: []corev1.EnvVar{ - {Name: "PYTHONUSERBASE", Value: "/workdir"}, - {Name: "RAY_IMAGE", Value: GetRayImage()}, + corev1.EnvVar{Name: "PYTHONUSERBASE", Value: "/workdir"}, }, - Command: []string{"/bin/sh", "-c", "cp /test/* . && chmod +x install-codeflare-sdk.sh && ./install-codeflare-sdk.sh && python start_ray_cluster.py" + " " + namespace.Name}, + Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"}, VolumeMounts: []corev1.VolumeMount{ { Name: "test", MountPath: "/test", }, - { - Name: "codeflare-sdk", - MountPath: "/codeflare-sdk", - }, { Name: "workdir", MountPath: "/workdir", }, }, WorkingDir: "/workdir", - SecurityContext: &corev1.SecurityContext{ - AllowPrivilegeEscalation: Ptr(false), - SeccompProfile: &corev1.SeccompProfile{ - Type: "RuntimeDefault", - }, - Capabilities: &corev1.Capabilities{ - Drop: []corev1.Capability{"ALL"}, - }, - RunAsNonRoot: Ptr(true), - }, }, }, Volumes: []corev1.Volume{ @@ -168,12 +122,6 @@ func TestMNISTRayClusterUp(t *testing.T) { }, }, }, - { - Name: "codeflare-sdk", - VolumeSource: corev1.VolumeSource{ - EmptyDir: &corev1.EmptyDirVolumeSource{}, - }, - }, { Name: "workdir", VolumeSource: corev1.VolumeSource{ @@ -181,223 +129,78 @@ func TestMNISTRayClusterUp(t *testing.T) { }, }, }, - RestartPolicy: corev1.RestartPolicyNever, - ServiceAccountName: sa.Name, + RestartPolicy: corev1.RestartPolicyNever, }, }, }, } - if GetClusterType(test) == KindCluster { - // Take first KinD node and redirect pod hostname requests there - node := GetNodes(test)[0] - hostname := GetClusterHostname(test) - IP := GetNodeInternalIP(test, node) - test.T().Logf("Setting KinD cluster hostname '%s' to node IP '%s' for SDK pod", hostname, IP) - job.Spec.Template.Spec.HostAliases = []corev1.HostAlias{ - { - IP: IP, - Hostnames: []string{hostname}, + // Create an AppWrapper resource + aw := &mcadv1beta1.AppWrapper{ + ObjectMeta: metav1.ObjectMeta{ + Name: appwrapperName, + Namespace: namespace.Name, + }, + Spec: mcadv1beta1.AppWrapperSpec{ + AggrResources: mcadv1beta1.AppWrapperResourceList{ + GenericItems: []mcadv1beta1.AppWrapperGenericResource{ + { + DesiredAvailable: 1, + CustomPodResources: []mcadv1beta1.CustomPodResourceTemplate{ + { + Replicas: 1, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("250m"), + corev1.ResourceMemory: resource.MustParse("512Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("1G"), + }, + }, + }, + GenericTemplate: Raw(test, job), + CompletionStatus: "Complete", + }, + }, }, - } - - // Propagate hostname into Python code as env variable - hostnameEnvVar := corev1.EnvVar{Name: "CLUSTER_HOSTNAME", Value: hostname} - job.Spec.Template.Spec.Containers[0].Env = append(job.Spec.Template.Spec.Containers[0].Env, hostnameEnvVar) + }, } - job, err := test.Client().Core().BatchV1().Jobs(namespace.Name).Create(test.Ctx(), job, metav1.CreateOptions{}) + _, err = test.Client().MCAD().WorkloadV1beta1().AppWrappers(namespace.Name).Create(test.Ctx(), aw, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name) + test.T().Logf("Created MCAD AppWrapper %s/%s successfully", aw.Namespace, aw.Name) - test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) - test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( - Or( - WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)), - WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), - )) + test.T().Logf("Waiting for MCAD %s/%s to be running", aw.Namespace, aw.Name) + test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutMedium). + Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) - // Assert the job has completed successfully - test.Expect(GetJob(test, job.Namespace, job.Name)). - To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) } -// Submit a Job to the Ray cluster and trains the MNIST dataset using the CodeFlare SDK. -func TestMnistJobSubmit(t *testing.T) { - +func TestMNISTCheckAppWrapperStatus(t *testing.T) { test := With(t) test.T().Parallel() + // get namespace namespace := GetNamespaceWithName(test, namespaceName) //delete the namespace after test complete defer DeleteTestNamespace(test, namespace) - // Test configuration - config := CreateConfigMap(test, namespace.Name, map[string][]byte{ - // SDK script - "mnist_rayjob.py": ReadFile(test, "mnist_rayjob.py"), - // pip requirements - "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), - // MNIST training script - "mnist.py": ReadFile(test, "mnist.py"), - // codeflare-sdk installation script - "install-codeflare-sdk.sh": ReadFile(test, "install-codeflare-sdk.sh"), - }) - - // Create RBAC, retrieve token for user with limited rights - policyRules := []rbacv1.PolicyRule{ - { - Verbs: []string{"get", "create", "delete", "list", "patch", "update"}, - APIGroups: []string{mcadv1beta1.GroupName}, - Resources: []string{"appwrappers"}, - }, - { - Verbs: []string{"get", "list"}, - APIGroups: []string{rayv1.GroupVersion.Group}, - Resources: []string{"rayclusters", "rayclusters/status"}, - }, - { - Verbs: []string{"get", "list"}, - APIGroups: []string{"route.openshift.io"}, - Resources: []string{"routes"}, - }, - { - Verbs: []string{"get", "list"}, - APIGroups: []string{"networking.k8s.io"}, - Resources: []string{"ingresses"}, - }, - } - - // Create cluster wide RBAC, required for SDK OpenShift check - // TODO reevaluate once SDK change OpenShift detection logic - clusterPolicyRules := []rbacv1.PolicyRule{ - { - Verbs: []string{"get", "list"}, - APIGroups: []string{"config.openshift.io"}, - Resources: []string{"ingresses"}, - ResourceNames: []string{"cluster"}, - }, - } - - serviceAccount := CreateServiceAccount(test, namespace.Name) - role := CreateRole(test, namespace.Name, policyRules) - CreateRoleBinding(test, namespace.Name, serviceAccount, role) - clusterRole := CreateClusterRole(test, clusterPolicyRules) - CreateClusterRoleBinding(test, serviceAccount, clusterRole) - - job := &batchv1.Job{ - TypeMeta: metav1.TypeMeta{ - APIVersion: batchv1.SchemeGroupVersion.String(), - Kind: "Job", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "rayjob", - Namespace: namespaceName, - }, - Spec: batchv1.JobSpec{ - Completions: Ptr(int32(1)), - Parallelism: Ptr(int32(1)), - BackoffLimit: Ptr(int32(0)), - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{ - { - Name: "test", - // FIXME: switch to base Python image once the dependency on OpenShift CLI is removed - // See https://github.com/project-codeflare/codeflare-sdk/pull/146 - Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e", - Command: []string{"/bin/sh", "-c", "cp /test/* . && chmod +x install-codeflare-sdk.sh && ./install-codeflare-sdk.sh && python mnist_rayjob.py" + " " + namespaceName}, - VolumeMounts: []corev1.VolumeMount{ - { - Name: "test", - MountPath: "/test", - }, - { - Name: "codeflare-sdk", - MountPath: "/codeflare-sdk", - }, - { - Name: "workdir", - MountPath: "/workdir", - }, - }, - WorkingDir: "/workdir", - SecurityContext: &corev1.SecurityContext{ - AllowPrivilegeEscalation: Ptr(false), - SeccompProfile: &corev1.SeccompProfile{ - Type: "RuntimeDefault", - }, - Capabilities: &corev1.Capabilities{ - Drop: []corev1.Capability{"ALL"}, - }, - RunAsNonRoot: Ptr(true), - }, - }, - }, - Volumes: []corev1.Volume{ - { - Name: "test", - VolumeSource: corev1.VolumeSource{ - ConfigMap: &corev1.ConfigMapVolumeSource{ - LocalObjectReference: corev1.LocalObjectReference{ - Name: config.Name, - }, - }, - }, - }, - { - Name: "codeflare-sdk", - VolumeSource: corev1.VolumeSource{ - EmptyDir: &corev1.EmptyDirVolumeSource{}, - }, - }, - { - Name: "workdir", - VolumeSource: corev1.VolumeSource{ - EmptyDir: &corev1.EmptyDirVolumeSource{}, - }, - }, - }, - RestartPolicy: corev1.RestartPolicyNever, - ServiceAccountName: serviceAccount.Name, - }, - }, - }, - } - - if GetClusterType(test) == KindCluster { - // Take first KinD node and redirect pod hostname requests there - node := GetNodes(test)[0] - hostname := GetClusterHostname(test) - IP := GetNodeInternalIP(test, node) - - test.T().Logf("Setting KinD cluster hostname '%s' to node IP '%s' for SDK pod", hostname, IP) - job.Spec.Template.Spec.HostAliases = []corev1.HostAlias{ - { - IP: IP, - Hostnames: []string{hostname}, - }, - } - - // Propagate hostname into Python code as env variable - hostnameEnvVar := corev1.EnvVar{Name: "CLUSTER_HOSTNAME", Value: hostname} - job.Spec.Template.Spec.Containers[0].Env = append(job.Spec.Template.Spec.Containers[0].Env, hostnameEnvVar) - } - - job, err := test.Client().Core().BatchV1().Jobs(namespaceName).Create(test.Ctx(), job, metav1.CreateOptions{}) + aw, err := test.Client().MCAD().WorkloadV1beta1().AppWrappers(namespace.Name).Get(test.Ctx(), appwrapperName, metav1.GetOptions{}) test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name) + job, err := test.Client().Core().BatchV1().Jobs(namespace.Name).Get(test.Ctx(), jobName, metav1.GetOptions{}) + test.Expect(err).NotTo(HaveOccurred()) test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) - test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( + test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutLong).Should( Or( - WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)), - WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), + WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateCompleted)), + WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateFailed)), )) // Assert the job has completed successfully - test.Expect(GetJob(test, job.Namespace, job.Name)). - To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) + test.Expect(GetAppWrapper(test, namespace, aw.Name)). + To(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateCompleted))) }