Skip to content

Commit db0d3a9

Browse files
committed
adjust e2e test so that RayCluster really is managed by Kueue
1 parent d4a2f11 commit db0d3a9

File tree

7 files changed

+88
-91
lines changed

7 files changed

+88
-91
lines changed

.github/workflows/e2e_tests.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,6 @@ jobs:
5555
- name: Setup and start KinD cluster
5656
uses: ./common/github-actions/kind
5757

58-
- name: Deploy Kueue
59-
run: |
60-
make kueue-e2e
61-
6258
- name: Deploy CodeFlare stack
6359
id: deploy
6460
run: |

Makefile

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -389,13 +389,9 @@ test-e2e: manifests fmt vet ## Run e2e tests.
389389
kind-e2e: ## Set up e2e KinD cluster
390390
test/e2e/kind.sh
391391

392-
.PHONY: kueue-e2e
393-
kueue-e2e: ## Deploy Kueue for e2e tests
394-
KUEUE_VERSION=$(KUEUE_VERSION) test/e2e/kueue.sh
395-
396392
.PHONY: setup-e2e
397393
setup-e2e: ## Set up e2e tests.
398-
KUBERAY_VERSION=$(KUBERAY_VERSION) test/e2e/setup.sh
394+
KUBERAY_VERSION=$(KUBERAY_VERSION) KUEUE_VERSION=$(KUEUE_VERSION) test/e2e/setup.sh
399395

400396
.PHONY: imports
401397
imports: openshift-goimports ## Organize imports in go files using openshift-goimports. Example: make imports

README.md

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,6 @@ The e2e tests can be executed locally by running the following commands:
3333
```bash
3434
# Create a KinD cluster
3535
make kind-e2e
36-
# Deploy Kueue
37-
make kueue-e2e
3836
# Install the CRDs
3937
make install
4038
```
@@ -43,20 +41,15 @@ The e2e tests can be executed locally by running the following commands:
4341
Some e2e tests cover the access to services via Ingresses, as end-users would do, which requires access to the Ingress controller load balancer by its IP.
4442
For it to work on macOS, this requires installing [docker-mac-net-connect](https://github.com/chipmk/docker-mac-net-connect).
4543

46-
2. Start the operator locally:
47-
48-
```bash
49-
NAMESPACE=default make run
50-
```
51-
52-
Alternatively, You can run the operator from your IDE / debugger.
53-
54-
3. Set up the test CodeFlare stack:
44+
2. Set up the rest of the CodeFlare stack.
5545

5646
```bash
5747
make setup-e2e
5848
```
5949

50+
[!NOTE]
51+
Kueue will only activate its Ray integration if KubeRay is installed before Kueue (as done by this make target).
52+
6053
[!NOTE]
6154
In OpenShift the KubeRay operator pod gets random user assigned. This user is then used to run Ray cluster.
6255
However the random user assigned by OpenShift doesn't have rights to store dataset downloaded as part of test execution, causing tests to fail.
@@ -76,6 +69,14 @@ The e2e tests can be executed locally by running the following commands:
7669
- 'system:serviceaccount:$(namespace):kuberay-operator'
7770
```
7871
72+
3. Start the operator locally:
73+
74+
```bash
75+
NAMESPACE=default make run
76+
```
77+
78+
Alternatively, you can run the operator from your IDE / debugger.
79+
7980
4. In a separate terminal, set your output directory for test files, and run the e2e suite:
8081
```bash
8182
export CODEFLARE_TEST_OUTPUT_DIR=<your_output_directory>

test/e2e/kueue.sh

Lines changed: 0 additions & 31 deletions
This file was deleted.

test/e2e/mnist_rayjob_raycluster_test.go

Lines changed: 53 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,57 @@ import (
3535
// directly managed by Kueue, and asserts successful completion of the training job.
3636
func TestMNISTRayJobRayCluster(t *testing.T) {
3737
test := With(t)
38-
test.T().Parallel()
3938

4039
// Create a namespace and localqueue in that namespace
4140
namespace := test.NewTestNamespace()
41+
localQueue := CreateKueueLocalQueue(test, namespace.Name, "e2e-cluster-queue")
4242

43-
// MNIST training script
44-
mnist := &corev1.ConfigMap{
43+
// Create MNIST training script
44+
mnist := constructMNISTConfigMap(test, namespace)
45+
mnist, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), mnist, metav1.CreateOptions{})
46+
test.Expect(err).NotTo(HaveOccurred())
47+
test.T().Logf("Created ConfigMap %s/%s successfully", mnist.Namespace, mnist.Name)
48+
49+
// Create RayCluster and assign it to the localqueue
50+
rayCluster := constructRayCluster(test, namespace, mnist)
51+
AssignToLocalQueue(rayCluster, localQueue)
52+
rayCluster, err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Create(test.Ctx(), rayCluster, metav1.CreateOptions{})
53+
test.Expect(err).NotTo(HaveOccurred())
54+
test.T().Logf("Created RayCluster %s/%s successfully", rayCluster.Namespace, rayCluster.Name)
55+
56+
test.T().Logf("Waiting for RayCluster %s/%s to be running", rayCluster.Namespace, rayCluster.Name)
57+
test.Eventually(RayCluster(test, namespace.Name, rayCluster.Name), TestTimeoutMedium).
58+
Should(WithTransform(RayClusterState, Equal(rayv1.Ready)))
59+
60+
// Create RayJob
61+
rayJob := constructRayJob(test, namespace, rayCluster)
62+
rayJob, err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Create(test.Ctx(), rayJob, metav1.CreateOptions{})
63+
test.Expect(err).NotTo(HaveOccurred())
64+
test.T().Logf("Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name)
65+
66+
rayDashboardURL := getRayDashboardURL(test, rayCluster.Namespace, rayCluster.Name)
67+
68+
test.T().Logf("Connecting to Ray cluster at: %s", rayDashboardURL.String())
69+
rayClient := NewRayClusterClient(rayDashboardURL)
70+
71+
// Wait for Ray job id to be available, this value is needed for writing logs in defer
72+
test.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutShort).
73+
Should(WithTransform(RayJobId, Not(BeEmpty())))
74+
75+
// Retrieving the job logs once it has completed or timed out
76+
defer WriteRayJobAPILogs(test, rayClient, GetRayJobId(test, rayJob.Namespace, rayJob.Name))
77+
78+
test.T().Logf("Waiting for RayJob %s/%s to complete", rayJob.Namespace, rayJob.Name)
79+
test.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutLong).
80+
Should(WithTransform(RayJobStatus, Satisfy(rayv1.IsJobTerminal)))
81+
82+
// Assert the Ray job has completed successfully
83+
test.Expect(GetRayJob(test, rayJob.Namespace, rayJob.Name)).
84+
To(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded)))
85+
}
86+
87+
func constructMNISTConfigMap(test Test, namespace *corev1.Namespace) *corev1.ConfigMap {
88+
return &corev1.ConfigMap{
4589
TypeMeta: metav1.TypeMeta{
4690
APIVersion: corev1.SchemeGroupVersion.String(),
4791
Kind: "ConfigMap",
@@ -55,12 +99,10 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
5599
},
56100
Immutable: Ptr(true),
57101
}
58-
mnist, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), mnist, metav1.CreateOptions{})
59-
test.Expect(err).NotTo(HaveOccurred())
60-
test.T().Logf("Created ConfigMap %s/%s successfully", mnist.Namespace, mnist.Name)
102+
}
61103

62-
// RayCluster
63-
rayCluster := &rayv1.RayCluster{
104+
func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.ConfigMap) *rayv1.RayCluster {
105+
return &rayv1.RayCluster{
64106
TypeMeta: metav1.TypeMeta{
65107
APIVersion: rayv1.GroupVersion.String(),
66108
Kind: "RayCluster",
@@ -173,16 +215,10 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
173215
},
174216
},
175217
}
218+
}
176219

177-
_, err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Create(test.Ctx(), rayCluster, metav1.CreateOptions{})
178-
test.Expect(err).NotTo(HaveOccurred())
179-
test.T().Logf("Created RayCluster %s/%s successfully", rayCluster.Namespace, rayCluster.Name)
180-
181-
test.T().Logf("Waiting for RayCluster %s/%s to be running", rayCluster.Namespace, rayCluster.Name)
182-
test.Eventually(RayCluster(test, namespace.Name, rayCluster.Name), TestTimeoutMedium).
183-
Should(WithTransform(RayClusterState, Equal(rayv1.Ready)))
184-
185-
rayJob := &rayv1.RayJob{
220+
func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayCluster) *rayv1.RayJob {
221+
return &rayv1.RayJob{
186222
TypeMeta: metav1.TypeMeta{
187223
APIVersion: rayv1.GroupVersion.String(),
188224
Kind: "RayJob",
@@ -220,29 +256,6 @@ func TestMNISTRayJobRayCluster(t *testing.T) {
220256
},
221257
},
222258
}
223-
rayJob, err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Create(test.Ctx(), rayJob, metav1.CreateOptions{})
224-
test.Expect(err).NotTo(HaveOccurred())
225-
test.T().Logf("Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name)
226-
227-
rayDashboardURL := getRayDashboardURL(test, rayCluster.Namespace, rayCluster.Name)
228-
229-
test.T().Logf("Connecting to Ray cluster at: %s", rayDashboardURL.String())
230-
rayClient := NewRayClusterClient(rayDashboardURL)
231-
232-
// Wait for Ray job id to be available, this value is needed for writing logs in defer
233-
test.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutShort).
234-
Should(WithTransform(RayJobId, Not(BeEmpty())))
235-
236-
// Retrieving the job logs once it has completed or timed out
237-
defer WriteRayJobAPILogs(test, rayClient, GetRayJobId(test, rayJob.Namespace, rayJob.Name))
238-
239-
test.T().Logf("Waiting for RayJob %s/%s to complete", rayJob.Namespace, rayJob.Name)
240-
test.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutLong).
241-
Should(WithTransform(RayJobStatus, Satisfy(rayv1.IsJobTerminal)))
242-
243-
// Assert the Ray job has completed successfully
244-
test.Expect(GetRayJob(test, rayJob.Namespace, rayJob.Name)).
245-
To(WithTransform(RayJobStatus, Equal(rayv1.JobStatusSucceeded)))
246259
}
247260

248261
func getRayDashboardURL(test Test, namespace, rayClusterName string) url.URL {

test/e2e/setup.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,17 @@ roleRef:
5757
name: e2e-controller-rayclusters
5858
EOF
5959

60+
echo "Deploying Kueue $KUEUE_VERSION"
61+
kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
62+
63+
# Wait until the Kueue manager pods are running
64+
echo "Waiting for pods in the kueue-system namespace to become ready"
65+
while [[ $(kubectl get pods -n kueue-system -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' | tr ' ' '\n' | sort -u) != "True" ]]
66+
do
67+
echo -n "." && sleep 1;
68+
done
69+
echo ""
70+
6071
echo Creating Kueue ResourceFlavor and ClusterQueue
6172
cat <<EOF | kubectl apply -f -
6273
apiVersion: kueue.x-k8s.io/v1beta1

test/e2e/support.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ import (
2121

2222
"github.com/onsi/gomega"
2323
"github.com/project-codeflare/codeflare-common/support"
24+
"sigs.k8s.io/controller-runtime/pkg/client"
25+
kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"
2426
)
2527

2628
//go:embed *.py *.txt *.sh
@@ -32,3 +34,12 @@ func ReadFile(t support.Test, fileName string) []byte {
3234
t.Expect(err).NotTo(gomega.HaveOccurred())
3335
return file
3436
}
37+
38+
func AssignToLocalQueue(object client.Object, localqueue *kueuev1beta1.LocalQueue) {
39+
labels := object.GetLabels()
40+
if labels == nil {
41+
labels = make(map[string]string)
42+
}
43+
labels["kueue.x-k8s.io/queue-name"] = localqueue.Name
44+
object.SetLabels(labels)
45+
}

0 commit comments

Comments
 (0)