Skip to content

Commit a6c9894

Browse files
committed
Create Ray Cluster SDK upgrade scenarios
1 parent 0afa252 commit a6c9894

File tree

4 files changed

+501
-4
lines changed

4 files changed

+501
-4
lines changed

.github/workflows/olm_tests.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,12 +122,12 @@ jobs:
122122
BUNDLE_PUSH_OPT: "--tls-verify=false"
123123
CATALOG_PUSH_OPT: "--tls-verify=false"
124124

125-
- name: Run OLM Upgrade e2e AppWrapper creation test
125+
- name: Run OLM Pre Upgrade test scenarios
126126
run: |
127127
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
128128
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
129129
set -euo pipefail
130-
go test -timeout 30m -v ./test/upgrade -run TestMNISTCreateAppWrapper -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
130+
go test -timeout 30m -v ./test/upgrade -run 'TestMNISTCreateAppWrapper|TestMNISTRayClusterUp' -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
131131
132132
- name: Update Operator to the built version
133133
run: |
@@ -158,12 +158,12 @@ jobs:
158158
SUBSCRIPTION_NAME: "codeflare-operator"
159159
SUBSCRIPTION_NAMESPACE: "openshift-operators"
160160

161-
- name: Run OLM Upgrade e2e Appwrapper Job status test to monitor training
161+
- name: Run OLM Post Upgrade test scenarios
162162
run: |
163163
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
164164
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
165165
set -euo pipefail
166-
go test -timeout 30m -v ./test/upgrade -run TestMNISTCheckAppWrapperStatus -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
166+
go test -timeout 30m -v ./test/upgrade -run 'TestMNISTCheckAppWrapperStatus|TestMnistJobSubmit' -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
167167
168168
- name: Run e2e tests against built operator
169169
run: |

test/e2e/mnist_rayjob.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import sys
2+
import os
3+
4+
from time import sleep
5+
6+
from torchx.specs.api import AppState, is_terminal
7+
8+
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
9+
from codeflare_sdk.job.jobs import DDPJobDefinition
10+
11+
# Submit the MNIST DDP job to the "mnist" Ray cluster, poll it until it reaches
# a terminal state, print its logs, tear the cluster down and report the result
# through the process exit code (0 when the job SUCCEEDED, 1 otherwise).
#
# Usage: python mnist_rayjob.py <namespace>
# Environment:
#   RAY_IMAGE         image handed to ClusterConfiguration (None when unset)
#   CLUSTER_HOSTNAME  when set, host used for the "ray-dashboard" ingress

# Seconds to sleep between two job status polls.
POLL_INTERVAL_SECONDS = 5
# Upper bound, in slept seconds, before the job is declared timed out.
JOB_TIMEOUT_SECONDS = 300

if len(sys.argv) < 2:
    # Fail with a readable message instead of a bare IndexError traceback.
    sys.exit("usage: mnist_rayjob.py <namespace>")

namespace = sys.argv[1]
ray_image = os.getenv('RAY_IMAGE')
host = os.getenv('CLUSTER_HOSTNAME')

# Ingress settings are only supplied when a hostname was provided.
ingress_options = {}
if host is not None:
    ingress_options = {
        "ingresses": [
            {
                "ingressName": "ray-dashboard",
                "port": 8265,
                "pathType": "Prefix",
                "path": "/",
                "host": host,
            },
        ]
    }

# Only a Cluster handle is constructed here; this script never calls
# cluster.up().
# NOTE(review): presumably the "mnist" cluster is already running when this
# script starts (e.g. created by start_ray_cluster.py) -- confirm.
cluster = Cluster(ClusterConfiguration('mnist', namespace, image=ray_image,
                                       ingress_options=ingress_options))

print(cluster.details())

jobdef = DDPJobDefinition(
    name="mnist",
    script="mnist.py",
    scheduler_args={"requirements": "requirements.txt"},
)
job = jobdef.submit(cluster)

# Poll until the job reaches a terminal state or the timeout budget is spent.
# `elapsed` counts only the seconds spent sleeping, not the duration of the
# status calls themselves.
elapsed = 0
while True:
    status = job.status()
    if is_terminal(status.state):
        break
    print(status)
    if elapsed >= JOB_TIMEOUT_SECONDS:
        # The cluster is intentionally not torn down on this path, matching
        # the previous behaviour of the script.
        raise TimeoutError(
            f"job has timed out after waiting {JOB_TIMEOUT_SECONDS}s"
        )
    sleep(POLL_INTERVAL_SECONDS)
    elapsed += POLL_INTERVAL_SECONDS

print(f"Job has completed: {status.state}")

print(job.logs())

cluster.down()

# sys.exit is used instead of the `exit` helper injected by the `site`
# module, which is not guaranteed to be available in every interpreter setup.
sys.exit(0 if status.state == AppState.SUCCEEDED else 1)

test/e2e/start_ray_cluster.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import sys
2+
import os
3+
4+
from time import sleep
5+
from torchx.specs.api import AppState, is_terminal
6+
7+
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
8+
9+
# Bring up the "mnist" Ray cluster in the namespace given as the first
# command-line argument, wait for it to become ready and show its details.
#
# Environment:
#   RAY_IMAGE         image handed to ClusterConfiguration (None when unset)
#   CLUSTER_HOSTNAME  when set, host used for the "ray-dashboard" ingress
namespace = sys.argv[1]
ray_image = os.environ.get('RAY_IMAGE')
host = os.environ.get('CLUSTER_HOSTNAME')

# Without a hostname no ingress settings are passed at all (empty dict).
ingress_options = (
    {
        "ingresses": [
            {
                "ingressName": "ray-dashboard",
                "port": 8265,
                "pathType": "Prefix",
                "path": "/",
                "host": host,
            },
        ]
    }
    if host is not None
    else {}
)

# Resource requests/limits for the head node and the single worker.
cluster_settings = dict(
    name='mnist',
    namespace=namespace,
    num_workers=1,
    head_cpus='500m',
    head_memory=2,
    min_cpus='500m',
    max_cpus=1,
    min_memory=1,
    max_memory=2,
    num_gpus=0,
    instascale=False,
    image=ray_image,
    ingress_options=ingress_options,
)
cluster = Cluster(ClusterConfiguration(**cluster_settings))

# Request the cluster, then query its status before and after waiting for
# readiness.
cluster.up()
cluster.status()

cluster.wait_ready()
cluster.status()

cluster.details()

0 commit comments

Comments
 (0)