Skip to content

Commit eaf4d52

Browse files
authored
Merge pull request #68 from validatedpatterns/update_gpu_tests
Expect one GPU node, update nvidia pod count verification
2 parents: 96e9184 + 12d1f13; commit: eaf4d52

File tree

1 file changed

+21
-34
lines changed

1 file changed

+21
-34
lines changed

tests/interop/test_validate_gpu_nodes.py

Lines changed: 21 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,11 @@
11
import logging
22
import os
33
import re
4-
import subprocess
54

65
import pytest
76
from ocp_resources.machine_set import MachineSet
87
from ocp_resources.node import Node
8+
from ocp_resources.pod import Pod
99

1010
from . import __loggername__
1111

@@ -113,6 +113,7 @@ def test_validate_gpu_node_role_labels_pods(openshift_dyn_client):
113113

114114
nodes = Node.get(dyn_client=openshift_dyn_client)
115115
gpu_nodes = []
116+
expected_count = 1
116117
for node in nodes:
117118
logger.info(node.instance.metadata.name)
118119
labels = node.instance.metadata.labels
@@ -125,9 +126,7 @@ def test_validate_gpu_node_role_labels_pods(openshift_dyn_client):
125126
if odh_label in label_str and worker_label in label_str:
126127
gpu_nodes.append(node)
127128

128-
# logger.info(node_count)
129-
130-
if len(gpu_nodes) == 3:
129+
if len(gpu_nodes) == int(expected_count):
131130
logger.info("PASS: Found 'worker' and 'odh-notebook' GPU node-role labels")
132131
else:
133132
err_msg = "Could not find 'worker' and 'odh-notebook' GPU node-role label"
@@ -139,35 +138,23 @@ def test_validate_gpu_node_role_labels_pods(openshift_dyn_client):
139138
"""
140139
logger.info("Checking pod count on GPU nodes")
141140

142-
for gpu_node in gpu_nodes:
143-
name = gpu_node.instance.metadata.name
144-
field_select = "--field-selector=spec.host=" + name
145-
pod_count = 0
146-
expected_count = 20
147-
failed_nodes = []
148-
cmd_out = subprocess.run(
149-
[oc, "get", "pod", "-A", field_select, "--no-headers"], capture_output=True
150-
)
151-
152-
if cmd_out.stdout:
153-
out_decoded = cmd_out.stdout.decode("utf-8")
154-
logger.info(node.instance.metadata.name + "\n" + out_decoded)
155-
out_split = out_decoded.splitlines()
156-
157-
for line in out_split:
158-
if "Completed" in line:
159-
continue
160-
else:
161-
pod_count += 1
162-
163-
if pod_count < expected_count:
164-
failed_nodes.append(node.instance.metadata.name)
165-
else:
166-
assert False, cmd_out.stderr
167-
168-
if failed_nodes:
169-
err_msg = f"Did not find the expected pod count on: {failed_nodes}"
141+
# We are assuming one GPU node
142+
gpu_node = gpu_nodes[0].instance.metadata.name
143+
nvidia_pods = []
144+
expected_count = 8
145+
project = "nvidia-gpu-operator"
146+
pods = Pod.get(dyn_client=openshift_dyn_client, namespace=project)
147+
148+
for pod in pods:
149+
if "nvidia" in pod.instance.metadata.name:
150+
logger.info(f"nvidia pod: {pod.instance.metadata.name}")
151+
if gpu_node in pod.instance.spec.nodeName:
152+
logger.info(f"nvidia pod node name: {pod.instance.spec.nodeName}")
153+
nvidia_pods.append(pod.instance.metadata.name)
154+
155+
if len(nvidia_pods) == int(expected_count):
156+
logger.info("PASS: Found the expected nvidia pod count for GPU nodes")
157+
else:
158+
err_msg = "Did not find the expected nvidia pod count for GPU nodes"
170159
logger.error(f"FAIL: {err_msg}")
171160
assert False, err_msg
172-
else:
173-
logger.info("PASS: Found the expected pod count for GPU nodes")

0 commit comments

Comments
 (0)