1
1
import logging
2
2
import os
3
3
import re
4
- import subprocess
5
4
6
5
import pytest
7
6
from ocp_resources .machine_set import MachineSet
8
7
from ocp_resources .node import Node
8
+ from ocp_resources .pod import Pod
9
9
10
10
from . import __loggername__
11
11
@@ -113,6 +113,7 @@ def test_validate_gpu_node_role_labels_pods(openshift_dyn_client):
113
113
114
114
nodes = Node .get (dyn_client = openshift_dyn_client )
115
115
gpu_nodes = []
116
+ expected_count = 1
116
117
for node in nodes :
117
118
logger .info (node .instance .metadata .name )
118
119
labels = node .instance .metadata .labels
@@ -125,9 +126,7 @@ def test_validate_gpu_node_role_labels_pods(openshift_dyn_client):
125
126
if odh_label in label_str and worker_label in label_str :
126
127
gpu_nodes .append (node )
127
128
128
- # logger.info(node_count)
129
-
130
- if len (gpu_nodes ) == 3 :
129
+ if len (gpu_nodes ) == int (expected_count ):
131
130
logger .info ("PASS: Found 'worker' and 'odh-notebook' GPU node-role labels" )
132
131
else :
133
132
err_msg = "Could not find 'worker' and 'odh-notebook' GPU node-role label"
@@ -139,35 +138,23 @@ def test_validate_gpu_node_role_labels_pods(openshift_dyn_client):
139
138
"""
140
139
logger .info ("Checking pod count on GPU nodes" )
141
140
142
- for gpu_node in gpu_nodes :
143
- name = gpu_node .instance .metadata .name
144
- field_select = "--field-selector=spec.host=" + name
145
- pod_count = 0
146
- expected_count = 20
147
- failed_nodes = []
148
- cmd_out = subprocess .run (
149
- [oc , "get" , "pod" , "-A" , field_select , "--no-headers" ], capture_output = True
150
- )
151
-
152
- if cmd_out .stdout :
153
- out_decoded = cmd_out .stdout .decode ("utf-8" )
154
- logger .info (node .instance .metadata .name + "\n " + out_decoded )
155
- out_split = out_decoded .splitlines ()
156
-
157
- for line in out_split :
158
- if "Completed" in line :
159
- continue
160
- else :
161
- pod_count += 1
162
-
163
- if pod_count < expected_count :
164
- failed_nodes .append (node .instance .metadata .name )
165
- else :
166
- assert False , cmd_out .stderr
167
-
168
- if failed_nodes :
169
- err_msg = f"Did not find the expected pod count on: { failed_nodes } "
141
+ # We are assuming one GPU node
142
+ gpu_node = gpu_nodes [0 ].instance .metadata .name
143
+ nvidia_pods = []
144
+ expected_count = 8
145
+ project = "nvidia-gpu-operator"
146
+ pods = Pod .get (dyn_client = openshift_dyn_client , namespace = project )
147
+
148
+ for pod in pods :
149
+ if "nvidia" in pod .instance .metadata .name :
150
+ logger .info (f"nvidia pod: { pod .instance .metadata .name } " )
151
+ if gpu_node in pod .instance .spec .nodeName :
152
+ logger .info (f"nvidia pod node name: { pod .instance .spec .nodeName } " )
153
+ nvidia_pods .append (pod .instance .metadata .name )
154
+
155
+ if len (nvidia_pods ) == int (expected_count ):
156
+ logger .info ("PASS: Found the expected nvidia pod count for GPU nodes" )
157
+ else :
158
+ err_msg = "Did not find the expected nvidia pod count for GPU nodes"
170
159
logger .error (f"FAIL: { err_msg } " )
171
160
assert False , err_msg
172
- else :
173
- logger .info ("PASS: Found the expected pod count for GPU nodes" )
0 commit comments