Skip to content

Commit 599e031

Browse files
authored
Merge pull request #22 from laraPPr/utils_function_nvidia
Create utils function for nvidia-smi check
2 parents 772e408 + 5c7e69d commit 599e031

File tree

5 files changed

+48
-48
lines changed

5 files changed

+48
-48
lines changed

.github/workflows/tests_link_nvidia_host_libraries.yml

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,18 @@ jobs:
1919

2020
- name: Initialize EESSI
2121
uses: eessi/github-action-eessi@v3
22-
22+
23+
- name: Test function nvidia_gpu_available before setup of libraries
24+
run: |
25+
source scripts/utils.sh
26+
if nvidia_gpu_available; then
27+
echo "Error: Found NVIDIA libraries before the mock libraries were set up."
28+
exit 1
29+
else
30+
echo "NVIDIA libraries were not found."
31+
echo "Proceeding to setting up the mock NVIDIA libraries."
32+
fi
33+
2334
- name: Setup mock NVIDIA libraries
2435
run: |
2536
# Run the script to create mock libraries
@@ -48,6 +59,16 @@ jobs:
4859
echo "Updating PATH"
4960
echo "PATH=/tmp/nvidia-bin:$PATH" >> $GITHUB_ENV
5061
62+
- name: Test nvidia_gpu_available after setup of mock libraries
63+
run: |
64+
source scripts/utils.sh
65+
if nvidia_gpu_available; then
66+
echo "mock NVIDIA libraries and nvidia-smi were set up"
67+
else
68+
echo "Error: mock nvidia-smi is not available."
69+
exit 1
70+
fi
71+
5172
- name: Test LD_PRELOAD mode
5273
run: |
5374
echo ">>> Testing LD_PRELOAD mode"

EESSI-install-software.sh

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,6 @@ display_help() {
1717
echo " --skip-cuda-install - disable installing a full CUDA SDK in the host_injections prefix (e.g. in CI)"
1818
}
1919

20-
# Function to check if a command exists
21-
function command_exists() {
22-
command -v "$1" >/dev/null 2>&1
23-
}
24-
2520
function copy_build_log() {
2621
# copy specified build log to specified directory, with some context added
2722
build_log=${1}
@@ -315,18 +310,9 @@ else
315310
fi
316311

317312
# Install NVIDIA drivers in host_injections (if they exist)
318-
if command_exists "nvidia-smi"; then
319-
export LD_LIBRARY_PATH="/.singularity.d/libs:${LD_LIBRARY_PATH}"
320-
nvidia-smi --version
321-
ec=$?
322-
if [ ${ec} -eq 0 ]; then
323-
echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..."
324-
${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
325-
else
326-
echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully."
327-
echo "This script now assumes this is NOT a GPU node."
328-
echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
329-
fi
313+
if nvidia_gpu_available; then
314+
echo "Installing NVIDIA drivers for use in prefix shell..."
315+
${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
330316
fi
331317

332318
if [ ! -z "${shared_fs_path}" ]; then

bot/build.sh

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -220,23 +220,9 @@ BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}")
220220
BUILD_STEP_ARGS+=("--storage" "${STORAGE}")
221221

222222
# add options required to handle NVIDIA support
223-
if command_exists "nvidia-smi"; then
224-
# Accept that this may fail
225-
set +e
226-
nvidia-smi --version
227-
ec=$?
228-
set -e
229-
if [ ${ec} -eq 0 ]; then
230-
echo "Command 'nvidia-smi' found, using available GPU"
231-
BUILD_STEP_ARGS+=("--nvidia" "all")
232-
else
233-
echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully."
234-
echo "This script now assumes this is NOT a GPU node."
235-
echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
236-
BUILD_STEP_ARGS+=("--nvidia" "install")
237-
fi
223+
if nvidia_gpu_available; then
224+
BUILD_STEP_ARGS+=("--nvidia" "all")
238225
else
239-
echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check"
240226
BUILD_STEP_ARGS+=("--nvidia" "install")
241227
fi
242228

bot/test.sh

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -225,20 +225,8 @@ fi
225225
TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro")
226226

227227
# add options required to handle NVIDIA support
228-
if command_exists "nvidia-smi"; then
229-
# Accept that this may fail
230-
set +e
231-
nvidia-smi --version
232-
ec=$?
233-
set -e
234-
if [ ${ec} -eq 0 ]; then
235-
echo "Command 'nvidia-smi' found, using available GPU"
236-
TEST_STEP_ARGS+=("--nvidia" "run")
237-
else
238-
echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully."
239-
echo "This script now assumes this is NOT a GPU node."
240-
echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error."
241-
fi
228+
if nvidia_gpu_available; then
229+
TEST_STEP_ARGS+=("--nvidia" "run")
242230
fi
243231

244232
# prepare arguments to test_suite.sh (specific to test step)

scripts/utils.sh

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,3 +147,22 @@ function get_ipv4_address {
147147
echo "${hipv4}"
148148
return 0
149149
}
150+
151+
#######################################
# Check whether a usable NVIDIA GPU stack is present on this node by
# probing for a working 'nvidia-smi' command.
# Relies on command_exists, which is not defined in this function and
# must already be in scope when this is called.
# Globals:   LD_LIBRARY_PATH (read only; extended just for the probe)
# Arguments: none
# Outputs:   a status message on stdout (plus whatever
#            'nvidia-smi --version' itself prints)
# Returns:   0 if 'nvidia-smi --version' ran successfully,
#            1 if 'nvidia-smi' exists but 'nvidia-smi --version' failed,
#            2 if no 'nvidia-smi' command was found
#######################################
function nvidia_gpu_available {
    # Keep the exit code local: this file is sourced, and callers commonly
    # use a variable named 'ec' themselves.
    local ec=0

    if ! command_exists "nvidia-smi"; then
        echo "No 'nvidia-smi' found, no available GPU."
        return 2
    fi

    # We are careful here in case we are running in a container and
    # LD_LIBRARY_PATH has been wiped. The ':+' expansion only appends the
    # existing value when it is non-empty, which avoids a trailing empty
    # entry (interpreted by the loader as the current directory) and keeps
    # the function usable under 'set -u'.
    # '|| ec=$?' captures the status without tripping 'set -e' in callers
    # that invoke this function outside of a conditional.
    LD_LIBRARY_PATH="/.singularity.d/libs${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" nvidia-smi --version || ec=$?

    if [ "${ec}" -eq 0 ]; then
        echo "Command 'nvidia-smi' found."
        return 0
    fi

    echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully."
    echo "This script now assumes this is NOT a GPU node."
    return 1
}

0 commit comments

Comments
 (0)