Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 1039234

Browse files
authored May 15, 2025
Fix a bug that was causing compute node self‑termination to hang on Ubuntu 24.04 nodes (#662)
1 parent 6a73bdd commit 1039234

File tree

3 files changed

+52
-6
lines changed

3 files changed

+52
-6
lines changed
 

‎CHANGELOG.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,8 @@ This file is used to list changes made in each version of the aws-parallelcluste
66
3.13.1
77
------
88

9-
**CHANGES**
10-
- There were no changes for this version.
9+
**BUG FIXES**
10+
- Fix a bug that was causing compute node self‑termination to hang on Ubuntu 24.04 nodes.
1111

1212
3.13.0
1313
------

‎src/slurm_plugin/computemgtd.py

Lines changed: 21 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -125,14 +125,34 @@ def _read_nodename_from_file(nodename_file_path):
125125
raise
126126

127127

128+
def _is_ubuntu2404():
129+
"""Return True if the OS is Ubuntu 24.04."""
130+
try:
131+
with open("/etc/os-release", "r") as f:
132+
info = dict(line.strip().split("=", 1) for line in f if "=" in line)
133+
os_id = info.get("ID", "").strip('"').lower()
134+
version = info.get("VERSION_ID", "").strip('"')
135+
return os_id == "ubuntu" and version.startswith("24.04")
136+
except Exception as e:
137+
log.warning("Unable to detect OS version from /etc/os-release: %s", e)
138+
return False
139+
140+
128141
@log_exception(log, "self terminating compute instance", catch_exception=CalledProcessError, raise_on_error=False)
129142
def _self_terminate():
130143
"""Self terminate the instance."""
131144
# Sleep for 10 seconds so termination log entries are uploaded to CW logs
132145
log.info("Preparing to self terminate the instance in 10 seconds!")
133146
time.sleep(10)
147+
if _is_ubuntu2404():
148+
shutdown_cmd = "sudo systemctl poweroff --force"
149+
log.info("Detected Ubuntu 24.04 – using `%s`", shutdown_cmd)
150+
else:
151+
shutdown_cmd = "sudo shutdown -h now"
152+
log.info("Using default shutdown command `%s`", shutdown_cmd)
153+
134154
log.info("Self terminating instance now!")
135-
run_command("sudo shutdown -h now")
155+
run_command(shutdown_cmd)
136156

137157

138158
@retry(stop_max_attempt_number=3, wait_fixed=1500)

‎tests/slurm_plugin/test_computemgtd.py

Lines changed: 29 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -12,11 +12,12 @@
1212

1313
import logging
1414
import os
15+
from unittest.mock import mock_open
1516

1617
import pytest
1718
import slurm_plugin
1819
from assertpy import assert_that
19-
from slurm_plugin.computemgtd import ComputemgtdConfig, _is_self_node_down, _self_terminate
20+
from slurm_plugin.computemgtd import ComputemgtdConfig, _is_self_node_down, _is_ubuntu2404, _self_terminate
2021
from slurm_plugin.slurm_resources import DynamicNode
2122

2223

@@ -103,13 +104,38 @@ def test_is_self_node_down(mock_node_info, expected_result, mocker):
103104
assert_that(_is_self_node_down("queue1-st-c5xlarge-1")).is_equal_to(expected_result)
104105

105106

106-
def test_self_terminate(mocker, caplog):
107+
@pytest.mark.parametrize(
108+
("is_ubuntu2404", "expected_cmd"),
109+
[
110+
(True, "sudo systemctl poweroff --force"),
111+
(False, "sudo shutdown -h now"),
112+
],
113+
)
114+
def test_self_terminate(mocker, caplog, is_ubuntu2404, expected_cmd):
107115
"""Verify self-termination is implemented via a shutdown command rather than calling TerminateInstances."""
116+
mocker.patch("slurm_plugin.computemgtd._is_ubuntu2404", return_value=is_ubuntu2404)
108117
run_command_patch = mocker.patch("slurm_plugin.computemgtd.run_command")
109118
sleep_patch = mocker.patch("slurm_plugin.computemgtd.time.sleep")
110119
with caplog.at_level(logging.INFO):
111120
_self_terminate()
112121
assert_that(caplog.text).contains("Preparing to self terminate the instance in 10 seconds!")
113122
assert_that(caplog.text).contains("Self terminating instance now!")
114-
run_command_patch.assert_called_with("sudo shutdown -h now")
123+
run_command_patch.assert_called_with(expected_cmd)
115124
sleep_patch.assert_called_with(10)
125+
126+
127+
@pytest.mark.parametrize(
128+
("file_content", "expected"),
129+
[
130+
('ID=ubuntu\nVERSION_ID="24.04"\n', True),
131+
('ID=ubuntu\nVERSION_ID="24.04.2"\n', True),
132+
('ID=ubuntu\nVERSION_ID="22.04"\n', False),
133+
('ID=rocky\nVERSION_ID="9.3"\n', False),
134+
("ID=ubuntu\n", False),
135+
],
136+
)
137+
def test_is_ubuntu2404(file_content, expected, mocker):
138+
m = mock_open(read_data=file_content)
139+
mocker.patch("builtins.open", m)
140+
141+
assert _is_ubuntu2404() is expected

0 commit comments

Comments
 (0)
Please sign in to comment.