Skip to content

Commit 3dba9ec

Browse files
authored
Merge pull request #7033 from jjhursey/v4-fix-sigkill-wait
v4.0.x: Fix the sigkill timeout sleep so that an interrupting SIGCHLD cannot end the wait early
2 parents 106109a + c6fab32 commit 3dba9ec

File tree

1 file changed

+24
-4
lines changed

1 file changed

+24
-4
lines changed

orte/mca/odls/base/odls_base_default_fns.c

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1767,7 +1767,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
17671767
orte_proc_t *child;
17681768
opal_list_t procs_killed;
17691769
orte_proc_t *proc, proctmp;
1770-
int i, j;
1770+
int i, j, ret;
17711771
opal_pointer_array_t procarray, *procptr;
17721772
bool do_cleanup;
17731773
orte_odls_quick_caddy_t *cd;
@@ -1913,7 +1913,17 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
19131913
/* if we are issuing signals, then we need to wait a little
19141914
* and send the next in sequence */
19151915
if (0 < opal_list_get_size(&procs_killed)) {
1916-
sleep(orte_odls_globals.timeout_before_sigkill);
1916+
/* Wait a little. Do so in a loop since sleep() can be interrupted by a
1917+
* signal. Most likely SIGCHLD in this case */
1918+
ret = orte_odls_globals.timeout_before_sigkill;
1919+
while( ret > 0 ) {
1920+
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1921+
"%s Sleep %d sec (total = %d)",
1922+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1923+
ret, orte_odls_globals.timeout_before_sigkill));
1924+
ret = sleep(ret);
1925+
}
1926+
19171927
/* issue a SIGTERM to all */
19181928
OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
19191929
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
@@ -1922,8 +1932,18 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
19221932
ORTE_NAME_PRINT(&cd->child->name)));
19231933
kill_local(cd->child->pid, SIGTERM);
19241934
}
1925-
/* wait a little again */
1926-
sleep(orte_odls_globals.timeout_before_sigkill);
1935+
1936+
/* Wait a little. Do so in a loop since sleep() can be interrupted by a
1937+
* signal. Most likely SIGCHLD in this case */
1938+
ret = orte_odls_globals.timeout_before_sigkill;
1939+
while( ret > 0 ) {
1940+
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1941+
"%s Sleep %d sec (total = %d)",
1942+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1943+
ret, orte_odls_globals.timeout_before_sigkill));
1944+
ret = sleep(ret);
1945+
}
1946+
19271947
/* issue a SIGKILL to all */
19281948
OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
19291949
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,

0 commit comments

Comments
 (0)