Skip to content

Commit be67734

Browse files
authored
Merge pull request #6922 from hoopoepg/topic/fixed-hand-on-shmem-finalize-v4.0
SPML/UCX: fixed hang in SHMEM_FINALIZE - v4.0
2 parents e4adbee + 1f9fce8 commit be67734

File tree

2 files changed

+11
-8
lines changed

2 files changed

+11
-8
lines changed

opal/mca/common/ucx/common_ucx.c

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -186,8 +186,11 @@ static void opal_common_ucx_wait_all_requests(void **reqs, int count, ucp_worker
186186
}
187187
}
188188

189-
OPAL_DECLSPEC int opal_common_ucx_del_procs_nofence(opal_common_ucx_del_proc_t *procs, size_t count,
190-
size_t my_rank, size_t max_disconnect, ucp_worker_h worker) {
189+
OPAL_DECLSPEC int opal_common_ucx_del_procs_nofence(opal_common_ucx_del_proc_t *procs,
190+
size_t count, size_t my_rank,
191+
size_t max_disconnect,
192+
ucp_worker_h worker)
193+
{
191194
size_t num_reqs;
192195
size_t max_reqs;
193196
void *dreq, **dreqs;

oshmem/mca/spml/ucx/spml_ucx.c

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -127,18 +127,16 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs)
127127
mca_spml_ucx_ctx_default.ucp_peers[i].ucp_conn = NULL;
128128
}
129129

130-
ret = opal_common_ucx_del_procs(del_procs, nprocs, oshmem_my_proc_id(),
131-
mca_spml_ucx.num_disconnect,
132-
mca_spml_ucx_ctx_default.ucp_worker);
133-
130+
ret = opal_common_ucx_del_procs_nofence(del_procs, nprocs, oshmem_my_proc_id(),
131+
mca_spml_ucx.num_disconnect,
132+
mca_spml_ucx_ctx_default.ucp_worker);
133+
/* No need to barrier here - barrier is called in _shmem_finalize */
134134
free(del_procs);
135135
free(mca_spml_ucx.remote_addrs_tbl);
136136
free(mca_spml_ucx_ctx_default.ucp_peers);
137137

138138
mca_spml_ucx_ctx_default.ucp_peers = NULL;
139139

140-
opal_common_ucx_mca_proc_added();
141-
142140
return ret;
143141
}
144142

@@ -326,6 +324,8 @@ int mca_spml_ucx_add_procs(ompi_proc_t** procs, size_t nprocs)
326324
free(wk_roffs);
327325

328326
SPML_UCX_VERBOSE(50, "*** ADDED PROCS ***");
327+
328+
opal_common_ucx_mca_proc_added();
329329
return OSHMEM_SUCCESS;
330330

331331
error2:

0 commit comments

Comments
 (0)