Skip to content

Commit be19752

Browse files
authored
Merge pull request #7181 from devreal/btl-ugni-deadlock-v3.1.x
uGNI: Fix potential deadlock when processing outstanding transfers (v3.1.x)
2 parents 259ff9f + c0b6d30 commit be19752

File tree

1 file changed

+21
-9
lines changed

1 file changed

+21
-9
lines changed

opal/mca/btl/ugni/btl_ugni_component.c

Lines changed: 21 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -672,34 +672,46 @@ static inline int
672672
mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t *ugni_module)
673673
{
674674
int rc = OPAL_SUCCESS;
675+
opal_list_t tmplist;
676+
opal_list_t *waitlist = &ugni_module->ep_wait_list;
675677
mca_btl_base_endpoint_t *endpoint = NULL;
676678
int count;
677679

678-
if (0 == opal_list_get_size(&ugni_module->ep_wait_list)) {
679-
return 0;
680-
}
681-
682680
/* check the count before taking the lock to avoid unnecessary locking */
683-
count = opal_list_get_size(&ugni_module->ep_wait_list);
681+
count = opal_list_get_size(waitlist);
684682
if (0 == count) {
685683
return 0;
686684
}
687685

686+
/* Don't hold the wait-list lock while processing the list as that may lead
687+
* to a deadlock.
688+
* Instead, move the wait_list elements into a temporary list and work on that.*/
689+
OBJ_CONSTRUCT(&tmplist, opal_list_t);
688690
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
689-
count = opal_list_get_size(&ugni_module->ep_wait_list);
691+
opal_list_join(&tmplist, opal_list_get_end(&tmplist), waitlist);
692+
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
693+
count = opal_list_get_size(&tmplist);
690694
do {
691-
endpoint = (mca_btl_base_endpoint_t *) opal_list_remove_first (&ugni_module->ep_wait_list);
695+
endpoint = (mca_btl_base_endpoint_t *) opal_list_remove_first (&tmplist);
692696
if (endpoint != NULL) {
693697
rc = mca_btl_ugni_progress_send_wait_list (endpoint);
694698

695699
if (OPAL_SUCCESS != rc) {
696-
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
700+
opal_list_append (&tmplist, &endpoint->super);
697701
} else {
698702
endpoint->wait_listed = false;
699703
}
700704
}
701705
} while (endpoint != NULL && --count > 0) ;
702-
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
706+
707+
/* reinsert unfinished elements into the wait-list */
708+
count = opal_list_get_size(&tmplist);
709+
if (0 < count) {
710+
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
711+
opal_list_join(waitlist, opal_list_get_end(waitlist), &tmplist);
712+
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
713+
}
714+
OBJ_DESTRUCT(&tmplist);
703715

704716
return rc;
705717
}

0 commit comments

Comments
 (0)