@@ -672,34 +672,46 @@ static inline int
672
672
mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t * ugni_module )
673
673
{
674
674
int rc = OPAL_SUCCESS ;
675
+ opal_list_t tmplist ;
676
+ opal_list_t * waitlist = & ugni_module -> ep_wait_list ;
675
677
mca_btl_base_endpoint_t * endpoint = NULL ;
676
678
int count ;
677
679
678
- if (0 == opal_list_get_size (& ugni_module -> ep_wait_list )) {
679
- return 0 ;
680
- }
681
-
682
680
/* check the count before taking the lock to avoid unnecessary locking */
683
- count = opal_list_get_size (& ugni_module -> ep_wait_list );
681
+ count = opal_list_get_size (waitlist );
684
682
if (0 == count ) {
685
683
return 0 ;
686
684
}
687
685
686
+ /* Don't hold the wait-list lock while processing the list as that may lead
687
+ * to a deadlock.
688
+ * Instead, move the wait_list elements into a temporary list and work on that.*/
689
+ OBJ_CONSTRUCT (& tmplist , opal_list_t );
688
690
OPAL_THREAD_LOCK (& ugni_module -> ep_wait_list_lock );
689
- count = opal_list_get_size (& ugni_module -> ep_wait_list );
691
+ opal_list_join (& tmplist , opal_list_get_end (& tmplist ), waitlist );
692
+ OPAL_THREAD_UNLOCK (& ugni_module -> ep_wait_list_lock );
693
+ count = opal_list_get_size (& tmplist );
690
694
do {
691
- endpoint = (mca_btl_base_endpoint_t * ) opal_list_remove_first (& ugni_module -> ep_wait_list );
695
+ endpoint = (mca_btl_base_endpoint_t * ) opal_list_remove_first (& tmplist );
692
696
if (endpoint != NULL ) {
693
697
rc = mca_btl_ugni_progress_send_wait_list (endpoint );
694
698
695
699
if (OPAL_SUCCESS != rc ) {
696
- opal_list_append (& ugni_module -> ep_wait_list , & endpoint -> super );
700
+ opal_list_append (& tmplist , & endpoint -> super );
697
701
} else {
698
702
endpoint -> wait_listed = false;
699
703
}
700
704
}
701
705
} while (endpoint != NULL && -- count > 0 ) ;
702
- OPAL_THREAD_UNLOCK (& ugni_module -> ep_wait_list_lock );
706
+
707
+ /* reinsert unfinished elements into the wait-list */
708
+ count = opal_list_get_size (& tmplist );
709
+ if (0 < count ) {
710
+ OPAL_THREAD_LOCK (& ugni_module -> ep_wait_list_lock );
711
+ opal_list_join (waitlist , opal_list_get_end (waitlist ), & tmplist );
712
+ OPAL_THREAD_UNLOCK (& ugni_module -> ep_wait_list_lock );
713
+ }
714
+ OBJ_DESTRUCT (& tmplist );
703
715
704
716
return rc ;
705
717
}
0 commit comments