@@ -357,6 +357,7 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int mpi_assert, ompi_win_t *win)
     return ret;
 }

+
 int ompi_osc_rdma_start_atomic (ompi_group_t *group, int mpi_assert, ompi_win_t *win)
 {
     ompi_osc_rdma_module_t *module = GET_MODULE (win);
@@ -590,6 +591,82 @@ int ompi_osc_rdma_test_atomic (ompi_win_t *win, int *flag)
     return OMPI_SUCCESS;
 }

+/**
+ * This function implements a different barrier mechanism for Fence,
+ * used when any of the selected btls does not support remote completion.
+ * This barrier is based on imposing the MCA_BTL_IN_ORDER_RDMA_ATOMICS
+ * ordering requirement on the selected btls.
+ */
+static
+int ompi_osc_rdma_fence_barrier_by_ordered_channel (ompi_win_t *win)
+{
+    ompi_osc_rdma_module_t *module = GET_MODULE (win);
+    ompi_osc_rdma_state_t *state = module->state;
+    ompi_osc_rdma_sync_t *sync = &module->all_sync;
+    ompi_osc_rdma_peer_t **peers;
+    ompi_group_t *group;
+    int num_peers;
+    int ret;
+
+    assert (module->btl_order == MCA_BTL_IN_ORDER_RDMA_ATOMICS);
+    OPAL_THREAD_LOCK(&module->lock);
+
+    if (ompi_comm_size (module->comm) == 1) {
+        OPAL_THREAD_UNLOCK(&(module->lock));
+        return OMPI_SUCCESS;
+    }
+
+    ret = ompi_comm_group (module->comm, &group);
+    if (OMPI_SUCCESS != ret) {
+        OPAL_THREAD_UNLOCK(&(module->lock));
+        return ret;
+    }
+
+    num_peers = sync->num_peers;
+    assert (ompi_group_size (group) == num_peers);
+    peers = ompi_osc_rdma_get_peers (module, group);
+    if (NULL == peers) {
+        OPAL_THREAD_UNLOCK(&(module->lock));
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    module->state->num_fenced_peers = 0;
+    OPAL_THREAD_UNLOCK(&(module->lock));
+    ret = module->comm->c_coll->coll_barrier (module->comm, module->comm->c_coll->coll_barrier_module);
+    if (ret) {
+        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "barrier failed!");
+        return ret;
+    }
+
+    /* for each process in the group, increment their number of fenced peers */
+    for (int i = 0 ; i < num_peers ; ++i) {
+        ompi_osc_rdma_peer_t *peer = peers[i];
+        intptr_t target = (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, num_fenced_peers);
+
+        /* using a peer's local state requires the selected btls to support remote completion;
+         * if that were the case, this function would not have been called
+         */
+        assert (!ompi_osc_rdma_peer_local_state (peer));
+        ret = ompi_osc_rdma_lock_btl_op (module, peer, target, MCA_BTL_ATOMIC_ADD, 1, true);
+        if (OMPI_SUCCESS != ret) {
+            return ret;
+        }
+    }
+
+    ompi_osc_rdma_release_peers (peers, num_peers);
+    ompi_group_free (&group);
+
+    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "increased fenced_peer counter of all peers");
+    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "waiting for all peers to increase my counter");
+    while (num_peers != state->num_fenced_peers) {
+        ompi_osc_rdma_progress (module);
+        opal_atomic_mb ();
+    }
+
+    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "received fence message from all peers");
+    return OMPI_SUCCESS;
+}
+
 int ompi_osc_rdma_fence_atomic (int mpi_assert, ompi_win_t *win)
 {
     ompi_osc_rdma_module_t *module = GET_MODULE (win);
@@ -627,7 +704,18 @@ int ompi_osc_rdma_fence_atomic (int mpi_assert, ompi_win_t *win)
     ompi_osc_rdma_sync_rdma_complete (&module->all_sync);

     /* ensure all writes to my memory are complete (both local stores, and RMA operations) */
-    ret = module->comm->c_coll->coll_barrier (module->comm, module->comm->c_coll->coll_barrier_module);
+    if (module->btl_support_remote_completion) {
+        /* if all selected btls support remote completion, then all RMA operations have finished
+         * on the remote side, and a barrier is enough to complete the fence.
+         */
+        ret = module->comm->c_coll->coll_barrier (module->comm, module->comm->c_coll->coll_barrier_module);
+    } else {
+        /*
+         * if any selected btl does not support remote completion, we have to send a completion
+         * message (through the same endpoint used for the data transfers) to every peer, then wait for a message from every peer.
+         */
+        ret = ompi_osc_rdma_fence_barrier_by_ordered_channel (win);
+    }

     if (mpi_assert & MPI_MODE_NOSUCCEED) {
         /* as specified in MPI-3 p 438 3-5 the fence can end an epoch. it isn't explicitly
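
For reference, below is a minimal standalone sketch of the counter-based protocol that the new ompi_osc_rdma_fence_barrier_by_ordered_channel() follows: each process adds 1 to a num_fenced_peers counter on every peer, then waits until its own counter has been incremented by all peers. The sketch uses threads and C11 atomics in place of MPI processes and the btl's ordered RDMA atomics, so it only illustrates the protocol, not the osc/rdma internals (and it omits the counter reset plus the MPI barrier that separates the reset from the increments in the real code).

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_PEERS 4

/* one "remote" counter per simulated process, zero-initialized */
static atomic_int num_fenced_peers[NUM_PEERS];

static void *fence_barrier (void *arg)
{
    int me = (int)(intptr_t) arg;

    /* "remote" atomic add: increment every peer's counter, including my own */
    for (int peer = 0 ; peer < NUM_PEERS ; ++peer) {
        atomic_fetch_add (&num_fenced_peers[peer], 1);
    }

    /* wait until every peer has incremented my counter; the real code also
     * drives ompi_osc_rdma_progress() inside this loop */
    while (atomic_load (&num_fenced_peers[me]) != NUM_PEERS) {
        /* spin */
    }

    printf ("process %d passed the fence barrier\n", me);
    return NULL;
}

int main (void)
{
    pthread_t threads[NUM_PEERS];

    for (int i = 0 ; i < NUM_PEERS ; ++i) {
        pthread_create (&threads[i], NULL, fence_barrier, (void *)(intptr_t) i);
    }
    for (int i = 0 ; i < NUM_PEERS ; ++i) {
        pthread_join (threads[i], NULL);
    }
    return 0;
}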