@@ -78,7 +78,7 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
78
78
static int ompi_osc_rdma_component_select (struct ompi_win_t * win , void * * base , size_t size , int disp_unit ,
79
79
struct ompi_communicator_t * comm , struct opal_info_t * info ,
80
80
int flavor , int * model );
81
- static int ompi_osc_rdma_query_btls (ompi_communicator_t * comm , ompi_osc_rdma_module_t * module );
81
+ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t * comm , ompi_osc_rdma_module_t * module );
82
82
static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t * comm , ompi_osc_rdma_module_t * module );
83
83
84
84
static const char * ompi_osc_rdma_set_no_lock_info (opal_infosubscriber_t * obj , const char * key , const char * value );
@@ -395,7 +395,7 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
395
395
}
396
396
#endif /* OPAL_CUDA_SUPPORT */
397
397
398
- if (OMPI_SUCCESS == ompi_osc_rdma_query_btls (comm , NULL )) {
398
+ if (OMPI_SUCCESS == ompi_osc_rdma_query_accelerated_btls (comm , NULL )) {
399
399
return mca_osc_rdma_component .priority ;
400
400
}
401
401
@@ -882,7 +882,7 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
882
882
* @return OMPI_ERR_UNREACH if no BTLs can be found that match
883
883
*
884
884
* In this case an "alternate" BTL is a BTL that does not meet the
885
- * requirements of a BTL outlined in ompi_osc_rdma_query_btls ().
885
+ * requirements of a BTL outlined in ompi_osc_rdma_query_accelerated_btls ().
886
886
* Either it does not provide connectivity to all peers, provide
887
887
* remote completion, or natively support put/get/atomic. Since more
888
888
* than one BTL may be needed for this support the OSC component will
@@ -937,6 +937,20 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o
937
937
return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH ;
938
938
}
939
939
940
+ /* Check for BTL requirements:
941
+ * 1) RDMA (put/get) and ATOMIC operations. We only require cswap
942
+ * and fetch and add and will emulate other operations with those
943
+ * two as necessary.
944
+ * 2) Remote Completion
945
+ */
946
+ static bool ompi_osc_rdma_check_accelerated_btl (struct mca_btl_base_module_t * btl ) /* true only when every capability bit tested below is set on btl */
947
+ {
948
+ return ((btl -> btl_flags & MCA_BTL_FLAGS_RDMA ) && /* requirement 1: RDMA (put/get) */
949
+ (btl -> btl_flags & MCA_BTL_FLAGS_ATOMIC_FOPS ) && /* requirement 1: atomic operations (FOPS flag) */
950
+ (btl -> btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION ) && /* requirement 2: remote completion */
951
+ (btl -> btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_ADD )); /* NOTE(review): only the ADD capability is tested; the header comment also mentions cswap, which is not checked here -- confirm intent */
952
+ }
953
+
940
954
/*
941
955
* Attempt to find a BTL that can be used for native RDMA
942
956
*
@@ -957,18 +971,12 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o
957
971
* If module is NULL, the code acts as a query mechanism to find any
958
972
* potential BTLs, and is used to implement osc_rdma_query().
959
973
*/
960
- static int ompi_osc_rdma_query_btls (ompi_communicator_t * comm , ompi_osc_rdma_module_t * module )
974
+ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t * comm , ompi_osc_rdma_module_t * module )
961
975
{
962
- struct mca_btl_base_module_t * * possible_btls = NULL ;
963
976
int comm_size = ompi_comm_size (comm );
964
- int comm_rank = ompi_comm_rank (comm );
965
- int rc = OMPI_SUCCESS , max_btls = 0 ;
966
- unsigned int selected_latency = INT_MAX ;
967
- struct mca_btl_base_module_t * selected_btl = NULL ;
968
- mca_btl_base_selected_module_t * item ;
969
- int * btl_counts = NULL ;
977
+ struct mca_btl_base_module_t * selected_btl ;
978
+ mca_bml_base_endpoint_t * base_endpoint ;
970
979
char * * btls_to_use ;
971
- void * tmp ;
972
980
973
981
if (module ) {
974
982
ompi_osc_rdma_selected_btl_insert (module , NULL , 0 );
@@ -980,37 +988,30 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
980
988
in general usage. */
981
989
btls_to_use = opal_argv_split (ompi_osc_rdma_full_connectivity_btls , ',' );
982
990
if (btls_to_use ) {
983
- /* rdma and atomics are only supported with BTLs at the moment
984
- * If a btl does not support remote completion, it cannot be used as the primary btl.
985
- * It can still be selected as an alternate btl */
991
+ mca_btl_base_selected_module_t * item ;
992
+
993
+ selected_btl = NULL ;
994
+
995
+ /* rdma and atomics are only supported with BTLs at the moment */
986
996
OPAL_LIST_FOREACH (item , & mca_btl_base_modules_initialized , mca_btl_base_selected_module_t ) {
987
997
for (int i = 0 ; btls_to_use [i ] ; ++ i ) {
988
998
if (0 != strcmp (btls_to_use [i ], item -> btl_module -> btl_component -> btl_version .mca_component_name )) {
989
999
continue ;
990
1000
}
991
1001
992
- if ((item -> btl_module -> btl_flags & (MCA_BTL_FLAGS_RDMA )) == MCA_BTL_FLAGS_RDMA &&
993
- (item -> btl_module -> btl_flags & (MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS | MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION ))) {
994
- if (!selected_btl || item -> btl_module -> btl_latency < selected_btl -> btl_latency ) {
1002
+ if (ompi_osc_rdma_check_accelerated_btl (item -> btl_module )) {
1003
+ if (NULL == selected_btl || item -> btl_module -> btl_latency < selected_btl -> btl_latency ) {
995
1004
selected_btl = item -> btl_module ;
996
1005
}
997
1006
}
998
1007
}
999
1008
}
1000
1009
1001
1010
opal_argv_free (btls_to_use );
1002
- }
1003
1011
1004
- if (NULL != selected_btl ) {
1005
- if (module ) {
1006
- ompi_osc_rdma_selected_btl_insert (module , selected_btl , 0 );
1007
- module -> btls_in_use = 1 ;
1008
- module -> use_memory_registration = selected_btl -> btl_register_mem != NULL ;
1012
+ if (NULL != selected_btl ) {
1013
+ goto btl_selection_complete ;
1009
1014
}
1010
-
1011
- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_INFO , "selected btl: %s" ,
1012
- selected_btl -> btl_component -> btl_version .mca_component_name );
1013
- return OMPI_SUCCESS ;
1014
1015
}
1015
1016
1016
1017
/* if osc/rdma gets selected we need to ensure that all local procs have been added */
@@ -1021,123 +1022,78 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
1021
1022
* other requirements was not found. Look for BTLs that may be
1022
1023
* able to talk to all peers. This is obviously more expensive
1023
1024
* than the check above.
1025
+ *
1026
+ * This algorithm skips a potential use case: it requires
1027
+ * reachability to self, which is not strictly needed if BTL and
1028
+ * CPU atomics are atomic with each other. However, the set of
1029
+ * BTLs which can not send to self, which have RDMA semantics, and
1030
+ * which have the required atomicity is currently the null set and
1031
+ * almost certain to remain the null set, so we keep it simple.
1032
+ *
1033
+ * We only want BTLs that can reach all peers, so use rank 0's BTL
1034
+ * list as the list of all available BTLs. Any BTL that cannot
1035
+ * be used to communicate with rank 0 necessarily is not in the
1036
+ * list of all available BTLs for this algorithm.
1024
1037
*/
1038
+ base_endpoint = mca_bml_base_get_endpoint (ompi_comm_peer_lookup (comm , 0 ));
1039
+ if (NULL == base_endpoint ) {
1040
+ return OMPI_ERR_UNREACH ;
1041
+ }
1025
1042
1026
- for (int rank = 0 ; rank < comm_size ; ++ rank ) {
1027
- ompi_proc_t * proc = ompi_comm_peer_lookup (comm , rank );
1028
- mca_bml_base_endpoint_t * endpoint ;
1029
- int num_btls , prev_max ;
1030
- bool found_btl = false;
1031
-
1032
- endpoint = mca_bml_base_get_endpoint (proc );
1033
- if (NULL == endpoint ) {
1034
- /* can't continue if some peer is unreachable */
1035
- rc = OMPI_ERR_UNREACH ;
1036
- break ;
1037
- }
1043
+ selected_btl = NULL ;
1044
+ for (size_t i_btl = 0 ;
1045
+ i_btl < mca_bml_base_btl_array_get_size (& base_endpoint -> btl_rdma );
1046
+ ++ i_btl ) {
1047
+ bool have_connectivity = true;
1048
+ struct mca_bml_base_btl_t * examine_bml_btl ;
1049
+ struct mca_btl_base_module_t * examine_btl ;
1038
1050
1039
- num_btls = mca_bml_base_btl_array_get_size (& endpoint -> btl_rdma );
1040
- if (0 == num_btls ) {
1041
- rc = OMPI_ERR_NOT_AVAILABLE ;
1042
- /* at least one rank doesn't have an RDMA capable btl */
1043
- break ;
1051
+ examine_bml_btl = mca_bml_base_btl_array_get_index (& base_endpoint -> btl_rdma , i_btl );
1052
+ if (NULL == examine_bml_btl ) {
1053
+ return OMPI_ERR_NOT_FOUND ;
1044
1054
}
1055
+ examine_btl = examine_bml_btl -> btl ;
1045
1056
1046
- prev_max = max_btls ;
1047
-
1048
- max_btls = (max_btls > num_btls ) ? max_btls : num_btls ;
1049
-
1050
- tmp = realloc (possible_btls , sizeof (void * ) * max_btls );
1051
- if (NULL == tmp ) {
1052
- rc = OMPI_ERR_OUT_OF_RESOURCE ;
1053
- break ;
1057
+ /* skip any BTL which doesn't meet our requirements */
1058
+ if (!ompi_osc_rdma_check_accelerated_btl (examine_btl )) {
1059
+ continue ;
1054
1060
}
1055
- possible_btls = tmp ;
1056
1061
1057
- for (int j = prev_max ; j < max_btls ; ++ j ) {
1058
- possible_btls [j ] = NULL ;
1059
- }
1062
+ /* check connectivity across all ranks */
1063
+ for (int rank = 0 ; rank < comm_size ; ++ rank ) {
1064
+ ompi_proc_t * proc = ompi_comm_peer_lookup (comm , rank );
1065
+ mca_bml_base_endpoint_t * endpoint ;
1060
1066
1061
- tmp = realloc (btl_counts , sizeof (int ) * max_btls );
1062
- if (NULL == tmp ) {
1063
- rc = OMPI_ERR_OUT_OF_RESOURCE ;
1064
- break ;
1065
- }
1066
- btl_counts = tmp ;
1067
-
1068
- for (int i_btl = 0 ; i_btl < num_btls ; ++ i_btl ) {
1069
- /* Check for BTL requirements:
1070
- * 1) RDMA (put/get) and ATOMIC operations. We only
1071
- * require cswap and fetch and add and will emulate
1072
- * other opterations with those two as necessary.
1073
- * 2) Remote Completion
1074
- *
1075
- * If the BTL meets all those requirements, increment the
1076
- * btl_counts to indicate that this btl can talk to the
1077
- * current peer proc.
1078
- */
1079
- if (((endpoint -> btl_rdma .bml_btls [i_btl ].btl -> btl_flags & (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS )) ==
1080
- (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS )) &&
1081
- (endpoint -> btl_rdma .bml_btls [i_btl ].btl -> btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_ADD ) &&
1082
- (endpoint -> btl_rdma .bml_btls [i_btl ].btl -> btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION )) {
1083
- for (int j = 0 ; j < max_btls ; ++ j ) {
1084
- if (endpoint -> btl_rdma .bml_btls [i_btl ].btl == possible_btls [j ]) {
1085
- ++ btl_counts [j ];
1086
- found_btl = true;
1087
- break ;
1088
- } else if (NULL == possible_btls [j ]) {
1089
- possible_btls [j ] = endpoint -> btl_rdma .bml_btls [i_btl ].btl ;
1090
- btl_counts [j ] = 1 ;
1091
- found_btl = true;
1092
- break ;
1093
- }
1094
- }
1067
+ endpoint = mca_bml_base_get_endpoint (proc );
1068
+ if (NULL == endpoint ) {
1069
+ have_connectivity = false;
1070
+ break ;
1095
1071
}
1096
- }
1097
-
1098
- /* any non-local rank must have a usable btl */
1099
- if (!found_btl && comm_rank != rank ) {
1100
- /* no btl = no rdma/atomics */
1101
- rc = OMPI_ERR_UNREACH ;
1102
- break ;
1103
- }
1104
- }
1105
1072
1106
- if (OMPI_SUCCESS != rc ) {
1107
- free (possible_btls );
1108
- free (btl_counts );
1109
- return rc ;
1110
- }
1111
-
1112
- for (int i = 0 ; i < max_btls ; ++ i ) {
1113
- int btl_count = btl_counts [i ];
1114
-
1115
- if (NULL == possible_btls [i ]) {
1116
- break ;
1117
- }
1118
-
1119
- if (possible_btls [i ]-> btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB ) {
1120
- /* The onesided component can, if BTL atomics are atomic
1121
- relative to CPU atomics, handle atomics to self, so
1122
- increment the counter once to cover that case. */
1123
- btl_count ++ ;
1073
+ if (NULL == mca_bml_base_btl_array_find (& endpoint -> btl_rdma ,
1074
+ examine_btl )) {
1075
+ have_connectivity = false;
1076
+ break ;
1077
+ }
1124
1078
}
1125
1079
1126
- if (btl_count >= comm_size && possible_btls [i ]-> btl_latency < selected_latency ) {
1127
- selected_btl = possible_btls [i ];
1128
- selected_latency = possible_btls [i ]-> btl_latency ;
1080
+ /* if we have connectivity, displace currently selected btl if
1081
+ * this one has lower latency; we prioritize latency over all
1082
+ * other parameters
1083
+ */
1084
+ if (have_connectivity ) {
1085
+ if (NULL == selected_btl || examine_btl -> btl_latency < selected_btl -> btl_latency ) {
1086
+ selected_btl = examine_btl ;
1087
+ }
1129
1088
}
1130
1089
}
1131
1090
1132
- free (possible_btls );
1133
- free (btl_counts );
1134
-
1135
1091
if (NULL == selected_btl ) {
1136
1092
OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_INFO , "no suitable btls found" );
1137
- /* no btl = no rdma/atomics */
1138
1093
return OMPI_ERR_NOT_AVAILABLE ;
1139
1094
}
1140
1095
1096
+ btl_selection_complete :
1141
1097
if (module ) {
1142
1098
ompi_osc_rdma_selected_btl_insert (module , selected_btl , 0 );
1143
1099
module -> btls_in_use = 1 ;
@@ -1414,7 +1370,7 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
1414
1370
}
1415
1371
1416
1372
/* find rdma capable endpoints */
1417
- ret = ompi_osc_rdma_query_btls (module -> comm , module );
1373
+ ret = ompi_osc_rdma_query_accelerated_btls (module -> comm , module );
1418
1374
if (OMPI_SUCCESS != ret ) {
1419
1375
OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_WARN , "could not find a suitable btl. falling back on "
1420
1376
"active-message BTLs" );
0 commit comments