@@ -85,7 +85,6 @@ static const char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, co
85
85
86
86
static char * ompi_osc_rdma_btl_names ;
87
87
static char * ompi_osc_rdma_mtl_names ;
88
- static char * ompi_osc_rdma_btl_alternate_names ;
89
88
90
89
static const mca_base_var_enum_value_t ompi_osc_rdma_locking_modes [] = {
91
90
{.value = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL , .string = "two_level" },
@@ -266,14 +265,6 @@ static int ompi_osc_rdma_component_register (void)
266
265
MCA_BASE_VAR_SCOPE_GROUP , & ompi_osc_rdma_btl_names );
267
266
free (description_str );
268
267
269
- ompi_osc_rdma_btl_alternate_names = "sm,tcp" ;
270
- opal_asprintf (& description_str , "Comma-delimited list of alternate BTL component names to allow without verifying "
271
- "connectivity (default: %s)" , ompi_osc_rdma_btl_alternate_names );
272
- (void ) mca_base_component_var_register (& mca_osc_rdma_component .super .osc_version , "alternate_btls" , description_str ,
273
- MCA_BASE_VAR_TYPE_STRING , NULL , 0 , 0 , OPAL_INFO_LVL_3 ,
274
- MCA_BASE_VAR_SCOPE_GROUP , & ompi_osc_rdma_btl_alternate_names );
275
- free (description_str );
276
-
277
268
ompi_osc_rdma_mtl_names = "psm2" ;
278
269
opal_asprintf (& description_str , "Comma-delimited list of MTL component names to lower the priority of rdma "
279
270
"osc component (default: %s)" , ompi_osc_rdma_mtl_names );
@@ -919,56 +910,67 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
919
910
* @return OMPI_SUCCESS if BTLs can be found
920
911
* @return OMPI_ERR_UNREACH if no BTLs can be found that match
921
912
*
922
- * In this case an "alternate" BTL is a BTL that does not provide true RDMA but
923
- * can use active messages using the BTL base AM RDMA/atomics. Since more than
924
- * one BTL may be needed for this support the OSC component will disable the
925
- * use of registration-based RDMA (these BTLs will not be used) and will use
926
- * any remaining BTL. By default the BTLs used will be tcp and sm but any single
927
- * (or pair) of BTLs may be used.
913
+ * This function is used when ompi_osc_rdma_query_btls() failed to find
914
+ * a single btl that can communicate with all peers and supports remote completion.
915
+ * In this case, osc/rdma will use multiple btls for communication. One process
916
+ * can use different btls to communicate with different peers. Such btls are called
917
+ * "alternate btls".
918
+ *
919
+ * For an alternate btl, this function disables its native implementation of
920
+ * RDMA and atomics, and makes osc/rdma always use the active message RDMA/atomics
921
+ * with the alternate btl.
922
+ *
923
+ * The reason to disable an alternate btl's native atomics is because
924
+ * when multiple alternate btls are being used, the atomicity across btls' own
925
+ * atomics is not guaranteed. Therefore, osc/rdma must use active message atomics.
926
+ *
927
+ * The reason to disable an alternate btl's native RDMA put and get is because
928
+ * it significantly simplifies osc/rdma's completion. The simplification comes in two
929
+ * areas:
930
+ *
931
+ * First, active message RDMA supports remote completion (when a btl's native
932
+ * RDMA is disabled). Remote completion is required by several key components
933
+ * of osc/rdma: the usage of cpu atomics to update peer's state, the usage
934
+ * of local leader to update peer's state, and its fence implementation.
935
+ * If osc/rdma does not use active message RDMA on alternate btls, it will
936
+ * have to keep track of each selected btl's support of remote completion.
937
+ * If any selected btl does not support remote completion, it will have to
938
+ * disable the usage of cpu atomics, disable the usage of local leader,
939
+ * and implement a different fence mechanism.
940
+ *
941
+ * Second, active message RDMA does not use memory registration explicitly,
942
+ * therefore using it eliminates the need to store and exchange multiple
943
+ * memory registrations.
928
944
*/
929
945
static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t * comm , ompi_osc_rdma_module_t * module )
930
946
{
931
947
mca_btl_base_selected_module_t * item ;
932
- char * * btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names , ',' );
933
948
int btls_found = 0 ;
934
949
935
- btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names , ',' );
936
- if (NULL == btls_to_use ) {
937
- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_INFO , "no alternate BTLs requested: %s" , ompi_osc_rdma_btl_alternate_names );
938
- return OMPI_ERR_UNREACH ;
939
- }
940
-
941
950
if (module ) {
942
951
module -> btls_in_use = 0 ;
943
952
}
944
953
945
954
/* rdma and atomics are only supported with BTLs at the moment */
946
- for (int i = 0 ; btls_to_use [i ] ; ++ i ) {
947
- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_INFO , "checking for btl %s" , btls_to_use [i ]);
948
- OPAL_LIST_FOREACH (item , & mca_btl_base_modules_initialized , mca_btl_base_selected_module_t ) {
949
- if (NULL != item -> btl_module -> btl_register_mem ) {
950
- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_INFO , "skipping RDMA btl when searching for alternate BTL" );
951
- continue ;
952
- }
953
-
954
- if (0 != strcmp (btls_to_use [i ], item -> btl_module -> btl_component -> btl_version .mca_component_name )) {
955
- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_INFO , "skipping btl %s" ,
956
- item -> btl_module -> btl_component -> btl_version .mca_component_name );
957
- continue ;
958
- }
955
+ OPAL_LIST_FOREACH (item , & mca_btl_base_modules_initialized , mca_btl_base_selected_module_t ) {
956
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_INFO , "found alternate btl %s" ,
957
+ item -> btl_module -> btl_component -> btl_version .mca_component_name );
959
958
960
- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_INFO , "found alternate btl %s" , btls_to_use [i ]);
959
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_INFO , "disabing btl's native support of RDMA and ATOMIC" );
960
+ item -> btl_module -> btl_flags &= ~(MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_ATOMIC_FOPS );
961
961
962
- ++ btls_found ;
963
- if (module ) {
964
- mca_btl_base_am_rdma_init (item -> btl_module );
965
- ompi_osc_rdma_selected_btl_insert (module , item -> btl_module , module -> btls_in_use ++ );
966
- }
967
-
962
+ ++ btls_found ;
963
+ if (module ) {
964
+ mca_btl_base_am_rdma_init (item -> btl_module );
965
+ assert (item -> btl_module -> btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION );
966
+ ompi_osc_rdma_selected_btl_insert (module , item -> btl_module , module -> btls_in_use ++ );
968
967
}
969
968
}
970
969
971
- opal_argv_free (btls_to_use );
970
+ /* active message RDMA/atomics does not require explicit memory registration */
971
+ if (NULL != module ) {
972
+ module -> use_memory_registration = false;
973
+ }
972
974
973
975
return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH ;
974
976
}
@@ -1003,7 +1005,7 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
1003
1005
}
1004
1006
1005
1007
if ((item -> btl_module -> btl_flags & (MCA_BTL_FLAGS_RDMA )) == MCA_BTL_FLAGS_RDMA &&
1006
- (item -> btl_module -> btl_flags & (MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS ))) {
1008
+ (item -> btl_module -> btl_flags & (MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS | MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION ))) {
1007
1009
if (!selected_btl || item -> btl_module -> btl_latency < selected_btl -> btl_latency ) {
1008
1010
selected_btl = item -> btl_module ;
1009
1011
}
@@ -1072,10 +1074,14 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
1072
1074
btl_counts = tmp ;
1073
1075
1074
1076
for (int i_btl = 0 ; i_btl < num_btls ; ++ i_btl ) {
1075
- /* for this implementation we need only compare-and-swap and fetch-and-add */
1077
+ /* for this implementation we need only compare-and-swap and fetch-and-add
1078
+ *
1079
+ * If a btl does not support remote completion, it cannot be used as the primary btl.
1080
+ * It can still be selected as an alternate btl */
1076
1081
if ((endpoint -> btl_rdma .bml_btls [i_btl ].btl -> btl_flags & (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS )) ==
1077
1082
(MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS ) && (endpoint -> btl_rdma .bml_btls [i_btl ].btl -> btl_atomic_flags &
1078
- MCA_BTL_ATOMIC_SUPPORTS_ADD )) {
1083
+ MCA_BTL_ATOMIC_SUPPORTS_ADD ) &&
1084
+ (endpoint -> btl_rdma .bml_btls [i_btl ].btl -> btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION )) {
1079
1085
for (int j = 0 ; j < max_btls ; ++ j ) {
1080
1086
if (endpoint -> btl_rdma .bml_btls [i_btl ].btl == possible_btls [j ]) {
1081
1087
++ btl_counts [j ];
0 commit comments