Skip to content

Commit 59a4528

Browse files
committed
osc/rdma: adjustment on btl selection logic
This patch makes the following adjustments to the btl selection logic. First, when selecting a primary btl, this patch requires the primary btl to support remote completion. Second, when selecting alternate btls, this patch allows any btl to be used as an alternate btl. Prior to this patch, only btls from a pre-defined list could be used as alternate btls. Finally, when a btl is used as an alternate btl, this patch disables its native atomic and RDMA support, and always uses active message RDMA/atomics with it. The reasons for these changes are: First, these changes ensure that the selected btls of osc/rdma always support remote completion (because active message RDMA/atomics supports remote completion when a btl's native RDMA is disabled). Remote completion is essential for several key components of osc/rdma: the usage of cpu atomics to update peer's state, the usage of local leader to update peer's state, and its fence implementation. Therefore the assurance of btl's support of remote completion greatly simplifies osc/rdma's implementation. Second, these changes eliminate the need to save and exchange more than 1 memory registration, because active message RDMA/atomics does not require explicit memory registration. Third, these changes ensure the correctness of atomic operations. When multiple alternate btls are used, atomicity cannot be guaranteed across each btl's native implementation of atomics. Finally, these changes allow more btls to be used as alternate btls, especially btl/self. Signed-off-by: Wei Zhang <[email protected]>
1 parent b277aba commit 59a4528

File tree

1 file changed

+52
-46
lines changed

1 file changed

+52
-46
lines changed

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 52 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,6 @@ static const char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, co
8585

8686
static char *ompi_osc_rdma_btl_names;
8787
static char *ompi_osc_rdma_mtl_names;
88-
static char *ompi_osc_rdma_btl_alternate_names;
8988

9089
static const mca_base_var_enum_value_t ompi_osc_rdma_locking_modes[] = {
9190
{.value = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, .string = "two_level"},
@@ -266,14 +265,6 @@ static int ompi_osc_rdma_component_register (void)
266265
MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_names);
267266
free(description_str);
268267

269-
ompi_osc_rdma_btl_alternate_names = "sm,tcp";
270-
opal_asprintf(&description_str, "Comma-delimited list of alternate BTL component names to allow without verifying "
271-
"connectivity (default: %s)", ompi_osc_rdma_btl_alternate_names);
272-
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "alternate_btls", description_str,
273-
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
274-
MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_alternate_names);
275-
free(description_str);
276-
277268
ompi_osc_rdma_mtl_names = "psm2";
278269
opal_asprintf(&description_str, "Comma-delimited list of MTL component names to lower the priority of rdma "
279270
"osc component (default: %s)", ompi_osc_rdma_mtl_names);
@@ -919,56 +910,67 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
919910
* @return OMPI_SUCCESS if BTLs can be found
920911
* @return OMPI_ERR_UNREACH if no BTLs can be found that match
921912
*
922-
* In this case an "alternate" BTL is a BTL that does not provide true RDMA but
923-
* can use active messages using the BTL base AM RDMA/atomics. Since more than
924-
* one BTL may be needed for this support the OSC component will disable the
925-
* use of registration-based RDMA (these BTLs will not be used) and will use
926-
* any remaining BTL. By default the BTLs used will be tcp and sm but any single
927-
* (or pair) of BTLs may be used.
913+
* This function is used when ompi_osc_rdma_query_btls() failed to find
914+
* a single btl that can communicate with all peers and supports remote completion.
915+
* In this case, osc/rdma will use multiple btls for communications. One process
916+
* can use different btls to communicate with different peers. Such btls are called
917+
* "alternate btls".
918+
*
919+
* For an alternate btl, this function disables its native implementation of
920+
* RDMA and atomics, and makes osc/rdma always use the active message RDMA/atomics
921+
* with the alternate btl.
922+
*
923+
* The reason to disable an alternate btl's native atomics is because
924+
* when multiple alternate btls are being used, the atomicity across the btls' own
925+
* atomics is not guaranteed. Therefore, osc/rdma must use active message atomics.
926+
*
927+
* The reason to disable an alternate btl's native RDMA put and get is because
928+
* it significantly simplified osc/rdma's completion. The simplification came in two
929+
* areas:
930+
*
931+
* First, active message RDMA supports remote completion (when a btl's native
932+
* RDMA is disabled). Remote completion is required by several key components
933+
* of osc/rdma: the usage of cpu atomics to update peer's state, the usage
934+
* of local leader to update peer's state, and its fence implementation.
935+
* If osc/rdma does not use active message RDMA on alternate btls, it will
936+
* have to keep track of each selected btl's support of remote completion.
937+
* If any selected btl does not support remote completion, it will have to
938+
* disable the usage of cpu atomics, disable the usage of local leader,
939+
* and implement a different fence mechanism.
940+
*
941+
* Second, active message RDMA does not use memory registration explicitly,
942+
* therefore using it eliminates the need to store and exchange multiple
943+
* memory registrations.
928944
*/
929945
static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module)
930946
{
931947
mca_btl_base_selected_module_t *item;
932-
char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
933948
int btls_found = 0;
934949

935-
btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
936-
if (NULL == btls_to_use) {
937-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names);
938-
return OMPI_ERR_UNREACH;
939-
}
940-
941950
if (module) {
942951
module->btls_in_use = 0;
943952
}
944953

945954
/* rdma and atomics are only supported with BTLs at the moment */
946-
for (int i = 0 ; btls_to_use[i] ; ++i) {
947-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "checking for btl %s", btls_to_use[i]);
948-
OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) {
949-
if (NULL != item->btl_module->btl_register_mem) {
950-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "skipping RDMA btl when searching for alternate BTL");
951-
continue;
952-
}
953-
954-
if (0 != strcmp (btls_to_use[i], item->btl_module->btl_component->btl_version.mca_component_name)) {
955-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "skipping btl %s",
956-
item->btl_module->btl_component->btl_version.mca_component_name);
957-
continue;
958-
}
955+
OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) {
956+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "found alternate btl %s",
957+
item->btl_module->btl_component->btl_version.mca_component_name);
959958

960-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "found alternate btl %s", btls_to_use[i]);
959+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "disabing btl's native support of RDMA and ATOMIC");
960+
item->btl_module->btl_flags &= ~(MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_ATOMIC_FOPS );
961961

962-
++btls_found;
963-
if (module) {
964-
mca_btl_base_am_rdma_init(item->btl_module);
965-
ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++);
966-
}
967-
962+
++btls_found;
963+
if (module) {
964+
mca_btl_base_am_rdma_init(item->btl_module);
965+
assert(item->btl_module->btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION);
966+
ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++);
968967
}
969968
}
970969

971-
opal_argv_free (btls_to_use);
970+
/* active message RDMA/atomics does not require explicit memory registration */
971+
if (NULL != module) {
972+
module->use_memory_registration = false;
973+
}
972974

973975
return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH;
974976
}
@@ -1003,7 +1005,7 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
10031005
}
10041006

10051007
if ((item->btl_module->btl_flags & (MCA_BTL_FLAGS_RDMA)) == MCA_BTL_FLAGS_RDMA &&
1006-
(item->btl_module->btl_flags & (MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS))) {
1008+
(item->btl_module->btl_flags & (MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS | MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION))) {
10071009
if (!selected_btl || item->btl_module->btl_latency < selected_btl->btl_latency) {
10081010
selected_btl = item->btl_module;
10091011
}
@@ -1072,10 +1074,14 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
10721074
btl_counts = tmp;
10731075

10741076
for (int i_btl = 0 ; i_btl < num_btls ; ++i_btl) {
1075-
/* for this implementation we need only compare-and-swap and fetch-and-add */
1077+
/* for this implementation we need only compare-and-swap and fetch-and-add
1078+
*
1079+
* If a btl does not support remote completion, it cannot be used as the primary btl.
1080+
* It can still be selected as an alternate btl */
10761081
if ((endpoint->btl_rdma.bml_btls[i_btl].btl->btl_flags & (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS)) ==
10771082
(MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS) && (endpoint->btl_rdma.bml_btls[i_btl].btl->btl_atomic_flags &
1078-
MCA_BTL_ATOMIC_SUPPORTS_ADD)) {
1083+
MCA_BTL_ATOMIC_SUPPORTS_ADD) &&
1084+
(endpoint->btl_rdma.bml_btls[i_btl].btl->btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION)) {
10791085
for (int j = 0 ; j < max_btls ; ++j) {
10801086
if (endpoint->btl_rdma.bml_btls[i_btl].btl == possible_btls[j]) {
10811087
++btl_counts[j];

0 commit comments

Comments
 (0)