Skip to content

Commit e0a5b8d

Browse files
committed
osc/rdma: Simplify accelerated BTL selection
Skipping the over-optimization of supporting a BTL that can talk native RDMA to everyone but self (we don't have one of those today) allows a large simplification of the accelerated BTL selection on the OSC RDMA component. Signed-off-by: Brian Barrett <[email protected]>
1 parent e509302 commit e0a5b8d

File tree

1 file changed

+82
-126
lines changed

1 file changed

+82
-126
lines changed

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 82 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
7878
static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, size_t size, int disp_unit,
7979
struct ompi_communicator_t *comm, struct opal_info_t *info,
8080
int flavor, int *model);
81-
static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module);
81+
static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module);
8282
static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module);
8383

8484
static const char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, const char *key, const char *value);
@@ -395,7 +395,7 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
395395
}
396396
#endif /* OPAL_CUDA_SUPPORT */
397397

398-
if (OMPI_SUCCESS == ompi_osc_rdma_query_btls (comm, NULL)) {
398+
if (OMPI_SUCCESS == ompi_osc_rdma_query_accelerated_btls (comm, NULL)) {
399399
return mca_osc_rdma_component.priority;
400400
}
401401

@@ -882,7 +882,7 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
882882
* @return OMPI_ERR_UNREACH if no BTLs can be found that match
883883
*
884884
* In this case an "alternate" BTL is a BTL that does not meet the
885-
* requirements of a BTL outlined in ompi_osc_rdma_query_btls().
885+
* requirements of a BTL outlined in ompi_osc_rdma_query_accelerated_btls().
886886
* Either it does not provide connectivity to all peers, provide
887887
* remote completion, or natively support put/get/atomic. Since more
888888
* than one BTL may be needed for this support the OSC component will
@@ -937,6 +937,20 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o
937937
return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH;
938938
}
939939

940+
/* Check for BTL requirements:
941+
* 1) RDMA (put/get) and ATOMIC operations. We only require cswap
942+
* and fetch and add and will emulate other operations with those
943+
* two as necessary.
944+
* 2) Remote Completion
945+
*/
946+
static bool ompi_osc_rdma_check_accelerated_btl(struct mca_btl_base_module_t *btl)
947+
{
948+
return ((btl->btl_flags & MCA_BTL_FLAGS_RDMA) &&
949+
(btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_FOPS) &&
950+
(btl->btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION) &&
951+
(btl->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_ADD));
952+
}
953+
940954
/*
941955
* Attempt to find a BTL that can be used for native RDMA
942956
*
@@ -957,18 +971,12 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o
957971
* If module is NULL, the code acts as a query mechanism to find any
958972
* potential BTLs, and is used to implement osc_rdma_query().
959973
*/
960-
static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module)
974+
static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module)
961975
{
962-
struct mca_btl_base_module_t **possible_btls = NULL;
963976
int comm_size = ompi_comm_size (comm);
964-
int comm_rank = ompi_comm_rank (comm);
965-
int rc = OMPI_SUCCESS, max_btls = 0;
966-
unsigned int selected_latency = INT_MAX;
967-
struct mca_btl_base_module_t *selected_btl = NULL;
968-
mca_btl_base_selected_module_t *item;
969-
int *btl_counts = NULL;
977+
struct mca_btl_base_module_t *selected_btl;
978+
mca_bml_base_endpoint_t *base_endpoint;
970979
char **btls_to_use;
971-
void *tmp;
972980

973981
if (module) {
974982
ompi_osc_rdma_selected_btl_insert(module, NULL, 0);
@@ -980,37 +988,30 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
980988
in general usage. */
981989
btls_to_use = opal_argv_split (ompi_osc_rdma_full_connectivity_btls, ',');
982990
if (btls_to_use) {
983-
/* rdma and atomics are only supported with BTLs at the moment
984-
* If a btl does not support remote completion, it cannot be used as the primary btl.
985-
* It can still be selected as an alternate btl */
991+
mca_btl_base_selected_module_t *item;
992+
993+
selected_btl = NULL;
994+
995+
/* rdma and atomics are only supported with BTLs at the moment */
986996
OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) {
987997
for (int i = 0 ; btls_to_use[i] ; ++i) {
988998
if (0 != strcmp (btls_to_use[i], item->btl_module->btl_component->btl_version.mca_component_name)) {
989999
continue;
9901000
}
9911001

992-
if ((item->btl_module->btl_flags & (MCA_BTL_FLAGS_RDMA)) == MCA_BTL_FLAGS_RDMA &&
993-
(item->btl_module->btl_flags & (MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS | MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION))) {
994-
if (!selected_btl || item->btl_module->btl_latency < selected_btl->btl_latency) {
1002+
if (ompi_osc_rdma_check_accelerated_btl(item->btl_module)) {
1003+
if (NULL == selected_btl || item->btl_module->btl_latency < selected_btl->btl_latency) {
9951004
selected_btl = item->btl_module;
9961005
}
9971006
}
9981007
}
9991008
}
10001009

10011010
opal_argv_free (btls_to_use);
1002-
}
10031011

1004-
if (NULL != selected_btl) {
1005-
if (module) {
1006-
ompi_osc_rdma_selected_btl_insert(module, selected_btl, 0);
1007-
module->btls_in_use = 1;
1008-
module->use_memory_registration = selected_btl->btl_register_mem != NULL;
1012+
if (NULL != selected_btl) {
1013+
goto btl_selection_complete;
10091014
}
1010-
1011-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "selected btl: %s",
1012-
selected_btl->btl_component->btl_version.mca_component_name);
1013-
return OMPI_SUCCESS;
10141015
}
10151016

10161017
/* if osc/rdma gets selected we need to ensure that all local procs have been added */
@@ -1021,123 +1022,78 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
10211022
* other requirements was not found. Look for BTLs that may be
10221023
* able to talk to all peers. This is obviously more expensive
10231024
* than the check above.
1025+
*
1026+
* This algorithm skips a potential use case: it requires
1027+
* reachability to self, which is not strictly needed if BTL and
1028+
* CPU atomics are atomic with each other. However, the set of
1029+
* BTLs which can not send to self, which have RDMA semantics, and
1030+
* which have the required atomicity is currently the null set and
1031+
* almost certain to remain the null set, so we keep it simple.
1032+
*
1033+
* We only want BTLs that can reach all peers, so use rank 0's BTL
1034+
* list as the list of all available BTLs. Any BTL that cannot
1035+
* be used to communicate with rank 0 necessarily is not in the
1036+
* list of all available BTLs for this algorithm.
10241037
*/
1038+
base_endpoint = mca_bml_base_get_endpoint(ompi_comm_peer_lookup(comm, 0));
1039+
if (NULL == base_endpoint) {
1040+
return OMPI_ERR_UNREACH;
1041+
}
10251042

1026-
for (int rank = 0 ; rank < comm_size ; ++rank) {
1027-
ompi_proc_t *proc = ompi_comm_peer_lookup (comm, rank);
1028-
mca_bml_base_endpoint_t *endpoint;
1029-
int num_btls, prev_max;
1030-
bool found_btl = false;
1031-
1032-
endpoint = mca_bml_base_get_endpoint (proc);
1033-
if (NULL == endpoint) {
1034-
/* can't continue if some peer is unreachable */
1035-
rc = OMPI_ERR_UNREACH;
1036-
break;
1037-
}
1043+
selected_btl = NULL;
1044+
for (size_t i_btl = 0 ;
1045+
i_btl < mca_bml_base_btl_array_get_size(&base_endpoint->btl_rdma);
1046+
++i_btl) {
1047+
bool have_connectivity = true;
1048+
struct mca_bml_base_btl_t *examine_bml_btl;
1049+
struct mca_btl_base_module_t *examine_btl;
10381050

1039-
num_btls = mca_bml_base_btl_array_get_size (&endpoint->btl_rdma);
1040-
if (0 == num_btls) {
1041-
rc = OMPI_ERR_NOT_AVAILABLE;
1042-
/* at least one rank doesn't have an RDMA capable btl */
1043-
break;
1051+
examine_bml_btl = mca_bml_base_btl_array_get_index(&base_endpoint->btl_rdma, i_btl);
1052+
if (NULL == examine_bml_btl) {
1053+
return OMPI_ERR_NOT_FOUND;
10441054
}
1055+
examine_btl = examine_bml_btl->btl;
10451056

1046-
prev_max = max_btls;
1047-
1048-
max_btls = (max_btls > num_btls) ? max_btls : num_btls;
1049-
1050-
tmp = realloc (possible_btls, sizeof (void *) * max_btls);
1051-
if (NULL == tmp) {
1052-
rc = OMPI_ERR_OUT_OF_RESOURCE;
1053-
break;
1057+
/* skip any BTL which doesn't meet our requirements */
1058+
if (!ompi_osc_rdma_check_accelerated_btl(examine_btl)) {
1059+
continue;
10541060
}
1055-
possible_btls = tmp;
10561061

1057-
for (int j = prev_max ; j < max_btls ; ++j) {
1058-
possible_btls[j] = NULL;
1059-
}
1062+
/* check connectivity across all ranks */
1063+
for (int rank = 0 ; rank < comm_size ; ++rank) {
1064+
ompi_proc_t *proc = ompi_comm_peer_lookup(comm, rank);
1065+
mca_bml_base_endpoint_t *endpoint;
10601066

1061-
tmp = realloc (btl_counts, sizeof (int) * max_btls);
1062-
if (NULL == tmp) {
1063-
rc = OMPI_ERR_OUT_OF_RESOURCE;
1064-
break;
1065-
}
1066-
btl_counts = tmp;
1067-
1068-
for (int i_btl = 0 ; i_btl < num_btls ; ++i_btl) {
1069-
/* Check for BTL requirements:
1070-
* 1) RDMA (put/get) and ATOMIC operations. We only
1071-
* require cswap and fetch and add and will emulate
1072-
* other opterations with those two as necessary.
1073-
* 2) Remote Completion
1074-
*
1075-
* If the BTL meets all those requirements, increment the
1076-
* btl_counts to indicate that this btl can talk to the
1077-
* current peer proc.
1078-
*/
1079-
if (((endpoint->btl_rdma.bml_btls[i_btl].btl->btl_flags & (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS)) ==
1080-
(MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS)) &&
1081-
(endpoint->btl_rdma.bml_btls[i_btl].btl->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_ADD) &&
1082-
(endpoint->btl_rdma.bml_btls[i_btl].btl->btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION)) {
1083-
for (int j = 0 ; j < max_btls ; ++j) {
1084-
if (endpoint->btl_rdma.bml_btls[i_btl].btl == possible_btls[j]) {
1085-
++btl_counts[j];
1086-
found_btl = true;
1087-
break;
1088-
} else if (NULL == possible_btls[j]) {
1089-
possible_btls[j] = endpoint->btl_rdma.bml_btls[i_btl].btl;
1090-
btl_counts[j] = 1;
1091-
found_btl = true;
1092-
break;
1093-
}
1094-
}
1067+
endpoint = mca_bml_base_get_endpoint(proc);
1068+
if (NULL == endpoint) {
1069+
have_connectivity = false;
1070+
break;
10951071
}
1096-
}
1097-
1098-
/* any non-local rank must have a usable btl */
1099-
if (!found_btl && comm_rank != rank) {
1100-
/* no btl = no rdma/atomics */
1101-
rc = OMPI_ERR_UNREACH;
1102-
break;
1103-
}
1104-
}
11051072

1106-
if (OMPI_SUCCESS != rc) {
1107-
free (possible_btls);
1108-
free (btl_counts);
1109-
return rc;
1110-
}
1111-
1112-
for (int i = 0 ; i < max_btls ; ++i) {
1113-
int btl_count = btl_counts[i];
1114-
1115-
if (NULL == possible_btls[i]) {
1116-
break;
1117-
}
1118-
1119-
if (possible_btls[i]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) {
1120-
/* The onesided component can, if BTL atomics are atomic
1121-
relative to CPU atomics, handle atomics to self, so
1122-
increment the counter once to cover that case. */
1123-
btl_count++;
1073+
if (NULL == mca_bml_base_btl_array_find(&endpoint->btl_rdma,
1074+
examine_btl)) {
1075+
have_connectivity = false;
1076+
break;
1077+
}
11241078
}
11251079

1126-
if (btl_count >= comm_size && possible_btls[i]->btl_latency < selected_latency) {
1127-
selected_btl = possible_btls[i];
1128-
selected_latency = possible_btls[i]->btl_latency;
1080+
/* if we have connectivity, displace currently selected btl if
1081+
* this one has lower latency; we prioritize latency over all
1082+
* other parameters
1083+
*/
1084+
if (have_connectivity) {
1085+
if (NULL == selected_btl || examine_btl->btl_latency < selected_btl->btl_latency) {
1086+
selected_btl = examine_btl;
1087+
}
11291088
}
11301089
}
11311090

1132-
free (possible_btls);
1133-
free (btl_counts);
1134-
11351091
if (NULL == selected_btl) {
11361092
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no suitable btls found");
1137-
/* no btl = no rdma/atomics */
11381093
return OMPI_ERR_NOT_AVAILABLE;
11391094
}
11401095

1096+
btl_selection_complete:
11411097
if (module) {
11421098
ompi_osc_rdma_selected_btl_insert(module, selected_btl, 0);
11431099
module->btls_in_use = 1;
@@ -1414,7 +1370,7 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
14141370
}
14151371

14161372
/* find rdma capable endpoints */
1417-
ret = ompi_osc_rdma_query_btls (module->comm, module);
1373+
ret = ompi_osc_rdma_query_accelerated_btls (module->comm, module);
14181374
if (OMPI_SUCCESS != ret) {
14191375
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_WARN, "could not find a suitable btl. falling back on "
14201376
"active-message BTLs");

0 commit comments

Comments
 (0)