From b061f96156b56d7990756e3e2759708fcf065dad Mon Sep 17 00:00:00 2001 From: Wenduo Wang Date: Fri, 1 Dec 2023 16:43:23 -0800 Subject: [PATCH 1/2] opal/ofi: fix round-robin selection logic This change fixes current round-robin selection logic: - Only providers of the same type should be considered, i.e. providers that match the head of the list. This deviates from the documented behavior. - For unbound process the selection should be based on its local rank, i.e. rank among processes on the same node. Currently only the first NIC will be selected. Signed-off-by: Wenduo Wang --- opal/mca/common/ofi/common_ofi.c | 51 ++++++++++++++++++++----- opal/mca/common/ofi/help-common-ofi.txt | 6 +++ 2 files changed, 48 insertions(+), 9 deletions(-) diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index f0da0f4a52c..098e54503e1 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -724,16 +724,46 @@ static int get_nearest_nic(hwloc_topology_t topology, struct fi_info *provider_l return ret; } -static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank, - size_t num_providers) +/** + * @brief Selects a provider from the list in a round-robin fashion + * + * This function implements a round-robin algorithm to select a provider from + * the provided list based on a rank. Only providers of the same type as the + * first provider are eligible for selection. + * + * @param[in] provider_list A list of providers to select from. + * @param[out] rank A rank metric for the current process, such as + * the rank on the same node or CPU package. + * @return Pointer to the selected provider + */ +static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank) { - uint32_t provider_rank = rank % num_providers; - struct fi_info *current_provider = provider_list; + uint32_t provider_rank = 0, current_rank = 0; + size_t num_providers = 0; + struct fi_info *current_provider = NULL; - for (uint32_t i = 0; i < provider_rank; ++i) { + for (current_provider = provider_list; NULL != current_provider;) { + if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)) { + ++num_providers; + } current_provider = current_provider->next; } + current_provider = provider_list; + if (2 > num_providers) { + goto out; + } + + provider_rank = rank % num_providers; + + while (NULL != current_provider) { + if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider) + && provider_rank == current_rank++) { + break; + } + current_provider = current_provider->next; + } +out: return current_provider; } @@ -850,7 +880,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, { int ret, num_providers = 0; struct fi_info *provider = NULL; - uint32_t package_rank = 0; + uint32_t package_rank = process_info->my_local_rank; num_providers = count_providers(provider_list); if (!process_info->proc_is_bound || 2 > num_providers) { @@ -876,7 +906,12 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, #endif /* OPAL_OFI_PCI_DATA_AVAILABLE */ round_robin: - provider = select_provider_round_robin(provider_list, package_rank, num_providers); + if (!process_info->proc_is_bound && 1 < num_providers + && opal_output_get_verbosity(opal_common_ofi.output) >= 1) { + opal_show_help("help-common-ofi.txt", "unbound_process", true, 1); + } + + provider = select_provider_round_robin(provider_list, package_rank); out: #if OPAL_ENABLE_DEBUG opal_output_verbose(1, opal_common_ofi.output, "package rank: %d device: %s", package_rank, @@ -950,5 +985,3 @@ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *add } return ret; } - - diff --git a/opal/mca/common/ofi/help-common-ofi.txt b/opal/mca/common/ofi/help-common-ofi.txt index 44366a64c5f..de3630f7e7a 100644 --- a/opal/mca/common/ofi/help-common-ofi.txt +++ b/opal/mca/common/ofi/help-common-ofi.txt @@ -7,6 +7,12 @@ # # $HEADER$ # +[unbound_process] +Open MPI's OFI driver detected multiple NICs on the system but cannot select an +optimal device because the current process is not bound. This may negatively +impact performance. This can be resolved by specifying "--bind-to ..." on +command line. + [package_rank failed] Open MPI's OFI driver detected multiple equidistant NICs from the current process, but had insufficient information to ensure MPI processes fairly pick a NIC for use. From 3aba0bb5c5f3a7cd8713ad9dd49bfd4f1ce58206 Mon Sep 17 00:00:00 2001 From: Wenduo Wang Date: Fri, 1 Dec 2023 16:38:47 -0800 Subject: [PATCH 2/2] opal/ofi: update nic selection function doc The documentation needs an update to reflect latest implementation. The original cpuset matching logic has been replaced with a new distance calculation algorithm. This change also clarifies the round-robin selection process when we need to break a tie. Signed-off-by: Wenduo Wang --- opal/mca/common/ofi/common_ofi.c | 12 +++-- opal/mca/common/ofi/common_ofi.h | 84 ++++++++++++++++---------------- 2 files changed, 50 insertions(+), 46 deletions(-) diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 098e54503e1..c922ea218d4 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -623,10 +623,10 @@ static int get_provider_distance(struct fi_info *provider, hwloc_topology_t topo /** * @brief Get the nearest device to the current thread * - * Use the PMIx server or calculate the device distances, then out of the set of - * returned distances find the subset of the nearest devices. This can be - * 0 or more. - * If there are multiple equidistant devices, break the tie using the rank. + * Compute the distances from the current thread to each NIC in provider_list, + * and select the NIC with the shortest distance. + * If there are multiple equidistant devices, break the tie using local rank + * to balance NIC utilization. * * @param[in] topoloy hwloc topology * @param[in] provider_list List of providers to select from @@ -898,6 +898,10 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, package_rank = get_package_rank(process_info); #if OPAL_OFI_PCI_DATA_AVAILABLE + /** + * If provider PCI BDF information is available, we calculate its physical distance + * to the current process, and select the provider with the shortest distance. + */ ret = get_nearest_nic(opal_hwloc_topology, provider_list, num_providers, package_rank, &provider); if (OPAL_SUCCESS == ret) { diff --git a/opal/mca/common/ofi/common_ofi.h b/opal/mca/common/ofi/common_ofi.h index 10992410019..edc5be8a1fe 100644 --- a/opal/mca/common/ofi/common_ofi.h +++ b/opal/mca/common/ofi/common_ofi.h @@ -103,47 +103,47 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item); /** * Selects NIC (provider) based on hardware locality * - * In multi-nic situations, use hardware topology to pick the "best" - * of the selected NICs. - * There are 3 main cases that this covers: - * - * 1. If the first provider passed into this function is the only valid - * provider, this provider is returned. - * - * 2. If there is more than 1 provider that matches the type of the first - * provider in the list, and the BDF data - * is available then a provider is selected based on locality of device - * cpuset and process cpuset and tries to ensure that processes - * are distributed evenly across NICs. This has two separate - * cases: - * - * i. There is one or more provider local to the process: - * - * (local rank % number of providers of the same type - * that share the process cpuset) is used to select one - * of these providers. - * - * ii. There is no provider that is local to the process: - * - * (local rank % number of providers of the same type) - * is used to select one of these providers - * - * 3. If there is more than 1 providers of the same type in the - * list, and the BDF data is not available (the ofi version does - * not support fi_info.nic or the provider does not support BDF) - * then (local rank % number of providers of the same type) is - * used to select one of these providers - * - * @param provider_list (IN) struct fi_info* An initially selected - * provider NIC. The provider name and - * attributes are used to restrict NIC - * selection. This provider is returned if the - * NIC selection fails. - * - * @param provider (OUT) struct fi_info* object with the selected - * provider if the selection succeeds - * if the selection fails, returns the fi_info - * object that was initially provided. + * The selection is based on the following priority: + * + * Single-NIC: + * + * If only 1 provider is available, always return that provider. + * + * Multi-NIC: + * + * 1. If the process is NOT bound, pick a NIC using (local rank % number + * of providers of the same type). This gives a fair chance to each + * qualified NIC and balances overall utilization. + * + * 2. If the process is bound, we compare providers in the list that have + * the same type as the first provider, and find the provider with the + * shortest distance to the current process. + * + * i. If the provider has PCI BDF data, we attempt to compute the + * distance between the NIC and the current process cpuset. The NIC + * with the shortest distance is returned. + * + * * For equidistant NICs, we select a NIC in round-robin fashion + * using the package rank of the current process, i.e. (package + * rank % number of providers with the same distance). + * + * ii. If we cannot compute the distance between the NIC and the + * current process, e.g. PCI BDF data is not available, a NIC will be + * selected in a round-robin fashion using package rank, i.e. (package + * rank % number of providers of the same type). + * + * @param[in] provider_list struct fi_info* An initially selected + * provider NIC. The provider name and + * attributes are used to restrict NIC + * selection. This provider is returned if the + * NIC selection fails. + * + * @param[in] process_info opal_process_info_t* The current process info + * + * @param[out] provider struct fi_info* object with the selected + * provider if the selection succeeds + * if the selection fails, returns the fi_info + * object that was initially provided. * * All errors should be recoverable and will return the initially provided * provider. However, if an error occurs we can no longer guarantee @@ -152,7 +152,7 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item); * */ OPAL_DECLSPEC struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, - opal_process_info_t *process_info); + opal_process_info_t *process_info); /** * Obtain EP endpoint name