diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index f0da0f4a52c..c922ea218d4 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -623,10 +623,10 @@ static int get_provider_distance(struct fi_info *provider, hwloc_topology_t topo /** * @brief Get the nearest device to the current thread * - * Use the PMIx server or calculate the device distances, then out of the set of - * returned distances find the subset of the nearest devices. This can be - * 0 or more. - * If there are multiple equidistant devices, break the tie using the rank. + * Compute the distances from the current thread to each NIC in provider_list, + * and select the NIC with the shortest distance. + * If there are multiple equidistant devices, break the tie using local rank + * to balance NIC utilization. * * @param[in] topoloy hwloc topology * @param[in] provider_list List of providers to select from @@ -724,16 +724,46 @@ static int get_nearest_nic(hwloc_topology_t topology, struct fi_info *provider_l return ret; } -static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank, - size_t num_providers) +/** + * @brief Selects a provider from the list in a round-robin fashion + * + * This function implements a round-robin algorithm to select a provider from + * the provided list based on a rank. Only providers of the same type as the + * first provider are eligible for selection. + * + * @param[in] provider_list A list of providers to select from. + * @param[out] rank A rank metric for the current process, such as + * the rank on the same node or CPU package. + * @return Pointer to the selected provider + */ +static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank) { - uint32_t provider_rank = rank % num_providers; - struct fi_info *current_provider = provider_list; + uint32_t provider_rank = 0, current_rank = 0; + size_t num_providers = 0; + struct fi_info *current_provider = NULL; - for (uint32_t i = 0; i < provider_rank; ++i) { + for (current_provider = provider_list; NULL != current_provider;) { + if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)) { + ++num_providers; + } current_provider = current_provider->next; } + current_provider = provider_list; + if (2 > num_providers) { + goto out; + } + + provider_rank = rank % num_providers; + + while (NULL != current_provider) { + if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider) + && provider_rank == current_rank++) { + break; + } + current_provider = current_provider->next; + } +out: return current_provider; } @@ -850,7 +880,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, { int ret, num_providers = 0; struct fi_info *provider = NULL; - uint32_t package_rank = 0; + uint32_t package_rank = process_info->my_local_rank; num_providers = count_providers(provider_list); if (!process_info->proc_is_bound || 2 > num_providers) { @@ -868,6 +898,10 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, package_rank = get_package_rank(process_info); #if OPAL_OFI_PCI_DATA_AVAILABLE + /** + * If provider PCI BDF information is available, we calculate its physical distance + * to the current process, and select the provider with the shortest distance. + */ ret = get_nearest_nic(opal_hwloc_topology, provider_list, num_providers, package_rank, &provider); if (OPAL_SUCCESS == ret) { @@ -876,7 +910,12 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, #endif /* OPAL_OFI_PCI_DATA_AVAILABLE */ round_robin: - provider = select_provider_round_robin(provider_list, package_rank, num_providers); + if (!process_info->proc_is_bound && 1 < num_providers + && opal_output_get_verbosity(opal_common_ofi.output) >= 1) { + opal_show_help("help-common-ofi.txt", "unbound_process", true, 1); + } + + provider = select_provider_round_robin(provider_list, package_rank); out: #if OPAL_ENABLE_DEBUG opal_output_verbose(1, opal_common_ofi.output, "package rank: %d device: %s", package_rank, @@ -950,5 +989,3 @@ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *add } return ret; } - - diff --git a/opal/mca/common/ofi/common_ofi.h b/opal/mca/common/ofi/common_ofi.h index 10992410019..edc5be8a1fe 100644 --- a/opal/mca/common/ofi/common_ofi.h +++ b/opal/mca/common/ofi/common_ofi.h @@ -103,47 +103,47 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item); /** * Selects NIC (provider) based on hardware locality * - * In multi-nic situations, use hardware topology to pick the "best" - * of the selected NICs. - * There are 3 main cases that this covers: - * - * 1. If the first provider passed into this function is the only valid - * provider, this provider is returned. - * - * 2. If there is more than 1 provider that matches the type of the first - * provider in the list, and the BDF data - * is available then a provider is selected based on locality of device - * cpuset and process cpuset and tries to ensure that processes - * are distributed evenly across NICs. This has two separate - * cases: - * - * i. There is one or more provider local to the process: - * - * (local rank % number of providers of the same type - * that share the process cpuset) is used to select one - * of these providers. - * - * ii. There is no provider that is local to the process: - * - * (local rank % number of providers of the same type) - * is used to select one of these providers - * - * 3. If there is more than 1 providers of the same type in the - * list, and the BDF data is not available (the ofi version does - * not support fi_info.nic or the provider does not support BDF) - * then (local rank % number of providers of the same type) is - * used to select one of these providers - * - * @param provider_list (IN) struct fi_info* An initially selected - * provider NIC. The provider name and - * attributes are used to restrict NIC - * selection. This provider is returned if the - * NIC selection fails. - * - * @param provider (OUT) struct fi_info* object with the selected - * provider if the selection succeeds - * if the selection fails, returns the fi_info - * object that was initially provided. + * The selection is based on the following priority: + * + * Single-NIC: + * + * If only 1 provider is available, always return that provider. + * + * Multi-NIC: + * + * 1. If the process is NOT bound, pick a NIC using (local rank % number + * of providers of the same type). This gives a fair chance to each + * qualified NIC and balances overall utilization. + * + * 2. If the process is bound, we compare providers in the list that have + * the same type as the first provider, and find the provider with the + * shortest distance to the current process. + * + * i. If the provider has PCI BDF data, we attempt to compute the + * distance between the NIC and the current process cpuset. The NIC + * with the shortest distance is returned. + * + * * For equidistant NICs, we select a NIC in round-robin fashion + * using the package rank of the current process, i.e. (package + * rank % number of providers with the same distance). + * + * ii. If we cannot compute the distance between the NIC and the + * current process, e.g. PCI BDF data is not available, a NIC will be + * selected in a round-robin fashion using package rank, i.e. (package + * rank % number of providers of the same type). + * + * @param[in] provider_list struct fi_info* An initially selected + * provider NIC. The provider name and + * attributes are used to restrict NIC + * selection. This provider is returned if the + * NIC selection fails. + * + * @param[in] process_info opal_process_info_t* The current process info + * + * @param[out] provider struct fi_info* object with the selected + * provider if the selection succeeds + * if the selection fails, returns the fi_info + * object that was initially provided. * * All errors should be recoverable and will return the initially provided * provider. However, if an error occurs we can no longer guarantee @@ -152,7 +152,7 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item); * */ OPAL_DECLSPEC struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, - opal_process_info_t *process_info); + opal_process_info_t *process_info); /** * Obtain EP endpoint name diff --git a/opal/mca/common/ofi/help-common-ofi.txt b/opal/mca/common/ofi/help-common-ofi.txt index 44366a64c5f..de3630f7e7a 100644 --- a/opal/mca/common/ofi/help-common-ofi.txt +++ b/opal/mca/common/ofi/help-common-ofi.txt @@ -7,6 +7,12 @@ # # $HEADER$ # +[unbound_process] +Open MPI's OFI driver detected multiple NICs on the system but cannot select an +optimal device because the current process is not bound. This may negatively +impact performance. This can be resolved by specifying "--bind-to ..." on +command line. + [package_rank failed] Open MPI's OFI driver detected multiple equidistant NICs from the current process, but had insufficient information to ensure MPI processes fairly pick a NIC for use.