Skip to content

ofi nic selection bugfix and document update #12144

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 50 additions & 13 deletions opal/mca/common/ofi/common_ofi.c
Original file line number Diff line number Diff line change
Expand Up @@ -623,10 +623,10 @@ static int get_provider_distance(struct fi_info *provider, hwloc_topology_t topo
/**
* @brief Get the nearest device to the current thread
*
* Use the PMIx server or calculate the device distances, then out of the set of
* returned distances find the subset of the nearest devices. This can be
* 0 or more.
* If there are multiple equidistant devices, break the tie using the rank.
* Compute the distances from the current thread to each NIC in provider_list,
* and select the NIC with the shortest distance.
* If there are multiple equidistant devices, break the tie using local rank
* to balance NIC utilization.
*
* @param[in] topoloy hwloc topology
* @param[in] provider_list List of providers to select from
Expand Down Expand Up @@ -724,16 +724,46 @@ static int get_nearest_nic(hwloc_topology_t topology, struct fi_info *provider_l
return ret;
}

static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank,
size_t num_providers)
/**
* @brief Selects a provider from the list in a round-robin fashion
*
* This function implements a round-robin algorithm to select a provider from
* the provided list based on a rank. Only providers of the same type as the
* first provider are eligible for selection.
*
* @param[in] provider_list A list of providers to select from.
* @param[out] rank A rank metric for the current process, such as
* the rank on the same node or CPU package.
* @return Pointer to the selected provider
*/
static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank)
{
uint32_t provider_rank = rank % num_providers;
struct fi_info *current_provider = provider_list;
uint32_t provider_rank = 0, current_rank = 0;
size_t num_providers = 0;
struct fi_info *current_provider = NULL;

for (uint32_t i = 0; i < provider_rank; ++i) {
for (current_provider = provider_list; NULL != current_provider;) {
if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)) {
++num_providers;
}
current_provider = current_provider->next;
}

current_provider = provider_list;
if (2 > num_providers) {
goto out;
}

provider_rank = rank % num_providers;

while (NULL != current_provider) {
if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)
&& provider_rank == current_rank++) {
break;
}
current_provider = current_provider->next;
}
out:
return current_provider;
}

Expand Down Expand Up @@ -850,7 +880,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
{
int ret, num_providers = 0;
struct fi_info *provider = NULL;
uint32_t package_rank = 0;
uint32_t package_rank = process_info->my_local_rank;

num_providers = count_providers(provider_list);
if (!process_info->proc_is_bound || 2 > num_providers) {
Expand All @@ -868,6 +898,10 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
package_rank = get_package_rank(process_info);

#if OPAL_OFI_PCI_DATA_AVAILABLE
/**
* If provider PCI BDF information is available, we calculate its physical distance
* to the current process, and select the provider with the shortest distance.
*/
ret = get_nearest_nic(opal_hwloc_topology, provider_list, num_providers, package_rank,
&provider);
if (OPAL_SUCCESS == ret) {
Expand All @@ -876,7 +910,12 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */

round_robin:
provider = select_provider_round_robin(provider_list, package_rank, num_providers);
if (!process_info->proc_is_bound && 1 < num_providers
&& opal_output_get_verbosity(opal_common_ofi.output) >= 1) {
opal_show_help("help-common-ofi.txt", "unbound_process", true, 1);
}

provider = select_provider_round_robin(provider_list, package_rank);
out:
#if OPAL_ENABLE_DEBUG
opal_output_verbose(1, opal_common_ofi.output, "package rank: %d device: %s", package_rank,
Expand Down Expand Up @@ -950,5 +989,3 @@ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *add
}
return ret;
}


84 changes: 42 additions & 42 deletions opal/mca/common/ofi/common_ofi.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,47 +103,47 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item);
/**
* Selects NIC (provider) based on hardware locality
*
* In multi-nic situations, use hardware topology to pick the "best"
* of the selected NICs.
* There are 3 main cases that this covers:
*
* 1. If the first provider passed into this function is the only valid
* provider, this provider is returned.
*
* 2. If there is more than 1 provider that matches the type of the first
* provider in the list, and the BDF data
* is available then a provider is selected based on locality of device
* cpuset and process cpuset and tries to ensure that processes
* are distributed evenly across NICs. This has two separate
* cases:
*
* i. There is one or more provider local to the process:
*
* (local rank % number of providers of the same type
* that share the process cpuset) is used to select one
* of these providers.
*
* ii. There is no provider that is local to the process:
*
* (local rank % number of providers of the same type)
* is used to select one of these providers
*
* 3. If there is more than 1 providers of the same type in the
* list, and the BDF data is not available (the ofi version does
* not support fi_info.nic or the provider does not support BDF)
* then (local rank % number of providers of the same type) is
* used to select one of these providers
*
* @param provider_list (IN) struct fi_info* An initially selected
* provider NIC. The provider name and
* attributes are used to restrict NIC
* selection. This provider is returned if the
* NIC selection fails.
*
* @param provider (OUT) struct fi_info* object with the selected
* provider if the selection succeeds
* if the selection fails, returns the fi_info
* object that was initially provided.
* The selection is based on the following priority:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jsquyres Does this explain the NIC selection well enough? What else do you think will make it better?

*
* Single-NIC:
*
* If only 1 provider is available, always return that provider.
*
* Multi-NIC:
*
* 1. If the process is NOT bound, pick a NIC using (local rank % number
* of providers of the same type). This gives a fair chance to each
* qualified NIC and balances overall utilization.
*
* 2. If the process is bound, we compare providers in the list that have
* the same type as the first provider, and find the provider with the
* shortest distance to the current process.
*
* i. If the provider has PCI BDF data, we attempt to compute the
* distance between the NIC and the current process cpuset. The NIC
* with the shortest distance is returned.
*
* * For equidistant NICs, we select a NIC in round-robin fashion
* using the package rank of the current process, i.e. (package
* rank % number of providers with the same distance).
*
* ii. If we cannot compute the distance between the NIC and the
* current process, e.g. PCI BDF data is not available, a NIC will be
* selected in a round-robin fashion using package rank, i.e. (package
* rank % number of providers of the same type).
*
* @param[in] provider_list struct fi_info* An initially selected
* provider NIC. The provider name and
* attributes are used to restrict NIC
* selection. This provider is returned if the
* NIC selection fails.
*
* @param[in] process_info opal_process_info_t* The current process info
*
* @param[out] provider struct fi_info* object with the selected
* provider if the selection succeeds
* if the selection fails, returns the fi_info
* object that was initially provided.
*
* All errors should be recoverable and will return the initially provided
* provider. However, if an error occurs we can no longer guarantee
Expand All @@ -152,7 +152,7 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item);
*
*/
OPAL_DECLSPEC struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
opal_process_info_t *process_info);
opal_process_info_t *process_info);

/**
* Obtain EP endpoint name
Expand Down
6 changes: 6 additions & 0 deletions opal/mca/common/ofi/help-common-ofi.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@
#
# $HEADER$
#
[unbound_process]
Open MPI's OFI driver detected multiple NICs on the system but cannot select an
optimal device because the current process is not bound. This may negatively
impact performance. This can be resolved by specifying "--bind-to ..." on
command line.

[package_rank failed]
Open MPI's OFI driver detected multiple equidistant NICs from the current process,
but had insufficient information to ensure MPI processes fairly pick a NIC for use.
Expand Down