Skip to content

Commit af181c3

Browse files
committed
opal/mca/ofi: select NIC closest to accelerator if requested
When an accelerator is requested, select the NIC closest to the accelerator device. If the accelerator or NIC PCI information is not available, fall back to selecting the NIC on the closest package. Signed-off-by: Wenduo Wang <[email protected]>
1 parent c29f239 commit af181c3

File tree

1 file changed

+195
-5
lines changed

1 file changed

+195
-5
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 195 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
#include "common_ofi.h"
3232
#include "opal/constants.h"
33+
#include "opal/mca/accelerator/accelerator.h"
3334
#include "opal/mca/base/mca_base_framework.h"
3435
#include "opal/mca/base/mca_base_var.h"
3536
#include "opal/mca/hwloc/base/base.h"
@@ -38,6 +39,7 @@
3839
#include "opal/util/argv.h"
3940
#include "opal/util/show_help.h"
4041

42+
extern opal_accelerator_base_module_t opal_accelerator;
4143
opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL,
4244
.prov_exclude = NULL,
4345
.output = -1};
@@ -915,15 +917,184 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
915917
return (uint32_t) process_info->myprocid.rank;
916918
}
917919

920+
/**
 * @brief Compute a non-negative depth for an hwloc object.
 *
 * If the object itself reports a non-negative depth, that value is used
 * directly. Otherwise the parent chain is walked upwards until an ancestor
 * with a non-negative depth is found, and the result is that ancestor's depth
 * plus the number of parent links traversed to reach it.
 *
 * @param[in]  obj    hwloc object to inspect (must not be NULL)
 * @param[out] depth  receives the computed depth on success
 * @return OPAL_SUCCESS if a depth could be determined
 *         OPAL_ERROR if neither the object nor any ancestor has a
 *         non-negative depth
 */
static int get_obj_depth(hwloc_obj_t obj, int *depth)
{
    int hops = 0;

    /*
     * For hwloc < 2.0, depth is unsigned type, but it could store a negative
     * value, hence the explicit cast to int before the sign test.
     *
     * The walk must advance from the ancestor currently being examined
     * (cur->parent), not from the starting object; otherwise the loop would
     * never terminate when the first parent also has a negative depth.
     */
    for (hwloc_obj_t cur = obj; NULL != cur; cur = cur->parent, ++hops) {
        if (0 <= (int) cur->depth) {
            *depth = (int) cur->depth + hops;
            return OPAL_SUCCESS;
        }
    }

    return OPAL_ERROR;
}
943+
944+
#if OPAL_OFI_PCI_DATA_AVAILABLE
945+
/**
946+
* @brief Attempt to find a nearest provider from the accelerator.
947+
* Check if opal_accelerator is initialized with a valid PCI device, and find a provider from the
948+
* shortest distance.
949+
* Special cases:
950+
* 1. If not accelerator device is available, returns OPAL_ERR_NOT_AVAILABLE.
951+
* 2. If the provider does not have PCI attributers, we do not attempt to make a selection, and
952+
* return OPAL_ERR_NOT_AVAILABLE.
953+
* 3. If there are more than 1 providers with the same equal distance, break the tie using a modulo
954+
* i.e. (local rank on the same accelerator) % (number of nearest providers)
955+
* @param[in] provider_list linked list of providers
956+
* @param[in] num_providers number of providers
957+
* @param[in] accl_id Accelerator id
958+
* @param[in] device_rank local rank on the accelerator
959+
* @param[out] provider pointer to the selected provider
960+
* @return OPAL_SUCCESS if a provider is successfully selected
961+
* OPAL_ERR_NOT_AVAILABLE if a provider cannot be decided deterministically
962+
* OPAL_ERROR if a fatal error happened
963+
*/
964+
static int find_nearest_provider_from_accelerator(struct fi_info *provider_list,
965+
size_t num_providers,
966+
int accl_id,
967+
uint32_t device_rank,
968+
struct fi_info **provider)
969+
{
970+
hwloc_obj_t accl_dev = NULL, prov_dev = NULL, common_ancestor = NULL;
971+
int ret = -1, depth = -1, max_common_ancestor_depth = -1;
972+
opal_accelerator_pci_attr_t accl_pci_attr = {0};
973+
struct fi_info *current_provider = NULL;
974+
struct fi_pci_attr pci = {0};
975+
uint32_t near_provider_count = 0, provider_rank = 0;
976+
uint32_t distances[num_providers], *distance = distances;
977+
978+
memset(distances, 0, sizeof(distances));
979+
980+
ret = opal_accelerator.get_device_pci_attr(accl_id, &accl_pci_attr);
981+
if (OPAL_SUCCESS != ret) {
982+
opal_output_verbose(1, opal_common_ofi.output,
983+
"%s:%d:Accelerator PCI info is not available", __FILE__, __LINE__);
984+
return OPAL_ERROR;
985+
}
986+
987+
accl_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, accl_pci_attr.domain_id,
988+
accl_pci_attr.bus_id, accl_pci_attr.device_id,
989+
accl_pci_attr.function_id);
990+
if (NULL == accl_dev) {
991+
opal_output_verbose(1, opal_common_ofi.output,
992+
"%s:%d:Failed to find accelerator PCI device", __FILE__, __LINE__);
993+
return OPAL_ERROR;
994+
}
995+
996+
opal_output_verbose(1, opal_common_ofi.output,
997+
"%s:%d:Found accelerator device %d: %04x:%02x:%02x.%x VID: %x DID: %x",
998+
__FILE__, __LINE__, accl_id, accl_pci_attr.domain_id, accl_pci_attr.bus_id,
999+
accl_pci_attr.device_id, accl_pci_attr.function_id,
1000+
accl_dev->attr->pcidev.vendor_id, accl_dev->attr->pcidev.device_id);
1001+
1002+
current_provider = provider_list;
1003+
while (NULL != current_provider) {
1004+
common_ancestor = NULL;
1005+
if (0 == check_provider_attr(provider_list, current_provider)
1006+
&& OPAL_SUCCESS == get_provider_nic_pci(current_provider, &pci)) {
1007+
prov_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, pci.domain_id, pci.bus_id,
1008+
pci.device_id, pci.function_id);
1009+
if (NULL == prov_dev) {
1010+
opal_output_verbose(1, opal_common_ofi.output,
1011+
"%s:%d:Failed to find provider PCI device", __FILE__, __LINE__);
1012+
return OPAL_ERROR;
1013+
}
1014+
1015+
common_ancestor = hwloc_get_common_ancestor_obj(opal_hwloc_topology, accl_dev,
1016+
prov_dev);
1017+
if (!common_ancestor) {
1018+
opal_output_verbose(
1019+
1, opal_common_ofi.output,
1020+
"%s:%d:Failed to find common ancestor of accelerator and provider PCI device",
1021+
__FILE__, __LINE__);
1022+
/**
1023+
* Return error because any 2 PCI devices should share at least one common ancestor,
1024+
* i.e. root
1025+
*/
1026+
return OPAL_ERROR;
1027+
}
1028+
1029+
ret = get_obj_depth(common_ancestor, &depth);
1030+
if (OPAL_SUCCESS != ret) {
1031+
opal_output_verbose(1, opal_common_ofi.output,
1032+
"%s:%d:Failed to get common ancestor depth", __FILE__,
1033+
__LINE__);
1034+
return OPAL_ERROR;
1035+
}
1036+
1037+
if (max_common_ancestor_depth < depth) {
1038+
max_common_ancestor_depth = depth;
1039+
near_provider_count = 1;
1040+
} else if (max_common_ancestor_depth == depth) {
1041+
++near_provider_count;
1042+
}
1043+
}
1044+
1045+
*(distance++) = !common_ancestor ? 0 : depth;
1046+
current_provider = current_provider->next;
1047+
}
1048+
1049+
if (0 == near_provider_count || 0 > max_common_ancestor_depth) {
1050+
opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Provider does not have PCI device",
1051+
__FILE__, __LINE__);
1052+
return OPAL_ERR_NOT_AVAILABLE;
1053+
}
1054+
1055+
provider_rank = device_rank % near_provider_count;
1056+
1057+
distance = distances;
1058+
current_provider = provider_list;
1059+
while (NULL != current_provider) {
1060+
if ((uint32_t) max_common_ancestor_depth == *(distance++)
1061+
&& provider_rank == --near_provider_count) {
1062+
*provider = current_provider;
1063+
return OPAL_SUCCESS;
1064+
}
1065+
1066+
current_provider = current_provider->next;
1067+
}
1068+
1069+
assert(0 == near_provider_count);
1070+
1071+
return OPAL_ERROR;
1072+
}
1073+
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
1074+
1075+
9181076
struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
9191077
opal_process_info_t *process_info)
9201078
{
921-
int ret, num_providers = 0;
1079+
int ret, num_providers = 0, accel_id = -1;
9221080
struct fi_info *provider = NULL;
923-
uint32_t package_rank = process_info->my_local_rank;
1081+
uint32_t package_rank;
9241082

1083+
/* Current process' local rank on the same package(socket) */
1084+
package_rank = process_info->proc_is_bound ? get_package_rank(process_info)
1085+
: process_info->my_local_rank;
9251086
num_providers = count_providers(provider_list);
926-
if (!process_info->proc_is_bound || 2 > num_providers) {
1087+
1088+
#if OPAL_OFI_PCI_DATA_AVAILABLE
1089+
ret = opal_accelerator.get_device(&accel_id);
1090+
if (OPAL_SUCCESS != ret) {
1091+
opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Accelerator is not available",
1092+
__FILE__, __LINE__);
1093+
accel_id = -1;
1094+
}
1095+
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
1096+
1097+
if ((!process_info->proc_is_bound && 0 > accel_id) || 2 > num_providers) {
9271098
goto round_robin;
9281099
}
9291100

@@ -935,9 +1106,28 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
9351106
__FILE__, __LINE__);
9361107
}
9371108

938-
package_rank = get_package_rank(process_info);
939-
9401109
#if OPAL_OFI_PCI_DATA_AVAILABLE
1110+
if (0 <= accel_id) {
1111+
/**
1112+
* If accelerator is enabled, select the closest provider to the accelerator.
1113+
* Note: the function expects a local rank on the accelerator to break ties if there are
1114+
* multiple equidistant providers. package_rank is NOT an accurate measure, but a proxy.
1115+
*/
1116+
ret = find_nearest_provider_from_accelerator(provider_list, num_providers, accel_id,
1117+
package_rank, &provider);
1118+
if (OPAL_SUCCESS == ret) {
1119+
goto out;
1120+
}
1121+
1122+
opal_output_verbose(1, opal_common_ofi.output,
1123+
"%s:%d:Failed to find a provider close to the accelerator. Error: %d",
1124+
__FILE__, __LINE__, ret);
1125+
1126+
if (!process_info->proc_is_bound) {
1127+
goto round_robin;
1128+
}
1129+
}
1130+
9411131
/**
9421132
* If provider PCI BDF information is available, we calculate its physical distance
9431133
* to the current process, and select the provider with the shortest distance.

0 commit comments

Comments
 (0)