Skip to content

Commit 7d20b86

Browse files
authored
Merge pull request #11716 from wenduwan/accelerator_awareness
opal/mca/ofi: select NIC closest to accelerator if requested
2 parents e231408 + 66912b9 commit 7d20b86

File tree

1 file changed

+219
-5
lines changed

1 file changed

+219
-5
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 219 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
#include "common_ofi.h"
3232
#include "opal/constants.h"
33+
#include "opal/mca/accelerator/accelerator.h"
3334
#include "opal/mca/base/mca_base_framework.h"
3435
#include "opal/mca/base/mca_base_var.h"
3536
#include "opal/mca/hwloc/base/base.h"
@@ -38,13 +39,15 @@
3839
#include "opal/util/argv.h"
3940
#include "opal/util/show_help.h"
4041

42+
extern opal_accelerator_base_module_t opal_accelerator;
4143
opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL,
4244
.prov_exclude = NULL,
4345
.output = -1};
4446
static const char default_prov_exclude_list[] = "shm,sockets,tcp,udp,rstream,usnic,net";
4547
static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT;
4648
static int opal_common_ofi_verbose_level = 0;
4749
static int opal_common_ofi_init_ref_cnt = 0;
50+
static int opal_common_ofi_accelerator_rank = -1;
4851
#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR
4952
static bool opal_common_ofi_installed_memory_monitor = false;
5053
#endif
@@ -324,6 +327,7 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
324327
static int include_index = -1;
325328
static int exclude_index = -1;
326329
static int verbose_index = -1;
330+
static int accelerator_rank_index = -1;
327331
int ret;
328332

329333
if (fi_version() < FI_VERSION(1, 0)) {
@@ -389,6 +393,19 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
389393
}
390394
}
391395

396+
if (0 > accelerator_rank_index) {
397+
accelerator_rank_index
398+
= mca_base_var_register("opal", "opal_common", "ofi", "accelerator_rank",
399+
"Process rank(non-negative) on the selected accelerator device",
400+
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
401+
OPAL_INFO_LVL_1, MCA_BASE_VAR_SCOPE_LOCAL,
402+
&opal_common_ofi_accelerator_rank);
403+
if (0 > accelerator_rank_index) {
404+
ret = accelerator_rank_index;
405+
goto err;
406+
}
407+
}
408+
392409
if (component) {
393410
ret = mca_base_var_register_synonym(include_index,
394411
component->mca_project_name,
@@ -414,6 +431,15 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
414431
if (0 > ret) {
415432
goto err;
416433
}
434+
435+
ret = mca_base_var_register_synonym(accelerator_rank_index,
436+
component->mca_project_name,
437+
component->mca_type_name,
438+
component->mca_component_name,
439+
"accelerator_rank_index", 0);
440+
if (0 > ret) {
441+
goto err;
442+
}
417443
}
418444

419445
/* The frameworks initialize their output streams during
@@ -915,18 +941,193 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
915941
return (uint32_t) process_info->myprocid.rank;
916942
}
917943

944+
/**
 * @brief Count the number of parent links between a node and one of its ancestors.
 *
 * Starting at @p child, follow the ->parent pointers until @p parent is reached.
 *
 * @param[in]  parent    Object expected to be reached by walking up from @p child
 * @param[in]  child     Object to start walking from
 * @param[out] distance  Number of ->parent hops taken (0 when child == parent);
 *                       only written on success
 * @return OPAL_SUCCESS if @p parent was reached
 *         OPAL_ERROR if the walk hit a NULL object before reaching @p parent
 */
static int get_parent_distance(hwloc_obj_t parent, hwloc_obj_t child, int *distance)
{
    int hops;

    for (hops = 0; child != parent; ++hops) {
        if (NULL == child) {
            /* Ran off the top of the tree without meeting the requested ancestor */
            return OPAL_ERROR;
        }
        child = child->parent;
    }

    *distance = hops;
    return OPAL_SUCCESS;
}
959+
960+
#if OPAL_OFI_PCI_DATA_AVAILABLE
961+
/**
962+
* @brief Attempt to find a nearest provider from the accelerator.
963+
* Check if opal_accelerator is initialized with a valid PCI device, and find a provider from the
964+
* shortest distance.
965+
* Special cases:
966+
* 1. If not accelerator device is available, returns OPAL_ERR_NOT_AVAILABLE.
967+
* 2. If the provider does not have PCI attributers, we do not attempt to make a selection, and
968+
* return OPAL_ERR_NOT_AVAILABLE.
969+
* 3. If there are more than 1 providers with the same equal distance, break the tie using a modulo
970+
* i.e. (local rank on the same accelerator) % (number of nearest providers)
971+
* @param[in] provider_list List of providers
972+
* @param[in] num_providers Number of providers
973+
* @param[in] accl_id Accelerator id
974+
* @param[in] device_rank Local rank on the accelerator
975+
* @param[out] provider Pointer to the selected provider
976+
* @return OPAL_SUCCESS if a provider is successfully selected
977+
* OPAL_ERR_NOT_AVAILABLE if a provider cannot be decided deterministically
978+
* OPAL_ERROR if a fatal error happened
979+
*/
980+
static int find_nearest_provider_from_accelerator(struct fi_info *provider_list,
981+
size_t num_providers,
982+
int accl_id,
983+
uint32_t device_rank,
984+
struct fi_info **provider)
985+
{
986+
hwloc_obj_t accl_dev = NULL, prov_dev = NULL, common_ancestor = NULL;
987+
int ret = -1, accl_distance = -1, prov_distance = -1, min_distance = INT_MAX;
988+
opal_accelerator_pci_attr_t accl_pci_attr = {0};
989+
struct fi_info *current_provider = NULL;
990+
struct fi_pci_attr pci = {0};
991+
uint32_t distances[num_providers], *distance = distances;
992+
uint32_t near_provider_count = 0, provider_rank = 0;
993+
994+
memset(distances, 0, sizeof(distances));
995+
996+
ret = opal_accelerator.get_device_pci_attr(accl_id, &accl_pci_attr);
997+
if (OPAL_SUCCESS != ret) {
998+
opal_output_verbose(1, opal_common_ofi.output,
999+
"%s:%d:Accelerator PCI info is not available", __FILE__, __LINE__);
1000+
return OPAL_ERROR;
1001+
}
1002+
1003+
accl_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, accl_pci_attr.domain_id,
1004+
accl_pci_attr.bus_id, accl_pci_attr.device_id,
1005+
accl_pci_attr.function_id);
1006+
if (NULL == accl_dev) {
1007+
opal_output_verbose(1, opal_common_ofi.output,
1008+
"%s:%d:Failed to find accelerator PCI device", __FILE__, __LINE__);
1009+
return OPAL_ERROR;
1010+
}
1011+
1012+
opal_output_verbose(1, opal_common_ofi.output,
1013+
"%s:%d:Found accelerator device %d: %04x:%02x:%02x.%x VID: %x DID: %x",
1014+
__FILE__, __LINE__, accl_id, accl_pci_attr.domain_id, accl_pci_attr.bus_id,
1015+
accl_pci_attr.device_id, accl_pci_attr.function_id,
1016+
accl_dev->attr->pcidev.vendor_id, accl_dev->attr->pcidev.device_id);
1017+
1018+
current_provider = provider_list;
1019+
while (NULL != current_provider) {
1020+
common_ancestor = NULL;
1021+
if (0 == check_provider_attr(provider_list, current_provider)
1022+
&& OPAL_SUCCESS == get_provider_nic_pci(current_provider, &pci)) {
1023+
prov_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, pci.domain_id, pci.bus_id,
1024+
pci.device_id, pci.function_id);
1025+
if (NULL == prov_dev) {
1026+
opal_output_verbose(1, opal_common_ofi.output,
1027+
"%s:%d:Failed to find provider PCI device", __FILE__, __LINE__);
1028+
return OPAL_ERROR;
1029+
}
1030+
1031+
common_ancestor = hwloc_get_common_ancestor_obj(opal_hwloc_topology, accl_dev,
1032+
prov_dev);
1033+
if (!common_ancestor) {
1034+
opal_output_verbose(
1035+
1, opal_common_ofi.output,
1036+
"%s:%d:Failed to find common ancestor of accelerator and provider PCI device",
1037+
__FILE__, __LINE__);
1038+
/**
1039+
* Return error because any 2 PCI devices should share at least one common ancestor,
1040+
* i.e. root
1041+
*/
1042+
return OPAL_ERROR;
1043+
}
1044+
1045+
ret = get_parent_distance(common_ancestor, accl_dev, &accl_distance);
1046+
if (OPAL_SUCCESS != ret) {
1047+
opal_output_verbose(
1048+
1, opal_common_ofi.output,
1049+
"%s:%d:Failed to get distance between common ancestor and accelerator device",
1050+
__FILE__, __LINE__);
1051+
return OPAL_ERROR;
1052+
}
1053+
1054+
ret = get_parent_distance(common_ancestor, prov_dev, &prov_distance);
1055+
if (OPAL_SUCCESS != ret) {
1056+
opal_output_verbose(
1057+
1, opal_common_ofi.output,
1058+
"%s:%d:Failed to get distance between common ancestor and provider device",
1059+
__FILE__, __LINE__);
1060+
return OPAL_ERROR;
1061+
}
1062+
1063+
if (min_distance > accl_distance + prov_distance) {
1064+
min_distance = accl_distance + prov_distance;
1065+
near_provider_count = 1;
1066+
} else if (min_distance == accl_distance + prov_distance) {
1067+
++near_provider_count;
1068+
}
1069+
}
1070+
1071+
*(distance++) = !common_ancestor ? 0 : accl_distance + prov_distance;
1072+
current_provider = current_provider->next;
1073+
}
1074+
1075+
if (0 == near_provider_count) {
1076+
opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Provider does not have PCI device",
1077+
__FILE__, __LINE__);
1078+
return OPAL_ERR_NOT_AVAILABLE;
1079+
}
1080+
1081+
provider_rank = device_rank % near_provider_count;
1082+
1083+
distance = distances;
1084+
current_provider = provider_list;
1085+
near_provider_count = 0;
1086+
while (NULL != current_provider) {
1087+
if ((uint32_t) min_distance == *(distance++)
1088+
&& provider_rank == near_provider_count++) {
1089+
*provider = current_provider;
1090+
return OPAL_SUCCESS;
1091+
}
1092+
1093+
current_provider = current_provider->next;
1094+
}
1095+
1096+
assert(0 == near_provider_count);
1097+
1098+
return OPAL_ERROR;
1099+
}
1100+
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
1101+
1102+
9181103
struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
9191104
opal_process_info_t *process_info)
9201105
{
921-
int ret, num_providers = 0;
1106+
int ret, num_providers = 0, accel_id = -1;
9221107
struct fi_info *provider = NULL;
923-
uint32_t package_rank = process_info->my_local_rank;
1108+
uint32_t package_rank;
9241109

1110+
/* Current process' local rank on the same package(socket) */
1111+
package_rank = process_info->proc_is_bound ? get_package_rank(process_info)
1112+
: process_info->my_local_rank;
9251113
num_providers = count_providers(provider_list);
926-
if (!process_info->proc_is_bound || 2 > num_providers) {
1114+
1115+
#if OPAL_OFI_PCI_DATA_AVAILABLE
1116+
if (-1 < opal_common_ofi_accelerator_rank) {
1117+
ret = opal_accelerator.get_device(&accel_id);
1118+
if (OPAL_SUCCESS != ret) {
1119+
opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Accelerator is not available",
1120+
__FILE__, __LINE__);
1121+
accel_id = -1;
1122+
}
1123+
}
1124+
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
1125+
1126+
if ((!process_info->proc_is_bound && 0 > accel_id) || 2 > num_providers) {
9271127
goto round_robin;
9281128
}
9291129

1130+
#if OPAL_OFI_PCI_DATA_AVAILABLE
9301131
/* Initialize opal_hwloc_topology if it is not already */
9311132
ret = opal_hwloc_base_get_topology();
9321133
if (0 > ret) {
@@ -935,9 +1136,22 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
9351136
__FILE__, __LINE__);
9361137
}
9371138

938-
package_rank = get_package_rank(process_info);
1139+
if (0 <= accel_id) {
1140+
ret = find_nearest_provider_from_accelerator(provider_list, num_providers, accel_id,
1141+
opal_common_ofi_accelerator_rank, &provider);
1142+
if (OPAL_SUCCESS == ret) {
1143+
goto out;
1144+
}
1145+
1146+
opal_output_verbose(1, opal_common_ofi.output,
1147+
"%s:%d:Failed to find a provider close to the accelerator. Error: %d",
1148+
__FILE__, __LINE__, ret);
1149+
1150+
if (!process_info->proc_is_bound) {
1151+
goto round_robin;
1152+
}
1153+
}
9391154

940-
#if OPAL_OFI_PCI_DATA_AVAILABLE
9411155
/**
9421156
* If provider PCI BDF information is available, we calculate its physical distance
9431157
* to the current process, and select the provider with the shortest distance.

0 commit comments

Comments
 (0)