30
30
31
31
#include "common_ofi.h"
32
32
#include "opal/constants.h"
33
+ #include "opal/mca/accelerator/accelerator.h"
33
34
#include "opal/mca/base/mca_base_framework.h"
34
35
#include "opal/mca/base/mca_base_var.h"
35
36
#include "opal/mca/hwloc/base/base.h"
38
39
#include "opal/util/argv.h"
39
40
#include "opal/util/show_help.h"
40
41
42
+ extern opal_accelerator_base_module_t opal_accelerator ;
41
43
opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL ,
42
44
.prov_exclude = NULL ,
43
45
.output = -1 };
44
46
static const char default_prov_exclude_list [] = "shm,sockets,tcp,udp,rstream,usnic,net" ;
45
47
static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT ;
46
48
static int opal_common_ofi_verbose_level = 0 ;
47
49
static int opal_common_ofi_init_ref_cnt = 0 ;
50
+ static int opal_common_ofi_accelerator_rank = -1 ;
48
51
#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR
49
52
static bool opal_common_ofi_installed_memory_monitor = false;
50
53
#endif
@@ -324,6 +327,7 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
324
327
static int include_index = -1 ;
325
328
static int exclude_index = -1 ;
326
329
static int verbose_index = -1 ;
330
+ static int accelerator_rank_index = -1 ;
327
331
int ret ;
328
332
329
333
if (fi_version () < FI_VERSION (1 , 0 )) {
@@ -389,6 +393,19 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
389
393
}
390
394
}
391
395
396
+ if (0 > accelerator_rank_index ) {
397
+ accelerator_rank_index
398
+ = mca_base_var_register ("opal" , "opal_common" , "ofi" , "accelerator_rank" ,
399
+ "Process rank(non-negative) on the selected accelerator device" ,
400
+ MCA_BASE_VAR_TYPE_INT , NULL , 0 , MCA_BASE_VAR_FLAG_SETTABLE ,
401
+ OPAL_INFO_LVL_1 , MCA_BASE_VAR_SCOPE_LOCAL ,
402
+ & opal_common_ofi_accelerator_rank );
403
+ if (0 > accelerator_rank_index ) {
404
+ ret = accelerator_rank_index ;
405
+ goto err ;
406
+ }
407
+ }
408
+
392
409
if (component ) {
393
410
ret = mca_base_var_register_synonym (include_index ,
394
411
component -> mca_project_name ,
@@ -414,6 +431,15 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
414
431
if (0 > ret ) {
415
432
goto err ;
416
433
}
434
+
435
+ ret = mca_base_var_register_synonym (accelerator_rank_index ,
436
+ component -> mca_project_name ,
437
+ component -> mca_type_name ,
438
+ component -> mca_component_name ,
439
+ "accelerator_rank_index" , 0 );
440
+ if (0 > ret ) {
441
+ goto err ;
442
+ }
417
443
}
418
444
419
445
/* The frameworks initialize their output streams during
@@ -915,18 +941,193 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
915
941
return (uint32_t ) process_info -> myprocid .rank ;
916
942
}
917
943
944
+ static int get_parent_distance (hwloc_obj_t parent , hwloc_obj_t child , int * distance )
945
+ {
946
+ int dist = 0 ;
947
+
948
+ while (child != parent ) {
949
+ if (!child ) {
950
+ return OPAL_ERROR ;
951
+ }
952
+ child = child -> parent ;
953
+ ++ dist ;
954
+ }
955
+
956
+ * distance = dist ;
957
+ return OPAL_SUCCESS ;
958
+ }
959
+
960
+ #if OPAL_OFI_PCI_DATA_AVAILABLE
961
+ /**
962
+ * @brief Attempt to find a nearest provider from the accelerator.
963
+ * Check if opal_accelerator is initialized with a valid PCI device, and find a provider from the
964
+ * shortest distance.
965
+ * Special cases:
966
+ * 1. If not accelerator device is available, returns OPAL_ERR_NOT_AVAILABLE.
967
+ * 2. If the provider does not have PCI attributers, we do not attempt to make a selection, and
968
+ * return OPAL_ERR_NOT_AVAILABLE.
969
+ * 3. If there are more than 1 providers with the same equal distance, break the tie using a modulo
970
+ * i.e. (local rank on the same accelerator) % (number of nearest providers)
971
+ * @param[in] provider_list List of providers
972
+ * @param[in] num_providers Number of providers
973
+ * @param[in] accl_id Accelerator id
974
+ * @param[in] device_rank Local rank on the accelerator
975
+ * @param[out] provider Pointer to the selected provider
976
+ * @return OPAL_SUCCESS if a provider is successfully selected
977
+ * OPAL_ERR_NOT_AVAILABLE if a provider cannot be decided deterministically
978
+ * OPAL_ERROR if a fatal error happened
979
+ */
980
+ static int find_nearest_provider_from_accelerator (struct fi_info * provider_list ,
981
+ size_t num_providers ,
982
+ int accl_id ,
983
+ uint32_t device_rank ,
984
+ struct fi_info * * provider )
985
+ {
986
+ hwloc_obj_t accl_dev = NULL , prov_dev = NULL , common_ancestor = NULL ;
987
+ int ret = -1 , accl_distance = -1 , prov_distance = -1 , min_distance = INT_MAX ;
988
+ opal_accelerator_pci_attr_t accl_pci_attr = {0 };
989
+ struct fi_info * current_provider = NULL ;
990
+ struct fi_pci_attr pci = {0 };
991
+ uint32_t distances [num_providers ], * distance = distances ;
992
+ uint32_t near_provider_count = 0 , provider_rank = 0 ;
993
+
994
+ memset (distances , 0 , sizeof (distances ));
995
+
996
+ ret = opal_accelerator .get_device_pci_attr (accl_id , & accl_pci_attr );
997
+ if (OPAL_SUCCESS != ret ) {
998
+ opal_output_verbose (1 , opal_common_ofi .output ,
999
+ "%s:%d:Accelerator PCI info is not available" , __FILE__ , __LINE__ );
1000
+ return OPAL_ERROR ;
1001
+ }
1002
+
1003
+ accl_dev = hwloc_get_pcidev_by_busid (opal_hwloc_topology , accl_pci_attr .domain_id ,
1004
+ accl_pci_attr .bus_id , accl_pci_attr .device_id ,
1005
+ accl_pci_attr .function_id );
1006
+ if (NULL == accl_dev ) {
1007
+ opal_output_verbose (1 , opal_common_ofi .output ,
1008
+ "%s:%d:Failed to find accelerator PCI device" , __FILE__ , __LINE__ );
1009
+ return OPAL_ERROR ;
1010
+ }
1011
+
1012
+ opal_output_verbose (1 , opal_common_ofi .output ,
1013
+ "%s:%d:Found accelerator device %d: %04x:%02x:%02x.%x VID: %x DID: %x" ,
1014
+ __FILE__ , __LINE__ , accl_id , accl_pci_attr .domain_id , accl_pci_attr .bus_id ,
1015
+ accl_pci_attr .device_id , accl_pci_attr .function_id ,
1016
+ accl_dev -> attr -> pcidev .vendor_id , accl_dev -> attr -> pcidev .device_id );
1017
+
1018
+ current_provider = provider_list ;
1019
+ while (NULL != current_provider ) {
1020
+ common_ancestor = NULL ;
1021
+ if (0 == check_provider_attr (provider_list , current_provider )
1022
+ && OPAL_SUCCESS == get_provider_nic_pci (current_provider , & pci )) {
1023
+ prov_dev = hwloc_get_pcidev_by_busid (opal_hwloc_topology , pci .domain_id , pci .bus_id ,
1024
+ pci .device_id , pci .function_id );
1025
+ if (NULL == prov_dev ) {
1026
+ opal_output_verbose (1 , opal_common_ofi .output ,
1027
+ "%s:%d:Failed to find provider PCI device" , __FILE__ , __LINE__ );
1028
+ return OPAL_ERROR ;
1029
+ }
1030
+
1031
+ common_ancestor = hwloc_get_common_ancestor_obj (opal_hwloc_topology , accl_dev ,
1032
+ prov_dev );
1033
+ if (!common_ancestor ) {
1034
+ opal_output_verbose (
1035
+ 1 , opal_common_ofi .output ,
1036
+ "%s:%d:Failed to find common ancestor of accelerator and provider PCI device" ,
1037
+ __FILE__ , __LINE__ );
1038
+ /**
1039
+ * Return error because any 2 PCI devices should share at least one common ancestor,
1040
+ * i.e. root
1041
+ */
1042
+ return OPAL_ERROR ;
1043
+ }
1044
+
1045
+ ret = get_parent_distance (common_ancestor , accl_dev , & accl_distance );
1046
+ if (OPAL_SUCCESS != ret ) {
1047
+ opal_output_verbose (
1048
+ 1 , opal_common_ofi .output ,
1049
+ "%s:%d:Failed to get distance between common ancestor and accelerator device" ,
1050
+ __FILE__ , __LINE__ );
1051
+ return OPAL_ERROR ;
1052
+ }
1053
+
1054
+ ret = get_parent_distance (common_ancestor , prov_dev , & prov_distance );
1055
+ if (OPAL_SUCCESS != ret ) {
1056
+ opal_output_verbose (
1057
+ 1 , opal_common_ofi .output ,
1058
+ "%s:%d:Failed to get distance between common ancestor and provider device" ,
1059
+ __FILE__ , __LINE__ );
1060
+ return OPAL_ERROR ;
1061
+ }
1062
+
1063
+ if (min_distance > accl_distance + prov_distance ) {
1064
+ min_distance = accl_distance + prov_distance ;
1065
+ near_provider_count = 1 ;
1066
+ } else if (min_distance == accl_distance + prov_distance ) {
1067
+ ++ near_provider_count ;
1068
+ }
1069
+ }
1070
+
1071
+ * (distance ++ ) = !common_ancestor ? 0 : accl_distance + prov_distance ;
1072
+ current_provider = current_provider -> next ;
1073
+ }
1074
+
1075
+ if (0 == near_provider_count ) {
1076
+ opal_output_verbose (1 , opal_common_ofi .output , "%s:%d:Provider does not have PCI device" ,
1077
+ __FILE__ , __LINE__ );
1078
+ return OPAL_ERR_NOT_AVAILABLE ;
1079
+ }
1080
+
1081
+ provider_rank = device_rank % near_provider_count ;
1082
+
1083
+ distance = distances ;
1084
+ current_provider = provider_list ;
1085
+ near_provider_count = 0 ;
1086
+ while (NULL != current_provider ) {
1087
+ if ((uint32_t ) min_distance == * (distance ++ )
1088
+ && provider_rank == near_provider_count ++ ) {
1089
+ * provider = current_provider ;
1090
+ return OPAL_SUCCESS ;
1091
+ }
1092
+
1093
+ current_provider = current_provider -> next ;
1094
+ }
1095
+
1096
+ assert (0 == near_provider_count );
1097
+
1098
+ return OPAL_ERROR ;
1099
+ }
1100
+ #endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
1101
+
1102
+
918
1103
struct fi_info * opal_common_ofi_select_provider (struct fi_info * provider_list ,
919
1104
opal_process_info_t * process_info )
920
1105
{
921
- int ret , num_providers = 0 ;
1106
+ int ret , num_providers = 0 , accel_id = -1 ;
922
1107
struct fi_info * provider = NULL ;
923
- uint32_t package_rank = process_info -> my_local_rank ;
1108
+ uint32_t package_rank ;
924
1109
1110
+ /* Current process' local rank on the same package(socket) */
1111
+ package_rank = process_info -> proc_is_bound ? get_package_rank (process_info )
1112
+ : process_info -> my_local_rank ;
925
1113
num_providers = count_providers (provider_list );
926
- if (!process_info -> proc_is_bound || 2 > num_providers ) {
1114
+
1115
+ #if OPAL_OFI_PCI_DATA_AVAILABLE
1116
+ if (-1 < opal_common_ofi_accelerator_rank ) {
1117
+ ret = opal_accelerator .get_device (& accel_id );
1118
+ if (OPAL_SUCCESS != ret ) {
1119
+ opal_output_verbose (1 , opal_common_ofi .output , "%s:%d:Accelerator is not available" ,
1120
+ __FILE__ , __LINE__ );
1121
+ accel_id = -1 ;
1122
+ }
1123
+ }
1124
+ #endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
1125
+
1126
+ if ((!process_info -> proc_is_bound && 0 > accel_id ) || 2 > num_providers ) {
927
1127
goto round_robin ;
928
1128
}
929
1129
1130
+ #if OPAL_OFI_PCI_DATA_AVAILABLE
930
1131
/* Initialize opal_hwloc_topology if it is not already */
931
1132
ret = opal_hwloc_base_get_topology ();
932
1133
if (0 > ret ) {
@@ -935,9 +1136,22 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
935
1136
__FILE__ , __LINE__ );
936
1137
}
937
1138
938
- package_rank = get_package_rank (process_info );
1139
+ if (0 <= accel_id ) {
1140
+ ret = find_nearest_provider_from_accelerator (provider_list , num_providers , accel_id ,
1141
+ opal_common_ofi_accelerator_rank , & provider );
1142
+ if (OPAL_SUCCESS == ret ) {
1143
+ goto out ;
1144
+ }
1145
+
1146
+ opal_output_verbose (1 , opal_common_ofi .output ,
1147
+ "%s:%d:Failed to find a provider close to the accelerator. Error: %d" ,
1148
+ __FILE__ , __LINE__ , ret );
1149
+
1150
+ if (!process_info -> proc_is_bound ) {
1151
+ goto round_robin ;
1152
+ }
1153
+ }
939
1154
940
- #if OPAL_OFI_PCI_DATA_AVAILABLE
941
1155
/**
942
1156
* If provider PCI BDF information is available, we calculate its physical distance
943
1157
* to the current process, and select the provider with the shortest distance.
0 commit comments