Skip to content

Commit 18a7064

Browse files
authored
Merge pull request #11710 from wenduwan/fix_package_rank
opal/ofi: follow up fixes to package rank calculation
2 parents 79ea49f + 1a1b84a commit 18a7064

File tree

1 file changed, with 14 additions (+14) and 7 deletions (-7).

1 file changed, with 14 additions (+14) and 7 deletions (-7).

opal/mca/common/ofi/common_ofi.c

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -713,7 +713,7 @@ static int count_providers(struct fi_info *provider_list)
713713
*/
714714
static uint32_t get_package_rank(opal_process_info_t *process_info)
715715
{
716-
int i;
716+
int i, level = 10;
717717
uint16_t relative_locality, *package_rank_ptr;
718718
uint32_t ranks_on_package = 0;
719719
opal_process_name_t pname;
@@ -752,10 +752,6 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
752752
for (i = 0; NULL != peers[i]; i++) {
753753
pname.vpid = strtoul(peers[i], NULL, 10);
754754

755-
if ((uint16_t) pname.vpid == process_info->my_local_rank) {
756-
return ranks_on_package;
757-
}
758-
759755
locality_string = NULL;
760756
// Get the LOCALITY_STRING for process[i]
761757
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING, &pname, &locality_string,
@@ -769,12 +765,22 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
769765
locality_string);
770766
free(locality_string);
771767

768+
if ((uint16_t) pname.vpid == process_info->my_local_rank) {
769+
return ranks_on_package;
770+
}
771+
772772
if (relative_locality & OPAL_PROC_ON_SOCKET) {
773773
ranks_on_package++;
774774
}
775775
}
776776
err:
777-
opal_show_help("help-common-ofi.txt", "package_rank failed", true);
777+
if (opal_output_get_verbosity(opal_common_ofi.output) >= level) {
778+
opal_show_help("help-common-ofi.txt", "package_rank failed", true, level);
779+
}
780+
781+
if (locality_string)
782+
free(locality_string);
783+
778784
return (uint32_t) process_info->myprocid.rank;
779785
}
780786

@@ -793,6 +799,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
793799
int ret;
794800
unsigned int num_provider = 0, provider_limit = 0;
795801
bool provider_found = false;
802+
uint32_t package_rank = 0;
796803

797804
/* Initialize opal_hwloc_topology if it is not already */
798805
ret = opal_hwloc_base_get_topology();
@@ -853,9 +860,9 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
853860
}
854861

855862
/* Select provider from local rank % number of providers */
856-
uint32_t package_rank = get_package_rank(process_info);
857863
if (num_provider >= 2) {
858864
// If there are multiple NICs "close" to the process, try to calculate package_rank
865+
package_rank = get_package_rank(process_info);
859866
provider = provider_table[package_rank % num_provider];
860867
} else if (num_provider == 1) {
861868
provider = provider_table[num_provider - 1];

0 commit comments

Comments
 (0)