Skip to content

Fix confusion between cpuset and locality #8199

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 3 additions & 16 deletions ompi/dpm/dpm.c
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
if (0 < opal_list_get_size(&ilist)) {
uint32_t *peer_ranks = NULL;
int prn, nprn = 0;
char *val, *mycpuset;
char *val;
uint16_t u16;
opal_process_name_t wildcard_rank;
/* convert the list of new procs to a proc_t array */
Expand All @@ -380,16 +380,6 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
opal_argv_free(peers);
}

/* get my locality string */
val = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
OMPI_PROC_MY_NAME, &val, PMIX_STRING);
if (OPAL_SUCCESS == rc && NULL != val) {
mycpuset = val;
} else {
mycpuset = NULL;
}

i = 0;
OPAL_LIST_FOREACH(cd, &ilist, ompi_dpm_proct_caddy_t) {
proc = cd->p;
Expand All @@ -406,8 +396,8 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
val = NULL;
OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING,
&proc->super.proc_name, &val, OPAL_STRING);
if (OPAL_SUCCESS == rc && NULL != val) {
u16 = opal_hwloc_compute_relative_locality(mycpuset, val);
if (OPAL_SUCCESS == rc && NULL != ompi_process_info.locality) {
u16 = opal_hwloc_compute_relative_locality(ompi_process_info.locality, val);
free(val);
} else {
/* all we can say is that it shares our node */
Expand All @@ -425,9 +415,6 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
}
++i;
}
if (NULL != mycpuset) {
free(mycpuset);
}
if (NULL != peer_ranks) {
free(peer_ranks);
}
Expand Down
16 changes: 11 additions & 5 deletions ompi/runtime/ompi_rte.c
Original file line number Diff line number Diff line change
Expand Up @@ -764,7 +764,7 @@ int ompi_rte_init(int *pargc, char ***pargv)

/* identify our location */
val = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_CPUSET,
&opal_process_info.my_name, &val, PMIX_STRING);
if (PMIX_SUCCESS == rc && NULL != val) {
opal_process_info.cpuset = val;
Expand All @@ -774,6 +774,15 @@ int ompi_rte_init(int *pargc, char ***pargv)
opal_process_info.cpuset = NULL;
opal_process_info.proc_is_bound = false;
}
val = NULL;
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
&opal_process_info.my_name, &val, PMIX_STRING);
if (PMIX_SUCCESS == rc && NULL != val) {
opal_process_info.locality = val;
val = NULL; // protect the string
} else {
opal_process_info.locality = NULL;
}

/* retrieve the local peers - defaults to local node */
val = NULL;
Expand Down Expand Up @@ -811,7 +820,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
&pname, &val, PMIX_STRING);
if (PMIX_SUCCESS == rc && NULL != val) {
u16 = opal_hwloc_compute_relative_locality(opal_process_info.cpuset, val);
u16 = opal_hwloc_compute_relative_locality(opal_process_info.locality, val);
free(val);
} else {
/* all we can say is that it shares our node */
Expand All @@ -826,9 +835,6 @@ int ompi_rte_init(int *pargc, char ***pargv)
ret = opal_pmix_convert_status(rc);
error = "local store of locality";
opal_argv_free(peers);
if (NULL != opal_process_info.cpuset) {
free(opal_process_info.cpuset);
}
goto error;
}
}
Expand Down
4 changes: 2 additions & 2 deletions opal/mca/common/ofi/common_ofi.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2020 Triad National Security, LLC. All rights
Expand Down Expand Up @@ -345,7 +345,7 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
}

// compute relative locality
relative_locality = opal_hwloc_compute_relative_locality(process_info->cpuset, locality_string);
relative_locality = opal_hwloc_compute_relative_locality(process_info->locality, locality_string);
free(locality_string);

if (relative_locality & OPAL_PROC_ON_SOCKET) {
Expand Down
1 change: 1 addition & 0 deletions opal/util/proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ opal_process_info_t opal_process_info = {
.my_local_rank = 0, /* I'm the only process around here */
.my_node_rank = 0,
.cpuset = NULL,
.locality = NULL,
.pid = 0,
.num_procs = 0,
.app_num = 0,
Expand Down
1 change: 1 addition & 0 deletions opal/util/proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ typedef struct opal_process_info_t {
uint16_t my_local_rank; /**< local rank on this node within my job */
uint16_t my_node_rank;
char *cpuset; /**< String-representation of bitmap where we are bound */
char *locality; /**< String-representation of process locality */
pid_t pid;
uint32_t num_procs;
uint32_t app_num;
Expand Down