Commit fd413e6

topo/treematch: fix topo_treematch_distgraph_create
Signed-off-by: Gilles Gouaillardet <[email protected]>
Parent: 0f93099

1 file changed: +67 -80 lines changed


ompi/mca/topo/treematch/topo_treematch_dist_graph_create.c

@@ -6,7 +6,7 @@
  * Copyright (c) 2011-2015 INRIA. All rights reserved.
  * Copyright (c) 2012-2015 Bordeaux Polytechnic Institute
  * Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
- * Copyright (c) 2015-2016 Research Organization for Information Science
+ * Copyright (c) 2015-2017 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2016      Los Alamos National Security, LLC. All rights
  *                         reserved.
@@ -55,7 +55,16 @@
 #define MY_STRING_SIZE 64
 /*#define __DEBUG__ 1 */
 
-
+/**
+ * This function is an allreduce between all processes to detect oversubscription.
+ * On each node, local_procs is a different array that contains only the
+ * local processes. The local root computes the node's oversubscription and
+ * brings this value to the operation, while every other process on the node
+ * contributes 0.
+ * Doing an allreduce might be overkill for this situation, but it should remain
+ * more scalable than a star reduction between the roots of each node
+ * (nodes_roots), followed by a bcast to all processes.
+ */
 static int check_oversubscribing(int rank,
                                  int num_nodes,
                                  int num_objs_in_node,
@@ -64,48 +73,13 @@ static int check_oversubscribing(int rank,
                                  int *local_procs,
                                  ompi_communicator_t *comm_old)
 {
-    int oversubscribed = 0;
-    int local_oversub = 0;
-    int err;
+    int oversubscribed = 0, local_oversub = 0, err;
 
+    /* Only a single process per node, the local root, computes the oversubscription condition */
     if (rank == local_procs[0])
         if(num_objs_in_node < num_procs_in_node)
             local_oversub = 1;
 
-    if (rank == 0) {
-        MPI_Request *reqs = (MPI_Request *)calloc(num_nodes-1, sizeof(MPI_Request));
-        int *oversub = (int *)calloc(num_nodes, sizeof(int));
-        int i;
-
-        oversub[0] = local_oversub;
-        for(i = 1; i < num_nodes; i++)
-            if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(&oversub[i], 1, MPI_INT,
-                                                           nodes_roots[i], 111, comm_old, &reqs[i-1])))) {
-                /* NTH: more needs to be done to correctly clean up here */
-                free (reqs);
-                free (oversub);
-                return err;
-            }
-
-        if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_nodes-1,
-                                                          reqs, MPI_STATUSES_IGNORE))) {
-            /* NTH: more needs to be done to correctly clean up here */
-            free (reqs);
-            free (oversub);
-            return err;
-        }
-
-        for(i = 0; i < num_nodes; i++)
-            oversubscribed += oversub[i];
-
-        free(oversub);
-        free(reqs);
-    } else {
-        if (rank == local_procs[0])
-            if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(&local_oversub, 1, MPI_INT, 0,
-                                                         111, MCA_PML_BASE_SEND_STANDARD, comm_old))))
-                return err;
-    }
 
     if (OMPI_SUCCESS != (err = comm_old->c_coll->coll_bcast(&oversubscribed, 1,
                                                             MPI_INT, 0, comm_old,
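The removed block implemented the detection by hand: rank 0 posted an irecv for every other node root, waited on all of them, and summed the contributions, while the node roots sent their flag with a point-to-point send. The new comment describes the intended replacement as a single allreduce to which only each node's local root contributes a non-zero value. Below is a minimal sketch of that contribution pattern in plain MPI; the variable names are hypothetical, the "one node, rank 0 is the local root" setup is an assumption for brevity, and the actual code goes through OMPI's internal coll framework (comm->c_coll) rather than the public API, so this is an illustration of the pattern, not the commit's code:

/* Sketch: only the local root contributes a non-zero value; a single
 * MPI_SUM allreduce leaves the global oversubscription flag on all
 * ranks, replacing the irecv/send star plus bcast. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    int rank;
    int local_oversub = 0, oversubscribed = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0)               /* stand-in for the node's local root */
        local_oversub = 1;       /* e.g. num_objs_in_node < num_procs_in_node */

    /* every rank calls the collective; non-roots contribute 0 */
    MPI_Allreduce(&local_oversub, &oversubscribed, 1, MPI_INT,
                  MPI_SUM, MPI_COMM_WORLD);

    if (oversubscribed)
        printf("rank %d: at least one node is oversubscribed\n", rank);

    MPI_Finalize();
    return 0;
}

As the comment itself concedes, an allreduce moves more data than strictly necessary, but it is a single well-optimized collective instead of a hand-rolled star that serializes on rank 0.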
@@ -163,7 +137,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
     int num_procs_in_node = 0;
     int rank, size;
     int hwloc_err;
-    int oversubscribing_objs = 0;
+    int oversubscribing_objs = 0, oversubscribed_pus = 0;
     int i, j, idx;
     uint32_t val, *pval;
 
@@ -269,8 +243,12 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
     hwloc_get_cpubind(opal_hwloc_topology,set,0);
     num_pus_in_node = hwloc_get_nbobjs_by_type(opal_hwloc_topology, HWLOC_OBJ_PU);
 
-    if(hwloc_bitmap_isincluded(root_obj->cpuset,set)){
-        /* processes are not bound on the machine */
+    /**
+     * In all situations (including heterogeneous environments) all processes
+     * must execute all the calls that involve collective communications, so we
+     * have to lay out the logic accordingly.
+     */
+    if(hwloc_bitmap_isincluded(root_obj->cpuset,set)){ /* processes are not bound on the machine */
 #ifdef __DEBUG__
         if (0 == rank)
             fprintf(stdout,">>>>>>>>>>>>> Process Not bound <<<<<<<<<<<<<<<\n");
@@ -285,60 +263,70 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
         oversubscribing_objs = check_oversubscribing(rank,num_nodes,
                                                      num_objs_in_node,num_procs_in_node,
                                                      nodes_roots,local_procs,comm_old);
-        if(oversubscribing_objs) {
+    } else { /* the processes are already bound */
+        object = hwloc_get_obj_covering_cpuset(opal_hwloc_topology,set);
+        obj_rank = object->logical_index;
+        effective_depth = object->depth;
+        num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, effective_depth);
+
+        /* Check for oversubscribing */
+        oversubscribing_objs = check_oversubscribing(rank,num_nodes,
+                                                     num_objs_in_node,num_procs_in_node,
+                                                     nodes_roots,local_procs,comm_old);
+    }
+
+    if(oversubscribing_objs) {
+        if(hwloc_bitmap_isincluded(root_obj->cpuset,set)){ /* processes are not bound on the machine */
 #ifdef __DEBUG__
             fprintf(stdout,"Oversubscribing OBJ/CORES resources => Trying to use PUs \n");
 #endif
-            int oversubscribed_pus = check_oversubscribing(rank,num_nodes,
-                                                           num_pus_in_node,num_procs_in_node,
-                                                           nodes_roots,local_procs,comm_old);
-            if (oversubscribed_pus){
-#ifdef __DEBUG__
-                fprintf(stdout,"Oversubscribing PUs resources => Rank Reordering Impossible \n");
-#endif
-                FALLBACK();
-            } else {
+            oversubscribed_pus = check_oversubscribing(rank,num_nodes,
+                                                       num_pus_in_node,num_procs_in_node,
+                                                       nodes_roots,local_procs,comm_old);
+        } else {
+            /* Bound processes will participate with the same data as before */
+            oversubscribed_pus = check_oversubscribing(rank,num_nodes,
+                                                       num_objs_in_node,num_procs_in_node,
+                                                       nodes_roots,local_procs,comm_old);
+        }
+        if (!oversubscribed_pus) {
+            /* Update the data used to compute the correct binding */
+            if(hwloc_bitmap_isincluded(root_obj->cpuset,set)){ /* processes are not bound on the machine */
                 obj_rank = ompi_process_info.my_local_rank%num_pus_in_node;
                 effective_depth = hwloc_topology_get_depth(opal_hwloc_topology) - 1;
                 num_objs_in_node = num_pus_in_node;
 #ifdef __DEBUG__
                 fprintf(stdout,"Process not bound : binding on PU#%i \n",obj_rank);
 #endif
             }
-        } else {
-            obj_rank = ompi_process_info.my_local_rank%num_objs_in_node;
-            effective_depth = depth;
-            object = hwloc_get_obj_by_depth(opal_hwloc_topology,effective_depth,obj_rank);
-            if( NULL == object) FALLBACK();
-
-            hwloc_bitmap_copy(set,object->cpuset);
-            hwloc_bitmap_singlify(set); /* we don't want the process to move */
-            hwloc_err = hwloc_set_cpubind(opal_hwloc_topology,set,0);
-            if( -1 == hwloc_err) FALLBACK();
-#ifdef __DEBUG__
-            fprintf(stdout,"Process not bound : binding on OBJ#%i \n",obj_rank);
-#endif
         }
-    } else { /* the processes are already bound */
-        object = hwloc_get_obj_covering_cpuset(opal_hwloc_topology,set);
-        obj_rank = object->logical_index;
-        effective_depth = object->depth;
-        num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, effective_depth);
+    }
 
-        /* Check for oversubscribing */
-        oversubscribing_objs = check_oversubscribing(rank,num_nodes,
-                                                     num_objs_in_node,num_procs_in_node,
-                                                     nodes_roots,local_procs,comm_old);
-        if(oversubscribing_objs) {
+    if( !oversubscribing_objs && !oversubscribed_pus ) {
+        if( hwloc_bitmap_isincluded(root_obj->cpuset,set) ) { /* processes are not bound on the machine */
+            obj_rank = ompi_process_info.my_local_rank%num_objs_in_node;
+            effective_depth = depth;
+            object = hwloc_get_obj_by_depth(opal_hwloc_topology,effective_depth,obj_rank);
+            if( NULL == object) FALLBACK();
+
+            hwloc_bitmap_copy(set,object->cpuset);
+            hwloc_bitmap_singlify(set); /* we don't want the process to move */
+            hwloc_err = hwloc_set_cpubind(opal_hwloc_topology,set,0);
+            if( -1 == hwloc_err) FALLBACK();
+#ifdef __DEBUG__
+            fprintf(stdout,"Process not bound : binding on OBJ#%i \n",obj_rank);
+#endif
+        } else {
 #ifdef __DEBUG__
-            fprintf(stdout,"Oversubscribing OBJ/CORES resources => Rank Reordering Impossible\n");
+            fprintf(stdout,"Process %i bound on OBJ #%i \n",rank,obj_rank);
+            fprintf(stdout,"=====> Num obj in node : %i | num pus in node : %i\n",num_objs_in_node,num_pus_in_node);
 #endif
-            FALLBACK();
         }
+    } else {
 #ifdef __DEBUG__
-        fprintf(stdout,"Process %i bound on OBJ #%i \n",rank,obj_rank);
-        fprintf(stdout,"=====> Num obj in node : %i | num pus in node : %i\n",num_objs_in_node,num_pus_in_node);
+        fprintf(stdout,"Oversubscribing PUs resources => Rank Reordering Impossible \n");
 #endif
+        FALLBACK();
     }
 
     reqs = (MPI_Request *)calloc(num_procs_in_node-1,sizeof(MPI_Request));
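This hunk is the heart of the fix. In the old layout, check_oversubscribing was called inside branches that depend on whether the process is bound, and FALLBACK() could bail out before a collective that other ranks still expected; since the function contains collective communication, ranks on differently configured (e.g. heterogeneous) nodes could take different paths and hang. The new layout makes every rank call each collective the same number of times and lets the branches select only the arguments. A small stand-alone sketch of that rule in plain MPI, with an invented bound predicate, just to illustrate the control-flow discipline the patch enforces:

/* Sketch: branch to choose the collective's *input*, never around the
 * collective itself. Placing MPI_Allreduce inside only one branch would
 * deadlock as soon as some ranks take the other branch. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    int rank, bound, contribution, total;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    bound = (rank % 2 == 0);      /* stand-in for "this process is bound" */

    /* the branch only picks branch-specific data ... */
    if (bound)
        contribution = 1;         /* e.g. objects visible to a bound rank */
    else
        contribution = 2;         /* e.g. objects visible to an unbound rank */

    /* ... and ALL ranks reach the collective, exactly once */
    MPI_Allreduce(&contribution, &total, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    printf("rank %d: total = %d\n", rank, total);
    MPI_Finalize();
    return 0;
}

The same discipline explains why oversubscribed_pus was hoisted out of the unbound-only branch into the function-level declarations: both branches now feed it, and the final binding decision tests the two flags after all ranks have passed through the same collectives.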
@@ -493,7 +481,6 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
     for(i = 1; i < num_nodes ; i++)
         displs[i] = displs[i-1] + objs_per_node[i-1];
 
-    memset(reqs,0,(num_nodes-1)*sizeof(MPI_Request));
     memcpy(obj_mapping,obj_to_rank_in_comm,objs_per_node[0]*sizeof(int));
     for(i = 1; i < num_nodes ; i++)
         if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(obj_mapping + displs[i], objs_per_node[i], MPI_INT,
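The dropped memset appears redundant under the invariant visible elsewhere in the diff: reqs is obtained from calloc, which already returns zero-initialized memory, so zeroing it again buys nothing (the commit itself gives no rationale, so this reading is an inference from the surrounding code). A tiny self-contained illustration of that calloc guarantee:

/* calloc zeroes its allocation, so a follow-up memset(buf, 0, ...)
 * over the same bytes is a no-op. */
#include <assert.h>
#include <stdlib.h>

int main(void)
{
    size_t n = 8;
    int *buf = (int *)calloc(n, sizeof(int));
    assert(buf != NULL);
    for (size_t i = 0; i < n; i++)
        assert(0 == buf[i]);   /* every element is already zero */
    free(buf);
    return 0;
}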
