6
6
* Copyright (c) 2011-2015 INRIA. All rights reserved.
7
7
* Copyright (c) 2012-2015 Bordeaux Polytechnic Institute
8
8
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
9
- * Copyright (c) 2015-2016 Research Organization for Information Science
9
+ * Copyright (c) 2015-2017 Research Organization for Information Science
10
10
* and Technology (RIST). All rights reserved.
11
11
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
12
12
* reserved.
55
55
#define MY_STRING_SIZE 64
56
56
/*#define __DEBUG__ 1 */
57
57
58
-
58
+ /**
59
+ * This function performs an allreduce among all processes to detect oversubscription.
60
+ * On each node, the local_procs will be a different array, that contains only the
61
+ * local processes. Thus, the local root (local_procs[0]) will compute the node oversubscription and will
62
+ * bring this value to the operation, while every other process on the node will
63
+ * contribute 0.
64
+ * Doing an AllReduce might be overkill for this situation, but it should remain
65
+ * more scalable than a star reduction between the roots of each node (nodes_roots),
66
+ * followed by a bcast to all processes.
67
+ */
59
68
static int check_oversubscribing (int rank ,
60
69
int num_nodes ,
61
70
int num_objs_in_node ,
@@ -64,48 +73,13 @@ static int check_oversubscribing(int rank,
64
73
int * local_procs ,
65
74
ompi_communicator_t * comm_old )
66
75
{
67
- int oversubscribed = 0 ;
68
- int local_oversub = 0 ;
69
- int err ;
76
+ int oversubscribed = 0 , local_oversub = 0 , err ;
70
77
78
+ /* Only a single process per node, the local root, computes the oversubscription condition */
71
79
if (rank == local_procs [0 ])
72
80
if (num_objs_in_node < num_procs_in_node )
73
81
local_oversub = 1 ;
74
82
75
- if (rank == 0 ) {
76
- MPI_Request * reqs = (MPI_Request * )calloc (num_nodes - 1 , sizeof (MPI_Request ));
77
- int * oversub = (int * )calloc (num_nodes , sizeof (int ));
78
- int i ;
79
-
80
- oversub [0 ] = local_oversub ;
81
- for (i = 1 ; i < num_nodes ; i ++ )
82
- if (OMPI_SUCCESS != ( err = MCA_PML_CALL (irecv (& oversub [i ], 1 , MPI_INT ,
83
- nodes_roots [i ], 111 , comm_old , & reqs [i - 1 ])))) {
84
- /* NTH: more needs to be done to correctly clean up here */
85
- free (reqs );
86
- free (oversub );
87
- return err ;
88
- }
89
-
90
- if (OMPI_SUCCESS != ( err = ompi_request_wait_all (num_nodes - 1 ,
91
- reqs , MPI_STATUSES_IGNORE ))) {
92
- /* NTH: more needs to be done to correctly clean up here */
93
- free (reqs );
94
- free (oversub );
95
- return err ;
96
- }
97
-
98
- for (i = 0 ; i < num_nodes ; i ++ )
99
- oversubscribed += oversub [i ];
100
-
101
- free (oversub );
102
- free (reqs );
103
- } else {
104
- if (rank == local_procs [0 ])
105
- if (OMPI_SUCCESS != (err = MCA_PML_CALL (send (& local_oversub , 1 , MPI_INT , 0 ,
106
- 111 , MCA_PML_BASE_SEND_STANDARD , comm_old ))))
107
- return err ;
108
- }
109
83
110
84
if (OMPI_SUCCESS != (err = comm_old -> c_coll -> coll_bcast (& oversubscribed , 1 ,
111
85
MPI_INT , 0 , comm_old ,
@@ -163,7 +137,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
163
137
int num_procs_in_node = 0 ;
164
138
int rank , size ;
165
139
int hwloc_err ;
166
- int oversubscribing_objs = 0 ;
140
+ int oversubscribing_objs = 0 , oversubscribed_pus = 0 ;
167
141
int i , j , idx ;
168
142
uint32_t val , * pval ;
169
143
@@ -269,8 +243,12 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
269
243
hwloc_get_cpubind (opal_hwloc_topology ,set ,0 );
270
244
num_pus_in_node = hwloc_get_nbobjs_by_type (opal_hwloc_topology , HWLOC_OBJ_PU );
271
245
272
- if (hwloc_bitmap_isincluded (root_obj -> cpuset ,set )){
273
- /* processes are not bound on the machine */
246
+ /**
247
+ * In all situations (including heterogeneous environments) all processes must execute
248
+ * all the calls that involve collective communications, so we have to lay out the logic
249
+ * accordingly.
250
+ */
251
+ if (hwloc_bitmap_isincluded (root_obj -> cpuset ,set )){ /* processes are not bound on the machine */
274
252
#ifdef __DEBUG__
275
253
if (0 == rank )
276
254
fprintf (stdout ,">>>>>>>>>>>>> Process Not bound <<<<<<<<<<<<<<<\n" );
@@ -285,60 +263,70 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
285
263
oversubscribing_objs = check_oversubscribing (rank ,num_nodes ,
286
264
num_objs_in_node ,num_procs_in_node ,
287
265
nodes_roots ,local_procs ,comm_old );
288
- if (oversubscribing_objs ) {
266
+ } else { /* the processes are already bound */
267
+ object = hwloc_get_obj_covering_cpuset (opal_hwloc_topology ,set );
268
+ obj_rank = object -> logical_index ;
269
+ effective_depth = object -> depth ;
270
+ num_objs_in_node = hwloc_get_nbobjs_by_depth (opal_hwloc_topology , effective_depth );
271
+
272
+ /* Check for oversubscribing */
273
+ oversubscribing_objs = check_oversubscribing (rank ,num_nodes ,
274
+ num_objs_in_node ,num_procs_in_node ,
275
+ nodes_roots ,local_procs ,comm_old );
276
+ }
277
+
278
+ if (oversubscribing_objs ) {
279
+ if (hwloc_bitmap_isincluded (root_obj -> cpuset ,set )){ /* processes are not bound on the machine */
289
280
#ifdef __DEBUG__
290
281
fprintf (stdout ,"Oversubscribing OBJ/CORES resources => Trying to use PUs \n" );
291
282
#endif
292
- int oversubscribed_pus = check_oversubscribing (rank ,num_nodes ,
293
- num_pus_in_node ,num_procs_in_node ,
294
- nodes_roots ,local_procs ,comm_old );
295
- if (oversubscribed_pus ){
296
- #ifdef __DEBUG__
297
- fprintf (stdout ,"Oversubscribing PUs resources => Rank Reordering Impossible \n" );
298
- #endif
299
- FALLBACK ();
300
- } else {
283
+ oversubscribed_pus = check_oversubscribing (rank ,num_nodes ,
284
+ num_pus_in_node ,num_procs_in_node ,
285
+ nodes_roots ,local_procs ,comm_old );
286
+ } else {
287
+ /* Bound processes will participate with the same data as before */
288
+ oversubscribed_pus = check_oversubscribing (rank ,num_nodes ,
289
+ num_objs_in_node ,num_procs_in_node ,
290
+ nodes_roots ,local_procs ,comm_old );
291
+ }
292
+ if (!oversubscribed_pus ) {
293
+ /* Update the data used to compute the correct binding */
294
+ if (hwloc_bitmap_isincluded (root_obj -> cpuset ,set )){ /* processes are not bound on the machine */
301
295
obj_rank = ompi_process_info .my_local_rank %num_pus_in_node ;
302
296
effective_depth = hwloc_topology_get_depth (opal_hwloc_topology ) - 1 ;
303
297
num_objs_in_node = num_pus_in_node ;
304
298
#ifdef __DEBUG__
305
299
fprintf (stdout ,"Process not bound : binding on PU#%i \n" ,obj_rank );
306
300
#endif
307
301
}
308
- } else {
309
- obj_rank = ompi_process_info .my_local_rank %num_objs_in_node ;
310
- effective_depth = depth ;
311
- object = hwloc_get_obj_by_depth (opal_hwloc_topology ,effective_depth ,obj_rank );
312
- if ( NULL == object ) FALLBACK ();
313
-
314
- hwloc_bitmap_copy (set ,object -> cpuset );
315
- hwloc_bitmap_singlify (set ); /* we don't want the process to move */
316
- hwloc_err = hwloc_set_cpubind (opal_hwloc_topology ,set ,0 );
317
- if ( -1 == hwloc_err ) FALLBACK ();
318
- #ifdef __DEBUG__
319
- fprintf (stdout ,"Process not bound : binding on OBJ#%i \n" ,obj_rank );
320
- #endif
321
302
}
322
- } else { /* the processes are already bound */
323
- object = hwloc_get_obj_covering_cpuset (opal_hwloc_topology ,set );
324
- obj_rank = object -> logical_index ;
325
- effective_depth = object -> depth ;
326
- num_objs_in_node = hwloc_get_nbobjs_by_depth (opal_hwloc_topology , effective_depth );
303
+ }
327
304
328
- /* Check for oversubscribing */
329
- oversubscribing_objs = check_oversubscribing (rank ,num_nodes ,
330
- num_objs_in_node ,num_procs_in_node ,
331
- nodes_roots ,local_procs ,comm_old );
332
- if (oversubscribing_objs ) {
305
+ if ( !oversubscribing_objs && !oversubscribed_pus ) {
306
+ if ( hwloc_bitmap_isincluded (root_obj -> cpuset ,set ) ) { /* processes are not bound on the machine */
307
+ obj_rank = ompi_process_info .my_local_rank %num_objs_in_node ;
308
+ effective_depth = depth ;
309
+ object = hwloc_get_obj_by_depth (opal_hwloc_topology ,effective_depth ,obj_rank );
310
+ if ( NULL == object ) FALLBACK ();
311
+
312
+ hwloc_bitmap_copy (set ,object -> cpuset );
313
+ hwloc_bitmap_singlify (set ); /* we don't want the process to move */
314
+ hwloc_err = hwloc_set_cpubind (opal_hwloc_topology ,set ,0 );
315
+ if ( -1 == hwloc_err ) FALLBACK ();
316
+ #ifdef __DEBUG__
317
+ fprintf (stdout ,"Process not bound : binding on OBJ#%i \n" ,obj_rank );
318
+ #endif
319
+ } else {
333
320
#ifdef __DEBUG__
334
- fprintf (stdout ,"Oversubscribing OBJ/CORES resources => Rank Reordering Impossible\n" );
321
+ fprintf (stdout ,"Process %i bound on OBJ #%i \n" ,rank ,obj_rank );
322
+ fprintf (stdout ,"=====> Num obj in node : %i | num pus in node : %i\n" ,num_objs_in_node ,num_pus_in_node );
335
323
#endif
336
- FALLBACK ();
337
324
}
325
+ } else {
338
326
#ifdef __DEBUG__
339
- fprintf (stdout ,"Process %i bound on OBJ #%i \n" ,rank ,obj_rank );
340
- fprintf (stdout ,"=====> Num obj in node : %i | num pus in node : %i\n" ,num_objs_in_node ,num_pus_in_node );
327
+ fprintf (stdout ,"Oversubscribing PUs resources => Rank Reordering Impossible \n" );
341
328
#endif
329
+ FALLBACK ();
342
330
}
343
331
344
332
reqs = (MPI_Request * )calloc (num_procs_in_node - 1 ,sizeof (MPI_Request ));
@@ -493,7 +481,6 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
493
481
for (i = 1 ; i < num_nodes ; i ++ )
494
482
displs [i ] = displs [i - 1 ] + objs_per_node [i - 1 ];
495
483
496
- memset (reqs ,0 ,(num_nodes - 1 )* sizeof (MPI_Request ));
497
484
memcpy (obj_mapping ,obj_to_rank_in_comm ,objs_per_node [0 ]* sizeof (int ));
498
485
for (i = 1 ; i < num_nodes ; i ++ )
499
486
if (OMPI_SUCCESS != ( err = MCA_PML_CALL (irecv (obj_mapping + displs [i ], objs_per_node [i ], MPI_INT ,
0 commit comments