Skip to content

Commit 0fcc996

Browse files
author
Ralph Castain
authored
Merge pull request #4532 from rhc54/topic/odls
Try adding local spawn threads by default to parallelize the fork/exec process.
2 parents b310add + 335fc96 commit 0fcc996

File tree

12 files changed

+250
-127
lines changed

12 files changed

+250
-127
lines changed

orte/mca/errmgr/default_orted/errmgr_default_orted.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
#include "orte/mca/ess/ess.h"
4747
#include "orte/mca/state/state.h"
4848

49+
#include "orte/runtime/orte_wait.h"
4950
#include "orte/runtime/orte_quit.h"
5051
#include "orte/runtime/orte_globals.h"
5152
#include "orte/runtime/data_type_support/orte_dt_support.h"
@@ -327,6 +328,7 @@ static void proc_errors(int fd, short args, void *cbdata)
327328
orte_plm_cmd_flag_t cmd;
328329
int rc=ORTE_SUCCESS;
329330
int i;
331+
orte_wait_tracker_t *t2;
330332

331333
ORTE_ACQUIRE_OBJECT(caddy);
332334

@@ -412,7 +414,14 @@ static void proc_errors(int fd, short args, void *cbdata)
412414
goto cleanup;
413415
}
414416
/* leave the exit code alone - process this as a waitpid */
415-
ompi_odls_base_default_wait_local_proc(child, NULL);
417+
t2 = OBJ_NEW(orte_wait_tracker_t);
418+
OBJ_RETAIN(child); // protect against race conditions
419+
t2->child = child;
420+
t2->evb = orte_event_base;
421+
opal_event_set(t2->evb, &t2->ev, -1,
422+
OPAL_EV_WRITE, orte_odls_base_default_wait_local_proc, t2);
423+
opal_event_set_priority(&t2->ev, ORTE_MSG_PRI);
424+
opal_event_active(&t2->ev, OPAL_EV_WRITE, 1);
416425
goto cleanup;
417426
}
418427
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,

orte/mca/odls/base/base.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
14+
* Copyright (c) 2017 Intel, Inc. All rights reserved.
1415
* $COPYRIGHT$
1516
*
1617
* Additional copyrights may follow
@@ -44,5 +45,9 @@ ORTE_DECLSPEC extern mca_base_framework_t orte_odls_base_framework;
4445
*/
4546
ORTE_DECLSPEC int orte_odls_base_select(void);
4647

48+
ORTE_DECLSPEC void orte_odls_base_start_threads(orte_job_t *jdata);
49+
50+
ORTE_DECLSPEC void orte_odls_base_harvest_threads(void);
51+
4752
END_C_DECLS
4853
#endif

orte/mca/odls/base/odls_base_default_fns.c

Lines changed: 67 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
* All rights reserved.
1616
* Copyright (c) 2011-2017 Cisco Systems, Inc. All rights reserved
1717
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
18-
* Copyright (c) 2014 Research Organization for Information Science
18+
* Copyright (c) 2014-2017 Research Organization for Information Science
1919
* and Technology (RIST). All rights reserved.
2020
* Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved.
2121
* Copyright (c) 2017 IBM Corporation. All rights reserved.
@@ -614,6 +614,9 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
614614
goto REPORT_ERROR;
615615
}
616616

617+
/* spin up the spawn threads */
618+
orte_odls_base_start_threads(jdata);
619+
617620
/* to save memory, purge the job map of all procs other than
618621
* our own - for daemons, this will completely release the
619622
* proc structures. For the HNP, the proc structs will
@@ -727,9 +730,6 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
727730
int rc, i;
728731
bool found;
729732
orte_proc_state_t state;
730-
char **argvptr;
731-
char *pathenv = NULL, *mpiexec_pathenv = NULL;
732-
char *full_search;
733733

734734
ORTE_ACQUIRE_OBJECT(cd);
735735

@@ -772,44 +772,6 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
772772
goto errorout;
773773
}
774774

775-
/* Search for the OMPI_exec_path and PATH settings in the environment. */
776-
for (argvptr = app->env; *argvptr != NULL; argvptr++) {
777-
if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) {
778-
mpiexec_pathenv = *argvptr + 15;
779-
}
780-
if (0 == strncmp("PATH=", *argvptr, 5)) {
781-
pathenv = *argvptr + 5;
782-
}
783-
}
784-
785-
/* If OMPI_exec_path is set (meaning --path was used), then create a
786-
temporary environment to be used in the search for the executable.
787-
The PATH setting in this temporary environment is a combination of
788-
the OMPI_exec_path and PATH values. If OMPI_exec_path is not set,
789-
then just use existing environment with PATH in it. */
790-
if (NULL != mpiexec_pathenv) {
791-
argvptr = NULL;
792-
if (pathenv != NULL) {
793-
asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv);
794-
} else {
795-
asprintf(&full_search, "%s", mpiexec_pathenv);
796-
}
797-
opal_setenv("PATH", full_search, true, &argvptr);
798-
free(full_search);
799-
} else {
800-
argvptr = app->env;
801-
}
802-
803-
rc = orte_util_check_context_app(app, argvptr);
804-
/* do not ERROR_LOG - it will be reported elsewhere */
805-
if (NULL != mpiexec_pathenv) {
806-
opal_argv_free(argvptr);
807-
}
808-
if (ORTE_SUCCESS != rc) {
809-
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
810-
goto errorout;
811-
}
812-
813775
/* did the user request we display output in xterms? */
814776
if (NULL != orte_xterm && !ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
815777
opal_list_item_t *nmitem;
@@ -878,15 +840,14 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
878840
cd->argv[0] = param;
879841
}
880842

881-
if (5 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
882-
opal_output(orte_odls_base_framework.framework_output, "%s odls:launch spawning child %s",
883-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
884-
ORTE_NAME_PRINT(&child->name));
843+
opal_output_verbose(5, orte_odls_base_framework.framework_output,
844+
"%s odls:launch spawning child %s",
845+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
846+
ORTE_NAME_PRINT(&child->name));
885847

848+
if (15 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
886849
/* dump what is going to be exec'd */
887-
if (7 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
888-
opal_dss.dump(orte_odls_base_framework.framework_output, app, ORTE_APP_CONTEXT);
889-
}
850+
opal_dss.dump(orte_odls_base_framework.framework_output, app, ORTE_APP_CONTEXT);
890851
}
891852

892853
if (ORTE_SUCCESS != (rc = cd->fork_local(cd))) {
@@ -923,6 +884,9 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
923884
orte_odls_spawn_caddy_t *cd;
924885
opal_event_base_t *evb;
925886
char *effective_dir = NULL;
887+
char **argvptr;
888+
char *pathenv = NULL, *mpiexec_pathenv = NULL;
889+
char *full_search;
926890

927891
ORTE_ACQUIRE_OBJECT(caddy);
928892

@@ -1105,6 +1069,44 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
11051069
goto GETOUT;
11061070
}
11071071

1072+
/* Search for the OMPI_exec_path and PATH settings in the environment. */
1073+
for (argvptr = app->env; *argvptr != NULL; argvptr++) {
1074+
if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) {
1075+
mpiexec_pathenv = *argvptr + 15;
1076+
}
1077+
if (0 == strncmp("PATH=", *argvptr, 5)) {
1078+
pathenv = *argvptr + 5;
1079+
}
1080+
}
1081+
1082+
/* If OMPI_exec_path is set (meaning --path was used), then create a
1083+
temporary environment to be used in the search for the executable.
1084+
The PATH setting in this temporary environment is a combination of
1085+
the OMPI_exec_path and PATH values. If OMPI_exec_path is not set,
1086+
then just use existing environment with PATH in it. */
1087+
if (NULL != mpiexec_pathenv) {
1088+
argvptr = NULL;
1089+
if (pathenv != NULL) {
1090+
asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv);
1091+
} else {
1092+
asprintf(&full_search, "%s", mpiexec_pathenv);
1093+
}
1094+
opal_setenv("PATH", full_search, true, &argvptr);
1095+
free(full_search);
1096+
} else {
1097+
argvptr = app->env;
1098+
}
1099+
1100+
rc = orte_util_check_context_app(app, argvptr);
1101+
/* do not ERROR_LOG - it will be reported elsewhere */
1102+
if (NULL != mpiexec_pathenv) {
1103+
opal_argv_free(argvptr);
1104+
}
1105+
if (ORTE_SUCCESS != rc) {
1106+
goto GETOUT;
1107+
}
1108+
1109+
11081110
/* tell all children that they are being launched via ORTE */
11091111
opal_setenv(OPAL_MCA_PREFIX"orte_launch", "1", true, &app->env);
11101112

@@ -1186,10 +1188,17 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
11861188
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
11871189
ORTE_NAME_PRINT(&child->name)));
11881190

1191+
/* determine the thread that will handle this child */
1192+
++orte_odls_globals.next_base;
1193+
if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
1194+
orte_odls_globals.next_base = 0;
1195+
}
1196+
evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
1197+
11891198
/* set the waitpid callback here for thread protection and
11901199
* to ensure we can capture the callback on shortlived apps */
11911200
ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
1192-
orte_wait_cb(child, ompi_odls_base_default_wait_local_proc, NULL);
1201+
orte_wait_cb(child, orte_odls_base_default_wait_local_proc, evb, NULL);
11931202

11941203
/* dispatch this child to the next available launch thread */
11951204
cd = OBJ_NEW(orte_odls_spawn_caddy_t);
@@ -1228,16 +1237,11 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
12281237
goto GETOUT;
12291238
}
12301239
}
1231-
++orte_odls_globals.next_base;
1232-
if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
1233-
orte_odls_globals.next_base = 0;
1234-
}
12351240
opal_output_verbose(1, orte_odls_base_framework.framework_output,
12361241
"%s odls:dispatch %s to thread %d",
12371242
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
12381243
ORTE_NAME_PRINT(&child->name),
12391244
orte_odls_globals.next_base);
1240-
evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
12411245
opal_event_set(evb, &cd->ev, -1,
12421246
OPAL_EV_WRITE, orte_odls_base_spawn_proc, cd);
12431247
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
@@ -1255,11 +1259,6 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
12551259
free(effective_dir);
12561260
effective_dir = NULL;
12571261
}
1258-
/* tell the state machine that all local procs for this job
1259-
* were launched so that it can do whatever it needs to do,
1260-
* like send a state update message for all procs to the HNP
1261-
*/
1262-
ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE);
12631262

12641263
ERROR_OUT:
12651264
/* ensure we reset our working directory back to our default location */
@@ -1323,8 +1322,10 @@ int orte_odls_base_default_signal_local_procs(const orte_process_name_t *proc, i
13231322
* Wait for a callback indicating the child has completed.
13241323
*/
13251324

1326-
void ompi_odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
1325+
void orte_odls_base_default_wait_local_proc(int fd, short sd, void *cbdata)
13271326
{
1327+
orte_wait_tracker_t *t2 = (orte_wait_tracker_t*)cbdata;
1328+
orte_proc_t *proc = t2->child;
13281329
int i;
13291330
orte_job_t *jobdat;
13301331
orte_proc_state_t state=ORTE_PROC_STATE_WAITPID_FIRED;
@@ -1528,6 +1529,8 @@ void ompi_odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
15281529
/* cancel the wait as this proc has already terminated */
15291530
orte_wait_cb_cancel(proc);
15301531
ORTE_ACTIVATE_PROC_STATE(&proc->name, state);
1532+
/* cleanup the tracker */
1533+
OBJ_RELEASE(t2);
15311534
}
15321535

15331536
typedef struct {
@@ -1903,17 +1906,17 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child,
19031906
goto CLEANUP;
19041907
}
19051908
}
1906-
orte_wait_cb(child, ompi_odls_base_default_wait_local_proc, NULL);
1907-
19081909
++orte_odls_globals.next_base;
19091910
if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
19101911
orte_odls_globals.next_base = 0;
19111912
}
1913+
evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
1914+
orte_wait_cb(child, orte_odls_base_default_wait_local_proc, evb, NULL);
1915+
19121916
OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
19131917
"%s restarting app %s",
19141918
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app));
19151919

1916-
evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
19171920
opal_event_set(evb, &cd->ev, -1,
19181921
OPAL_EV_WRITE, orte_odls_base_spawn_proc, cd);
19191922
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);

0 commit comments

Comments
 (0)