Skip to content

Commit a863c26

Browse files
author
Ralph Castain
authored
Merge pull request #4628 from rhc54/topic/treespawn
Fix the tree-spawn-with-rollup
2 parents 3d80794 + 7a58f91 commit a863c26

File tree

3 files changed

+57
-37
lines changed

3 files changed

+57
-37
lines changed

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1383,14 +1383,6 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
13831383
if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
13841384
ORTE_ERROR_LOG(rc);
13851385
ORTE_ACTIVATE_JOB_STATE(jdatorted, ORTE_JOB_STATE_FAILED_TO_START);
1386-
} else if (NULL != orte_tree_launch_cmd) {
1387-
/* if a tree-launch is underway, send the cmd back */
1388-
relay = OBJ_NEW(opal_buffer_t);
1389-
opal_dss.copy_payload(relay, orte_tree_launch_cmd);
1390-
orte_rml.send_buffer_nb(orte_mgmt_conduit,
1391-
sender, relay,
1392-
ORTE_RML_TAG_DAEMON,
1393-
orte_rml_send_callback, NULL);
13941386
}
13951387
}
13961388

orte/mca/plm/rsh/plm_rsh_module.c

Lines changed: 31 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,7 @@
8888
#include "orte/mca/ess/base/base.h"
8989
#include "orte/mca/errmgr/errmgr.h"
9090
#include "orte/mca/grpcomm/base/base.h"
91+
#include "orte/mca/oob/base/base.h"
9192
#include "orte/mca/rmaps/rmaps.h"
9293
#include "orte/mca/routed/routed.h"
9394
#include "orte/mca/rml/base/rml_contact.h"
@@ -605,7 +606,6 @@ static int setup_launch(int *argcptr, char ***argvptr,
605606
(mca_plm_rsh_component.using_qrsh && mca_plm_rsh_component.daemonize_qrsh)) &&
606607
((!mca_plm_rsh_component.using_llspawn) ||
607608
(mca_plm_rsh_component.using_llspawn && mca_plm_rsh_component.daemonize_llspawn))) {
608-
opal_argv_append(&argc, &argv, "--daemonize");
609609
}
610610

611611
/*
@@ -617,9 +617,20 @@ static int setup_launch(int *argcptr, char ***argvptr,
617617
proc_vpid_index);
618618

619619
/* ensure that only the ssh plm is selected on the remote daemon */
620-
opal_argv_append_nosize(&argv, "-"OPAL_MCA_CMD_LINE_ID);
621-
opal_argv_append_nosize(&argv, "plm");
622-
opal_argv_append_nosize(&argv, "rsh");
620+
opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID);
621+
opal_argv_append(&argc, &argv, "plm");
622+
opal_argv_append(&argc, &argv, "rsh");
623+
624+
/* if we are tree-spawning, tell our child daemons the
625+
* uri of their parent (me) */
626+
if (!mca_plm_rsh_component.no_tree_spawn) {
627+
opal_argv_append(&argc, &argv, "--tree-spawn");
628+
orte_oob_base_get_addr(&param);
629+
opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID);
630+
opal_argv_append(&argc, &argv, "orte_parent_uri");
631+
opal_argv_append(&argc, &argv, param);
632+
free(param);
633+
}
623634

624635
/* unless told otherwise... */
625636
if (mca_plm_rsh_component.pass_environ_mca_params) {
@@ -795,11 +806,22 @@ static int remote_spawn(opal_buffer_t *launch)
795806
/* if we hit any errors, tell the HNP it was us */
796807
target.vpid = ORTE_PROC_MY_NAME->vpid;
797808

798-
/* extract the prefix from the launch buffer */
799-
n = 1;
800-
if (ORTE_SUCCESS != (rc = opal_dss.unpack(launch, &prefix, &n, OPAL_STRING))) {
801-
ORTE_ERROR_LOG(rc);
802-
goto cleanup;
809+
if (NULL != launch) {
810+
/* extract the prefix from the launch buffer */
811+
n = 1;
812+
if (ORTE_SUCCESS != (rc = opal_dss.unpack(launch, &prefix, &n, OPAL_STRING))) {
813+
ORTE_ERROR_LOG(rc);
814+
goto cleanup;
815+
}
816+
} else {
817+
/* check to see if enable-orterun-prefix-by-default was given - if
818+
* this is being done by a singleton, then orterun will not be there
819+
* to put the prefix in the app. So make sure we check to find it */
820+
if ((bool)ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT) {
821+
prefix = strdup(opal_install_dirs.prefix);
822+
} else {
823+
prefix = NULL;
824+
}
803825
}
804826

805827
/* get the updated routing list */

orte/orted/orted_main.c

Lines changed: 26 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -120,7 +120,7 @@ static void rollup(int status, orte_process_name_t* sender,
120120
static opal_buffer_t *bucket, *mybucket = NULL;
121121
static int ncollected = 0;
122122

123-
static char *orte_parent_uri;
123+
static char *orte_parent_uri = NULL;
124124

125125
static struct {
126126
bool debug;
@@ -187,6 +187,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
187187
&orted_globals.set_sid, OPAL_CMD_LINE_TYPE_BOOL,
188188
"Direct the orted to separate from the current session"},
189189

190+
{ NULL, '\0', "tree-spawn", "tree-spawn", 0,
191+
&orted_globals.tree_spawn, OPAL_CMD_LINE_TYPE_BOOL,
192+
"Tree-based spawn in progress" },
193+
190194
{ "tmpdir_base", '\0', NULL, "tmpdir", 1,
191195
NULL, OPAL_CMD_LINE_TYPE_STRING,
192196
"Set the root for the session directory tree" },
@@ -667,22 +671,19 @@ int orte_daemon(int argc, char *argv[])
667671
MCA_BASE_VAR_SCOPE_CONSTANT,
668672
&orte_parent_uri);
669673
if (NULL != orte_parent_uri) {
670-
orte_process_name_t parent;
671674
opal_value_t val;
672675

673676
/* set the contact info into our local database */
674-
ret = orte_rml_base_parse_uris(orte_parent_uri, &parent, NULL);
677+
ret = orte_rml_base_parse_uris(orte_parent_uri, ORTE_PROC_MY_PARENT, NULL);
675678
if (ORTE_SUCCESS != ret) {
676679
ORTE_ERROR_LOG(ret);
677-
free (orte_parent_uri);
678-
orte_parent_uri = NULL;
679680
goto DONE;
680681
}
681682
OBJ_CONSTRUCT(&val, opal_value_t);
682683
val.key = OPAL_PMIX_PROC_URI;
683684
val.type = OPAL_STRING;
684685
val.data.string = orte_parent_uri;
685-
if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&parent, &val))) {
686+
if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_PARENT, &val))) {
686687
ORTE_ERROR_LOG(ret);
687688
OBJ_DESTRUCT(&val);
688689
goto DONE;
@@ -691,21 +692,22 @@ int orte_daemon(int argc, char *argv[])
691692
val.data.string = NULL;
692693
OBJ_DESTRUCT(&val);
693694

694-
/* don't need this value anymore */
695-
free(orte_parent_uri);
696-
orte_parent_uri = NULL;
697-
698695
/* tell the routed module that we have a path
699696
* back to the HNP
700697
*/
701-
if (ORTE_SUCCESS != (ret = orte_routed.update_route(NULL, ORTE_PROC_MY_HNP, &parent))) {
698+
if (ORTE_SUCCESS != (ret = orte_routed.update_route(NULL, ORTE_PROC_MY_HNP, ORTE_PROC_MY_PARENT))) {
699+
ORTE_ERROR_LOG(ret);
700+
goto DONE;
701+
}
702+
/* and a path to our parent */
703+
if (ORTE_SUCCESS != (ret = orte_routed.update_route(NULL, ORTE_PROC_MY_PARENT, ORTE_PROC_MY_PARENT))) {
702704
ORTE_ERROR_LOG(ret);
703705
goto DONE;
704706
}
705707
/* set the lifeline to point to our parent so that we
706708
* can handle the situation if that lifeline goes away
707709
*/
708-
if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(NULL, &parent))) {
710+
if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(NULL, ORTE_PROC_MY_PARENT))) {
709711
ORTE_ERROR_LOG(ret);
710712
goto DONE;
711713
}
@@ -717,12 +719,15 @@ int orte_daemon(int argc, char *argv[])
717719
*/
718720
if (!ORTE_PROC_IS_HNP) {
719721
orte_process_name_t target;
720-
target.jobid = ORTE_PROC_MY_NAME->jobid;
721722

722-
if (orte_fwd_mpirun_port || orte_static_ports) {
723-
/* setup the rollup callback */
724-
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK,
725-
ORTE_RML_PERSISTENT, rollup, NULL);
723+
/* setup the rollup callback */
724+
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK,
725+
ORTE_RML_PERSISTENT, rollup, NULL);
726+
727+
/* define the target jobid */
728+
target.jobid = ORTE_PROC_MY_NAME->jobid;
729+
if (orte_fwd_mpirun_port || orte_static_ports || NULL != orte_parent_uri) {
730+
/* we start by sending to ourselves */
726731
target.vpid = ORTE_PROC_MY_NAME->vpid;
727732
/* since we will be waiting for any children to send us
728733
* their rollup info before sending to our parent, save
@@ -789,7 +794,6 @@ int orte_daemon(int argc, char *argv[])
789794
}
790795
OPAL_LIST_RELEASE(modex);
791796
} else {
792-
opal_output(0, "VAL KEY: %s", (NULL == val->key) ? "NULL" : val->key);
793797
/* single value */
794798
flag = 1;
795799
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
@@ -965,6 +969,8 @@ int orte_daemon(int argc, char *argv[])
965969
i += 2;
966970
}
967971
}
972+
/* now launch any child daemons of ours */
973+
orte_plm.remote_spawn(orte_tree_launch_cmd);
968974
}
969975

970976
if (orte_debug_daemons_flag) {
@@ -1053,8 +1059,6 @@ static void rollup(int status, orte_process_name_t* sender,
10531059
int32_t i, flag, cnt;
10541060
opal_value_t *kv;
10551061

1056-
/* xfer the contents of the rollup to our bucket */
1057-
opal_dss.copy_payload(bucket, buffer);
10581062
ncollected++;
10591063

10601064
/* if the sender is ourselves, then we save that buffer
@@ -1064,6 +1068,8 @@ static void rollup(int status, orte_process_name_t* sender,
10641068
mybucket = OBJ_NEW(opal_buffer_t);
10651069
opal_dss.copy_payload(mybucket, buffer);
10661070
} else {
1071+
/* xfer the contents of the rollup to our bucket */
1072+
opal_dss.copy_payload(bucket, buffer);
10671073
/* the first entry in the bucket will be from our
10681074
* direct child - harvest it for connection info */
10691075
cnt = 1;

0 commit comments

Comments
 (0)