@@ -120,7 +120,7 @@ static void rollup(int status, orte_process_name_t* sender,
120
120
static opal_buffer_t * bucket , * mybucket = NULL ;
121
121
static int ncollected = 0 ;
122
122
123
- static char * orte_parent_uri ;
123
+ static char * orte_parent_uri = NULL ;
124
124
125
125
static struct {
126
126
bool debug ;
@@ -187,6 +187,10 @@ opal_cmd_line_init_t orte_cmd_line_opts[] = {
187
187
& orted_globals .set_sid , OPAL_CMD_LINE_TYPE_BOOL ,
188
188
"Direct the orted to separate from the current session" },
189
189
190
+ { NULL , '\0' , "tree-spawn" , "tree-spawn" , 0 ,
191
+ & orted_globals .tree_spawn , OPAL_CMD_LINE_TYPE_BOOL ,
192
+ "Tree-based spawn in progress" },
193
+
190
194
{ "tmpdir_base" , '\0' , NULL , "tmpdir" , 1 ,
191
195
NULL , OPAL_CMD_LINE_TYPE_STRING ,
192
196
"Set the root for the session directory tree" },
@@ -667,22 +671,19 @@ int orte_daemon(int argc, char *argv[])
667
671
MCA_BASE_VAR_SCOPE_CONSTANT ,
668
672
& orte_parent_uri );
669
673
if (NULL != orte_parent_uri ) {
670
- orte_process_name_t parent ;
671
674
opal_value_t val ;
672
675
673
676
/* set the contact info into our local database */
674
- ret = orte_rml_base_parse_uris (orte_parent_uri , & parent , NULL );
677
+ ret = orte_rml_base_parse_uris (orte_parent_uri , ORTE_PROC_MY_PARENT , NULL );
675
678
if (ORTE_SUCCESS != ret ) {
676
679
ORTE_ERROR_LOG (ret );
677
- free (orte_parent_uri );
678
- orte_parent_uri = NULL ;
679
680
goto DONE ;
680
681
}
681
682
OBJ_CONSTRUCT (& val , opal_value_t );
682
683
val .key = OPAL_PMIX_PROC_URI ;
683
684
val .type = OPAL_STRING ;
684
685
val .data .string = orte_parent_uri ;
685
- if (OPAL_SUCCESS != (ret = opal_pmix .store_local (& parent , & val ))) {
686
+ if (OPAL_SUCCESS != (ret = opal_pmix .store_local (ORTE_PROC_MY_PARENT , & val ))) {
686
687
ORTE_ERROR_LOG (ret );
687
688
OBJ_DESTRUCT (& val );
688
689
goto DONE ;
@@ -691,21 +692,22 @@ int orte_daemon(int argc, char *argv[])
691
692
val .data .string = NULL ;
692
693
OBJ_DESTRUCT (& val );
693
694
694
- /* don't need this value anymore */
695
- free (orte_parent_uri );
696
- orte_parent_uri = NULL ;
697
-
698
695
/* tell the routed module that we have a path
699
696
* back to the HNP
700
697
*/
701
- if (ORTE_SUCCESS != (ret = orte_routed .update_route (NULL , ORTE_PROC_MY_HNP , & parent ))) {
698
+ if (ORTE_SUCCESS != (ret = orte_routed .update_route (NULL , ORTE_PROC_MY_HNP , ORTE_PROC_MY_PARENT ))) {
699
+ ORTE_ERROR_LOG (ret );
700
+ goto DONE ;
701
+ }
702
+ /* and a path to our parent */
703
+ if (ORTE_SUCCESS != (ret = orte_routed .update_route (NULL , ORTE_PROC_MY_PARENT , ORTE_PROC_MY_PARENT ))) {
702
704
ORTE_ERROR_LOG (ret );
703
705
goto DONE ;
704
706
}
705
707
/* set the lifeline to point to our parent so that we
706
708
* can handle the situation if that lifeline goes away
707
709
*/
708
- if (ORTE_SUCCESS != (ret = orte_routed .set_lifeline (NULL , & parent ))) {
710
+ if (ORTE_SUCCESS != (ret = orte_routed .set_lifeline (NULL , ORTE_PROC_MY_PARENT ))) {
709
711
ORTE_ERROR_LOG (ret );
710
712
goto DONE ;
711
713
}
@@ -717,12 +719,15 @@ int orte_daemon(int argc, char *argv[])
717
719
*/
718
720
if (!ORTE_PROC_IS_HNP ) {
719
721
orte_process_name_t target ;
720
- target .jobid = ORTE_PROC_MY_NAME -> jobid ;
721
722
722
- if (orte_fwd_mpirun_port || orte_static_ports ) {
723
- /* setup the rollup callback */
724
- orte_rml .recv_buffer_nb (ORTE_NAME_WILDCARD , ORTE_RML_TAG_ORTED_CALLBACK ,
725
- ORTE_RML_PERSISTENT , rollup , NULL );
723
+ /* set up the rollup callback */
724
+ orte_rml .recv_buffer_nb (ORTE_NAME_WILDCARD , ORTE_RML_TAG_ORTED_CALLBACK ,
725
+ ORTE_RML_PERSISTENT , rollup , NULL );
726
+
727
+ /* define the target jobid */
728
+ target .jobid = ORTE_PROC_MY_NAME -> jobid ;
729
+ if (orte_fwd_mpirun_port || orte_static_ports || NULL != orte_parent_uri ) {
730
+ /* we start by sending to ourselves */
726
731
target .vpid = ORTE_PROC_MY_NAME -> vpid ;
727
732
/* since we will be waiting for any children to send us
728
733
* their rollup info before sending to our parent, save
@@ -789,7 +794,6 @@ int orte_daemon(int argc, char *argv[])
789
794
}
790
795
OPAL_LIST_RELEASE (modex );
791
796
} else {
792
- opal_output (0 , "VAL KEY: %s" , (NULL == val -> key ) ? "NULL" : val -> key );
793
797
/* single value */
794
798
flag = 1 ;
795
799
if (ORTE_SUCCESS != (ret = opal_dss .pack (buffer , & flag , 1 , OPAL_INT32 ))) {
@@ -965,6 +969,8 @@ int orte_daemon(int argc, char *argv[])
965
969
i += 2 ;
966
970
}
967
971
}
972
+ /* now launch any child daemons of ours */
973
+ orte_plm .remote_spawn (orte_tree_launch_cmd );
968
974
}
969
975
970
976
if (orte_debug_daemons_flag ) {
@@ -1053,8 +1059,6 @@ static void rollup(int status, orte_process_name_t* sender,
1053
1059
int32_t i , flag , cnt ;
1054
1060
opal_value_t * kv ;
1055
1061
1056
- /* xfer the contents of the rollup to our bucket */
1057
- opal_dss .copy_payload (bucket , buffer );
1058
1062
ncollected ++ ;
1059
1063
1060
1064
/* if the sender is ourselves, then we save that buffer
@@ -1064,6 +1068,8 @@ static void rollup(int status, orte_process_name_t* sender,
1064
1068
mybucket = OBJ_NEW (opal_buffer_t );
1065
1069
opal_dss .copy_payload (mybucket , buffer );
1066
1070
} else {
1071
+ /* xfer the contents of the rollup to our bucket */
1072
+ opal_dss .copy_payload (bucket , buffer );
1067
1073
/* the first entry in the bucket will be from our
1068
1074
* direct child - harvest it for connection info */
1069
1075
cnt = 1 ;
0 commit comments