Skip to content

orted/pmix: fix spawn in singleton mode #2084

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits on Sep 20, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 8 additions & 16 deletions orte/mca/ess/singleton/ess_singleton_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,7 @@ static int rte_init(void)
{
int rc, ret;
char *error = NULL;
char *envar, *ev1, *ev2;
uint64_t unique_key[2];
char *string_key;
char *ev1, *ev2;
opal_value_t *kv;
char *val;
int u32, *u32ptr;
Expand Down Expand Up @@ -265,19 +263,7 @@ static int rte_init(void)
* we can use the jobfam and stepid as unique keys
* because they are unique values assigned by the RM
*/
if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) {
unique_key[0] = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
unique_key[1] = ORTE_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid);
if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
asprintf(&envar, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key);
putenv(envar);
added_transport_keys = true;
/* cannot free the envar as that messes up our environ */
free(string_key);
}
assert (NULL != getenv(OPAL_MCA_PREFIX"orte_precondition_transports"));

/* retrieve our topology */
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_TOPO,
Expand Down Expand Up @@ -586,6 +572,12 @@ static int fork_hnp(void)
memset(orted_uri, 0, buffer_length);

while (chunk == (rc = read(p[0], &orted_uri[num_chars_read], chunk))) {
if (rc < 0 && (EAGAIN == errno || EINTR == errno)) {
continue;
} else {
num_chars_read = 0;
break;
}
/* we read an entire buffer - better get more */
num_chars_read += chunk;
orted_uri = realloc((void*)orted_uri, buffer_length+ORTE_URI_MSG_LGTH);
Expand Down
28 changes: 23 additions & 5 deletions orte/orted/orted_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
* et Automatique. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
Expand Down Expand Up @@ -60,6 +60,7 @@
#include "opal/util/os_path.h"
#include "opal/util/printf.h"
#include "opal/util/argv.h"
#include "opal/util/fd.h"
#include "opal/runtime/opal.h"
#include "opal/mca/base/mca_base_var.h"
#include "opal/util/daemon_init.h"
Expand All @@ -74,6 +75,7 @@
#include "orte/util/nidmap.h"
#include "orte/util/parse_options.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/util/pre_condition_transports.h"

#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
Expand Down Expand Up @@ -526,7 +528,7 @@ int orte_daemon(int argc, char *argv[])
orte_node_t *node;
orte_app_context_t *app;
char *tmp, *nptr, *sysinfo;
char **singenv=NULL;
char **singenv=NULL, *string_key, *env_str;

/* setup the singleton's job */
jdata = OBJ_NEW(orte_job_t);
Expand Down Expand Up @@ -587,16 +589,29 @@ int orte_daemon(int argc, char *argv[])
proc->app_idx = 0;
ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL);

/* set the ORTE_JOB_TRANSPORT_KEY from the environment */
orte_pre_condition_transports(jdata);

/* register the singleton's nspace with our PMIx server */
if (ORTE_SUCCESS != (ret = orte_pmix_server_register_nspace(jdata))) {
ORTE_ERROR_LOG(ret);
goto DONE;
}
/* use setup fork to create the envars needed by the singleton */
if (OPAL_SUCCESS != (ret = opal_pmix.server_setup_fork(&proc->name, &singenv))) {
ORTE_ERROR_LOG(ret);
goto DONE;
ORTE_ERROR_LOG(ret);
goto DONE;
}

/* append the transport key to the envars needed by the singleton */
if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_TRANSPORT_KEY, (void**)&string_key, OPAL_STRING) || NULL == string_key) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
goto DONE;
}
asprintf(&env_str, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key);
opal_argv_append_nosize(&singenv, env_str);
free(env_str);

nptr = opal_argv_join(singenv, ',');
opal_argv_free(singenv);
/* create a string that contains our uri + sysinfo + PMIx server URI envars */
Expand All @@ -606,7 +621,10 @@ int orte_daemon(int argc, char *argv[])
free(nptr);

/* pass that info to the singleton */
write(orted_globals.uri_pipe, tmp, strlen(tmp)+1); /* need to add 1 to get the NULL */
if (OPAL_SUCCESS != (ret = opal_fd_write(orted_globals.uri_pipe, strlen(tmp)+1, tmp))) { ; /* need to add 1 to get the NULL */
ORTE_ERROR_LOG(ret);
goto DONE;
}

/* cleanup */
free(tmp);
Expand Down