Skip to content

Commit 681e9ff

Browse files
committed
orte: use a unique top_session_dir directory when possible.
Currently, top_session_dir is based on the hostname, uid, HNP pid and job family, so there is a risk that a top_session_dir already exists and contains stale data when a job starts, leading to undefined behavior. If the app is started via mpirun, use a unique top_session_dir created with mkdtemp("$TMP/ompi.<hostname>.<uid>/XXXXXX"), which is passed to the fork'ed MPI tasks via the OPAL_MCA_PREFIX"orte_top_session_dir" environment variable. If the app is direct-launched, the current behavior is unchanged. Direct-launch behavior will be enhanced once PMIx is able to pass a per-node directory (PMIX_NSDIR?) to a direct-launched task.
1 parent f3f8aa8 commit 681e9ff

File tree

2 files changed

+36
-2
lines changed

2 files changed

+36
-2
lines changed

orte/orted/orted_main.c

+5
Original file line numberDiff line numberDiff line change
@@ -615,6 +615,11 @@ int orte_daemon(int argc, char *argv[])
615615
opal_argv_append_nosize(&singenv, env_str);
616616
free(env_str);
617617

618+
/* append the top session dir to the envars needed by the singleton */
619+
asprintf(&env_str, OPAL_MCA_PREFIX"orte_top_session_dir=%s", orte_process_info.top_session_dir);
620+
opal_argv_append_nosize(&singenv, env_str);
621+
free(env_str);
622+
618623
nptr = opal_argv_join(singenv, ',');
619624
opal_argv_free(singenv);
620625
/* create a string that contains our uri + sysinfo + PMIx server URI envars */

orte/util/session_dir.c

+31-2
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
13-
* Copyright (c) 2015 Research Organization for Information Science
13+
* Copyright (c) 2015-2016 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
1515
* Copyright (c) 2015 Intel, Inc. All rights reserved.
1616
* $COPYRIGHT$
@@ -134,13 +134,24 @@ static int _setup_tmpdir_base(void)
134134
static int _setup_top_session_dir(void)
135135
{
136136
int rc = ORTE_SUCCESS;
137+
char *env;
137138
/* get the effective uid */
138139
uid_t uid = geteuid();
139140

141+
140142
/* construct the top_session_dir if we need */
141143
if (NULL == orte_process_info.top_session_dir) {
144+
env = getenv(OPAL_MCA_PREFIX"orte_top_session_dir");
145+
if (NULL != env) {
146+
orte_process_info.tmpdir_base = strdup(env);
147+
orte_process_info.top_session_dir = strdup(env);
148+
return ORTE_SUCCESS;
149+
}
150+
151+
assert(!ORTE_PROC_IS_APP || (NULL == getenv(OPAL_MCA_PREFIX"orte_launch")));
152+
142153
if (ORTE_SUCCESS != (rc = _setup_tmpdir_base())) {
143-
return rc;
154+
goto exit;
144155
}
145156
if( NULL == orte_process_info.nodename ||
146157
NULL == orte_process_info.tmpdir_base ){
@@ -156,6 +167,24 @@ static int _setup_top_session_dir(void)
156167
rc = ORTE_ERR_OUT_OF_RESOURCE;
157168
goto exit;
158169
}
170+
if (!ORTE_PROC_IS_APP) {
171+
char *dir = orte_process_info.top_session_dir;
172+
if (ORTE_SUCCESS != (rc = orte_create_dir(dir))) {
173+
goto exit;
174+
}
175+
if (0 > asprintf(&orte_process_info.top_session_dir, "%s/XXXXXX", dir)) {
176+
free(dir);
177+
orte_process_info.top_session_dir = NULL;
178+
rc = ORTE_ERR_OUT_OF_RESOURCE;
179+
goto exit;
180+
}
181+
free(dir);
182+
183+
if (NULL == mkdtemp(orte_process_info.top_session_dir)) {
184+
rc = ORTE_ERROR;
185+
goto exit;
186+
}
187+
}
159188
}
160189
exit:
161190
if( ORTE_SUCCESS != rc ){

0 commit comments

Comments
 (0)