@@ -114,7 +114,16 @@ static int orte_create_dir(char *directory)
114
114
115
115
/*
116
116
* Construct the fullpath to the session directory - it
117
- * will consist of "ompi.<hostname>.<pid>"
117
+ * will consist of "ompi.<hostname>.<effective-uid>", and
118
+ * have subdirs:
119
+ *
120
+ * pid - the pid of the mpirun that oversees this job. Note
121
+ * that direct-launched processes will have manufactured
122
+ * this value.
123
+ *
124
+ * jobid - jobid of the application being executed
125
+ *
126
+ * vpid - vpid of the process
118
127
*/
119
128
int
120
129
orte_session_dir_get_name (char * * fulldirpath ,
@@ -132,10 +141,14 @@ orte_session_dir_get_name(char **fulldirpath,
132
141
bool prefix_provided = false;
133
142
int exit_status = ORTE_SUCCESS ;
134
143
size_t len ;
144
+ uid_t uid ;
135
145
136
146
/* Ensure that system info is set */
137
147
orte_proc_info ();
138
148
149
+ /* get the effective uid */
150
+ uid = geteuid ();
151
+
139
152
/*
140
153
* set the 'hostname'
141
154
*/
@@ -156,30 +169,48 @@ orte_session_dir_get_name(char **fulldirpath,
156
169
/* construct the frontend of the session directory */
157
170
if (NULL != orte_process_info .top_session_dir ) {
158
171
frontend = strdup (orte_process_info .top_session_dir );
172
+ } else { /* If not set then construct it */
173
+ if (0 > asprintf (& frontend , "ompi.%s.%lu" , hostname , (unsigned long )uid )) {
174
+ ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
175
+ exit_status = ORTE_ERR_OUT_OF_RESOURCE ;
176
+ goto cleanup ;
177
+ }
159
178
}
160
- else { /* If not set then construct it */
161
- if (0 > asprintf (& frontend , "ompi.%s.%lu" , hostname , (unsigned long )orte_process_info .pid )) {
179
+
180
+ /* construct the next level down, which belongs to the
181
+ * job family. This is related to the mpirun that launched
182
+ * the job, or is an arbitrary (agreed upon) value if
183
+ * direct launched */
184
+ if (ORTE_PROC_IS_HNP ) {
185
+ if (0 > asprintf (& jobfam , "pid.%lu" , (unsigned long )orte_process_info .pid )) {
162
186
ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
163
187
exit_status = ORTE_ERR_OUT_OF_RESOURCE ;
164
188
goto cleanup ;
165
189
}
190
+ orte_process_info .jobfam_session_dir = strdup (jobfam );
191
+ } else if (NULL != orte_process_info .jobfam_session_dir ) {
192
+ /* we had a job family session dir passed down to us by mpirun */
193
+ jobfam = strdup (orte_process_info .jobfam_session_dir );
194
+ } else {
195
+ /* we were not given one, so define it */
196
+ if (NULL == proc ) {
197
+ jobfam = strdup ("jobfam" );
198
+ } else {
199
+ if (0 > asprintf (& jobfam , "jf.%d" , ORTE_JOB_FAMILY (proc -> jobid ))) {
200
+ ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
201
+ exit_status = ORTE_ERR_OUT_OF_RESOURCE ;
202
+ goto cleanup ;
203
+ }
204
+ }
205
+ orte_process_info .jobfam_session_dir = strdup (jobfam );
166
206
}
167
207
168
208
/*
169
209
* Construct the session directory
170
210
*/
171
- /* If we were given a valid vpid then we can construct it fully into:
172
- * openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID/VPID
173
- */
211
+ /* If we were given a valid vpid then we can construct it fully */
174
212
if ( NULL != proc ) {
175
213
if (ORTE_VPID_INVALID != proc -> vpid ) {
176
-
177
- if (0 > asprintf (& jobfam , "%d" , ORTE_JOB_FAMILY (proc -> jobid ))) {
178
- ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
179
- exit_status = ORTE_ERR_OUT_OF_RESOURCE ;
180
- goto cleanup ;
181
- }
182
-
183
214
if (0 > asprintf (& job , "%d" , ORTE_LOCAL_JOBID (proc -> jobid ))) {
184
215
ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
185
216
exit_status = ORTE_ERR_OUT_OF_RESOURCE ;
@@ -192,23 +223,13 @@ orte_session_dir_get_name(char **fulldirpath,
192
223
goto cleanup ;
193
224
}
194
225
195
- sessions = opal_os_path ( false, frontend , jobfam , job , vpidstr , NULL );
226
+ sessions = opal_os_path (false, frontend , jobfam , job , vpidstr , NULL );
196
227
if ( NULL == sessions ) {
197
228
ORTE_ERROR_LOG (ORTE_ERROR );
198
229
exit_status = ORTE_ERROR ;
199
230
goto cleanup ;
200
231
}
201
- }
202
- /* If we were given a valid jobid then we can construct it partially into:
203
- * openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID
204
- */
205
- else if (ORTE_JOBID_INVALID != proc -> jobid ) {
206
- if (0 > asprintf (& jobfam , "%d" , ORTE_JOB_FAMILY (proc -> jobid ))) {
207
- ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
208
- exit_status = ORTE_ERR_OUT_OF_RESOURCE ;
209
- goto cleanup ;
210
- }
211
-
232
+ } else if (ORTE_JOBID_INVALID != proc -> jobid ) {
212
233
if (0 > asprintf (& job , "%d" , ORTE_LOCAL_JOBID (proc -> jobid ))) {
213
234
ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
214
235
exit_status = ORTE_ERR_OUT_OF_RESOURCE ;
@@ -221,14 +242,12 @@ orte_session_dir_get_name(char **fulldirpath,
221
242
exit_status = ORTE_ERROR ;
222
243
goto cleanup ;
223
244
}
224
- } /* if both are invalid */
225
- else {
245
+ } else {
226
246
sessions = strdup (frontend ); /* must dup this to avoid double-free later */
227
247
}
228
248
229
- } /* If we were not given a proc at all, then we just set it to frontend
230
- */
231
- else {
249
+ } else {
250
+ /* If we were not given a proc at all, then we just set it to frontend */
232
251
sessions = strdup (frontend ); /* must dup this to avoid double-free later */
233
252
}
234
253
@@ -666,14 +685,8 @@ static char *orte_build_job_session_dir(char *top_dir,
666
685
orte_process_name_t * proc ,
667
686
orte_jobid_t jobid )
668
687
{
669
- char * jobfam = NULL ;
670
688
char * job_session_dir ;
671
689
672
- if (0 > asprintf (& jobfam , "%d" , ORTE_JOB_FAMILY (proc -> jobid ))) {
673
- ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
674
- return NULL ;
675
- }
676
-
677
690
if (ORTE_JOBID_WILDCARD != jobid ) {
678
691
char * job = NULL ;
679
692
@@ -682,19 +695,18 @@ static char *orte_build_job_session_dir(char *top_dir,
682
695
job_session_dir = NULL ;
683
696
goto out ;
684
697
}
685
- job_session_dir = opal_os_path (false, top_dir , jobfam , job , NULL );
698
+ job_session_dir = opal_os_path (false, top_dir , orte_process_info . jobfam_session_dir , job , NULL );
686
699
free (job );
687
700
if (NULL == job_session_dir ) {
688
701
ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
689
702
}
690
703
} else {
691
- job_session_dir = opal_os_path (false, top_dir , jobfam , NULL );
704
+ job_session_dir = opal_os_path (false, top_dir , orte_process_info . jobfam_session_dir , NULL );
692
705
if ( NULL == job_session_dir ) {
693
706
ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
694
707
}
695
708
}
696
709
697
710
out :
698
- free (jobfam );
699
711
return job_session_dir ;
700
712
}
0 commit comments