46
46
#include "orte/runtime/orte_globals.h"
47
47
#include "orte/mca/rml/rml.h"
48
48
49
- #include "pmix_server_internal.h"
49
+ #include "orte/orted/pmix/pmix_server.h"
50
+ #include "orte/orted/pmix/pmix_server_internal.h"
50
51
51
52
void pmix_server_launch_resp (int status , orte_process_name_t * sender ,
52
53
opal_buffer_t * buffer ,
@@ -327,6 +328,119 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor,
327
328
return OPAL_SUCCESS ;
328
329
}
329
330
331
+ static void _cnct (int sd , short args , void * cbdata );
332
+
333
+ static void _cnlk (int status , opal_list_t * data , void * cbdata )
334
+ {
335
+ orte_pmix_server_op_caddy_t * cd = (orte_pmix_server_op_caddy_t * )cbdata ;
336
+ int rc , cnt ;
337
+ opal_pmix_pdata_t * pdat ;
338
+ orte_job_t * jdata ;
339
+ opal_buffer_t buf ;
340
+
341
+ /* if we failed to get the required data, then just inform
342
+ * the embedded server that the connect cannot succeed */
343
+ if (ORTE_SUCCESS != status || NULL == data ) {
344
+ if (NULL != cd -> cbfunc ) {
345
+ rc = status ;
346
+ goto release ;
347
+ }
348
+ }
349
+
350
+ /* register the returned data with the embedded PMIx server */
351
+ pdat = (opal_pmix_pdata_t * )opal_list_get_first (data );
352
+ if (OPAL_BYTE_OBJECT != pdat -> value .type ) {
353
+ rc = ORTE_ERR_BAD_PARAM ;
354
+ goto release ;
355
+ }
356
+ /* the data will consist of a packed buffer with the job data in it */
357
+ OBJ_CONSTRUCT (& buf , opal_buffer_t );
358
+ opal_dss .load (& buf , pdat -> value .data .bo .bytes , pdat -> value .data .bo .size );
359
+ pdat -> value .data .bo .bytes = NULL ;
360
+ pdat -> value .data .bo .size = 0 ;
361
+ cnt = 1 ;
362
+ if (OPAL_SUCCESS != (rc = opal_dss .unpack (& buf , & jdata , & cnt , ORTE_JOB ))) {
363
+ OBJ_DESTRUCT (& buf );
364
+ goto release ;
365
+ }
366
+ OBJ_DESTRUCT (& buf );
367
+ if (ORTE_SUCCESS != (rc = orte_pmix_server_register_nspace (jdata ))) {
368
+ OBJ_RELEASE (jdata );
369
+ goto release ;
370
+ }
371
+ OBJ_RELEASE (jdata ); // no reason to keep this around
372
+
373
+ /* restart the cnct processor */
374
+ ORTE_PMIX_OPERATION (cd -> procs , cd -> info , _cnct , cd -> cbfunc , cd -> cbdata );
375
+ OBJ_RELEASE (cd );
376
+
377
+ release :
378
+ if (NULL != cd -> cbfunc ) {
379
+ cd -> cbfunc (rc , cd -> cbdata );
380
+ }
381
+ OBJ_RELEASE (cd );
382
+ }
383
+
384
+ static void _cnct (int sd , short args , void * cbdata )
385
+ {
386
+ orte_pmix_server_op_caddy_t * cd = (orte_pmix_server_op_caddy_t * )cbdata ;
387
+ orte_namelist_t * nm ;
388
+ char * * keys = NULL , * key ;
389
+ orte_job_t * jdata ;
390
+ int rc = ORTE_SUCCESS ;
391
+
392
+ /* at some point, we need to add bookeeping to track which
393
+ * procs are "connected" so we know who to notify upon
394
+ * termination or failure. For now, we have to ensure
395
+ * that we have registered all participating nspaces so
396
+ * the embedded PMIx server can provide them to the client.
397
+ * Otherwise, the client will receive an error as it won't
398
+ * be able to resolve any of the required data for the
399
+ * missing nspaces */
400
+
401
+ /* cycle thru the procs */
402
+ OPAL_LIST_FOREACH (nm , cd -> procs , orte_namelist_t ) {
403
+ /* see if we have the job object for this job */
404
+ if (NULL == (jdata = orte_get_job_data_object (nm -> name .jobid ))) {
405
+ /* we don't know about this job. If our "global" data
406
+ * server is just our HNP, then we have no way of finding
407
+ * out about it, and all we can do is return an error */
408
+ if (orte_pmix_server_globals .server .jobid == ORTE_PROC_MY_HNP -> jobid &&
409
+ orte_pmix_server_globals .server .vpid == ORTE_PROC_MY_HNP -> vpid ) {
410
+ rc = ORTE_ERR_NOT_SUPPORTED ;
411
+ goto release ;
412
+ }
413
+ /* ask the global data server for the data - if we get it,
414
+ * then we can complete the request */
415
+ key = opal_convert_jobid_to_string (nm -> name .jobid );
416
+ opal_argv_append_nosize (& keys , key );
417
+ free (key );
418
+ if (ORTE_SUCCESS != (rc = pmix_server_lookup_fn (& nm -> name , keys , cd -> info , _cnlk , cd ))) {
419
+ opal_argv_free (keys );
420
+ goto release ;
421
+ }
422
+ opal_argv_free (keys );
423
+ /* the callback function on this lookup will return us to this
424
+ * routine so we can continue the process */
425
+ return ;
426
+ }
427
+ /* we know about the job - check to ensure it has been
428
+ * registered with the local PMIx server */
429
+ if (!orte_get_attribute (& jdata -> attributes , ORTE_JOB_NSPACE_REGISTERED , NULL , OPAL_BOOL )) {
430
+ /* it hasn't been registered yet, so register it now */
431
+ if (ORTE_SUCCESS != (rc = orte_pmix_server_register_nspace (jdata ))) {
432
+ goto release ;
433
+ }
434
+ }
435
+ }
436
+
437
+ release :
438
+ if (NULL != cd -> cbfunc ) {
439
+ cd -> cbfunc (rc , cd -> cbdata );
440
+ }
441
+ OBJ_RELEASE (cd );
442
+ }
443
+
330
444
int pmix_server_connect_fn (opal_list_t * procs , opal_list_t * info ,
331
445
opal_pmix_op_cbfunc_t cbfunc , void * cbdata )
332
446
{
@@ -335,26 +449,52 @@ int pmix_server_connect_fn(opal_list_t *procs, opal_list_t *info,
335
449
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
336
450
(int )opal_list_get_size (procs ));
337
451
338
- /* for now, just ack the call */
339
- if (NULL != cbfunc ) {
340
- cbfunc ( OPAL_SUCCESS , cbdata ) ;
452
+ /* protect ourselves */
453
+ if (NULL == procs || 0 == opal_list_get_size ( procs ) ) {
454
+ return ORTE_ERR_BAD_PARAM ;
341
455
}
456
+ /* must thread shift this as we will be accessing global data */
457
+ ORTE_PMIX_OPERATION (procs , info , _cnct , cbfunc , cbdata );
458
+ return ORTE_SUCCESS ;
459
+ }
342
460
343
- return OPAL_SUCCESS ;
461
+ static void mdxcbfunc (int status ,
462
+ const char * data , size_t ndata , void * cbdata ,
463
+ opal_pmix_release_cbfunc_t relcbfunc , void * relcbdata )
464
+ {
465
+ orte_pmix_server_op_caddy_t * cd = (orte_pmix_server_op_caddy_t * )cbdata ;
466
+
467
+ /* ack the call */
468
+ if (NULL != cd -> cbfunc ) {
469
+ cd -> cbfunc (status , cd -> cbdata );
470
+ }
471
+ OBJ_RELEASE (cd );
344
472
}
345
473
346
474
int pmix_server_disconnect_fn (opal_list_t * procs , opal_list_t * info ,
347
475
opal_pmix_op_cbfunc_t cbfunc , void * cbdata )
348
476
{
477
+ orte_pmix_server_op_caddy_t * cd ;
478
+ int rc ;
479
+
349
480
opal_output_verbose (2 , orte_pmix_server_globals .output ,
350
481
"%s disconnect called with %d procs" ,
351
482
ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
352
483
(int )opal_list_get_size (procs ));
353
484
354
- /* for now, just ack the call */
355
- if (NULL != cbfunc ) {
356
- cbfunc (OPAL_SUCCESS , cbdata );
485
+ /* at some point, we need to add bookeeping to track which
486
+ * procs are "connected" so we know who to notify upon
487
+ * termination or failure. For now, just execute a fence
488
+ * Note that we do not need to thread-shift here as the
489
+ * fence function will do it for us */
490
+ cd = OBJ_NEW (orte_pmix_server_op_caddy_t );
491
+ cd -> cbfunc = cbfunc ;
492
+ cd -> cbdata = cbdata ;
493
+
494
+ if (ORTE_SUCCESS != (rc = pmix_server_fencenb_fn (procs , info , NULL , 0 ,
495
+ mdxcbfunc , cd ))) {
496
+ OBJ_RELEASE (cd );
357
497
}
358
498
359
- return OPAL_SUCCESS ;
499
+ return rc ;
360
500
}
0 commit comments