Skip to content

Commit 518dc6a

Browse files
committed
Stop multiple invocations of debugger-release
A tool can receive two copies of a "debugger-release" event: one directly from the source and another from the server to which it is connected. This can create a race condition that thread-locks the tool. Add a new PMIX_EVENT_ONESHOT attribute by which a process can request that an event handler be atomically deregistered once handling of that event is complete. Thus, even if a second copy of the event is received, the event notification system will ignore it unless the process re-registers a handler for it.

Signed-off-by: Ralph Castain <[email protected]>
1 parent 7b741c4 commit 518dc6a

File tree

7 files changed

+141
-119
lines changed

7 files changed

+141
-119
lines changed

include/pmix_common.h.in

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -432,7 +432,8 @@ typedef uint32_t pmix_rank_t;
432432
#define PMIX_EVENT_TEXT_MESSAGE "pmix.evtext" // (char*) text message suitable for output by recipient - e.g., describing
433433
// the cause of the event
434434
#define PMIX_EVENT_TIMESTAMP "pmix.evtstamp" // (time_t) System time when the associated event occurred.
435-
435+
#define PMIX_EVENT_ONESHOT "pmix.evone" // (bool) when registering, indicate that this event handler is to be deleted
436+
// after being invoked
436437

437438
/* fault tolerance-related events */
438439
#define PMIX_EVENT_TERMINATE_SESSION "pmix.evterm.sess" // (bool) RM intends to terminate session

src/client/pmix_client.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -559,7 +559,7 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, pmix_info_t info[], size_
559559
pmix_cmd_t cmd = PMIX_REQ_CMD;
560560
pmix_status_t code;
561561
pmix_proc_t wildcard;
562-
pmix_info_t ginfo, evinfo[2];
562+
pmix_info_t ginfo, evinfo[3];
563563
pmix_value_t *val = NULL;
564564
pmix_lock_t reglock, releaselock;
565565
size_t n;
@@ -884,11 +884,12 @@ PMIX_EXPORT pmix_status_t PMIx_Init(pmix_proc_t *proc, pmix_info_t info[], size_
884884
PMIX_CONSTRUCT_LOCK(&releaselock);
885885
PMIX_INFO_LOAD(&evinfo[0], PMIX_EVENT_RETURN_OBJECT, &releaselock, PMIX_POINTER);
886886
PMIX_INFO_LOAD(&evinfo[1], PMIX_EVENT_HDLR_NAME, "WAIT-FOR-DEBUGGER", PMIX_STRING);
887+
PMIX_INFO_LOAD(&evinfo[2], PMIX_EVENT_ONESHOT, NULL, PMIX_BOOL);
887888
pmix_output_verbose(2, pmix_client_globals.event_output,
888889
"[%s:%d] REGISTERING WAIT FOR DEBUGGER", pmix_globals.myid.nspace,
889890
pmix_globals.myid.rank);
890891
code = PMIX_DEBUGGER_RELEASE;
891-
PMIx_Register_event_handler(&code, 1, evinfo, 2, notification_fn, evhandler_reg_callbk,
892+
PMIx_Register_event_handler(&code, 1, evinfo, 3, notification_fn, evhandler_reg_callbk,
892893
(void *) &reglock);
893894
/* wait for registration to complete */
894895
PMIX_WAIT_THREAD(&reglock);

src/event/pmix_event.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
#include "pmix_common.h"
3131
#include "src/class/pmix_list.h"
32+
#include "src/mca/bfrops/bfrops_types.h"
3233
#include "src/util/pmix_output.h"
3334

3435
BEGIN_C_DECLS
@@ -70,6 +71,7 @@ typedef struct {
7071
char *name;
7172
size_t index;
7273
uint8_t precedence;
74+
bool oneshot;
7375
char *locator;
7476
pmix_proc_t source; // who generated this event
7577
/* When registering for events, callers can specify
@@ -206,6 +208,9 @@ PMIX_EXPORT bool pmix_notify_check_range(pmix_range_trkr_t *rng, const pmix_proc
206208
PMIX_EXPORT bool pmix_notify_check_affected(pmix_proc_t *interested, size_t ninterested,
207209
pmix_proc_t *affected, size_t naffected);
208210

211+
PMIX_EXPORT pmix_status_t pmix_deregister_event_hdlr(size_t event_hdlr_ref,
212+
pmix_buffer_t *msg);
213+
209214
/* invoke the server event notification handler */
210215
PMIX_EXPORT pmix_status_t pmix_server_notify_client_of_event(pmix_status_t status,
211216
const pmix_proc_t *source,

src/event/pmix_event_notification.c

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ PMIX_EXPORT pmix_status_t PMIx_Notify_event(pmix_status_t status, const pmix_pro
5151
return PMIX_ERR_INIT;
5252
}
5353

54-
if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) || PMIX_PEER_IS_TOOL(pmix_globals.mypeer)) {
54+
if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) ||
55+
PMIX_PEER_IS_TOOL(pmix_globals.mypeer)) {
5556
PMIX_RELEASE_THREAD(&pmix_global_lock);
5657

5758
pmix_output_verbose(2, pmix_server_globals.event_output,
@@ -273,8 +274,10 @@ static pmix_status_t notify_server_of_event(pmix_status_t status, const pmix_pro
273274

274275
if (PMIX_RANGE_PROC_LOCAL != range && NULL != msg) {
275276
/* if this is a "lost-connection" event, then there is no
276-
* server to pass it to! */
277-
if (PMIX_ERR_LOST_CONNECTION == status) {
277+
* server to pass it to! Likewise, we don't pass it to
278+
* ourselves */
279+
if (PMIX_ERR_LOST_CONNECTION == status ||
280+
pmix_globals.mypeer == pmix_client_globals.myserver) {
278281
PMIX_RELEASE(msg);
279282
goto local;
280283
}
@@ -391,11 +394,15 @@ static void cycle_events(int sd, short args, void *cbdata)
391394

392395
/* if the caller indicates that the chain is completed,
393396
* or we completed the "last" event */
394-
if (PMIX_EVENT_ACTION_COMPLETE == chain->interim_status
395-
|| PMIX_EVENT_ORDER_LAST_OVERALL == chain->evhdlr->precedence || chain->endchain) {
397+
if (PMIX_EVENT_ACTION_COMPLETE == chain->interim_status ||
398+
PMIX_EVENT_ORDER_LAST_OVERALL == chain->evhdlr->precedence || chain->endchain) {
396399
if (PMIX_EVENT_ACTION_COMPLETE == chain->interim_status) {
397400
chain->interim_status = PMIX_SUCCESS;
398401
}
402+
if (chain->evhdlr->oneshot) {
403+
/* remove this handler */
404+
pmix_deregister_event_hdlr(chain->evhdlr->index, NULL);
405+
}
399406
goto complete;
400407
}
401408
item = NULL;
@@ -409,8 +416,7 @@ static void cycle_events(int sd, short args, void *cbdata)
409416
} else {
410417
item = &chain->evhdlr->super;
411418
}
412-
while (pmix_list_get_end(&pmix_globals.events.single_events)
413-
!= (item = pmix_list_get_next(item))) {
419+
while (pmix_list_get_end(&pmix_globals.events.single_events) != (item = pmix_list_get_next(item))) {
414420
nxt = (pmix_event_hdlr_t *) item;
415421
if (nxt->codes[0] == chain->status && pmix_notify_check_range(&nxt->rng, &chain->source)
416422
&& pmix_notify_check_affected(nxt->affected, nxt->naffected, chain->affected,
@@ -499,8 +505,7 @@ static void cycle_events(int sd, short args, void *cbdata)
499505
} else if (NULL == item) {
500506
item = &chain->evhdlr->super;
501507
}
502-
if (pmix_list_get_end(&pmix_globals.events.default_events)
503-
!= (item = pmix_list_get_next(item))) {
508+
if (pmix_list_get_end(&pmix_globals.events.default_events) != (item = pmix_list_get_next(item))) {
504509
nxt = (pmix_event_hdlr_t *) item;
505510
/* if this event handler provided a range, check to see if
506511
* the source fits within it */
@@ -687,7 +692,8 @@ void pmix_invoke_local_event_hdlr(pmix_event_chain_t *chain)
687692
}
688693
}
689694
if (!found) {
690-
pmix_output_verbose(8, pmix_client_globals.event_output, "%s %s:%d",
695+
pmix_output_verbose(8, pmix_client_globals.event_output,
696+
"%s Ignoring event %s:%d",
691697
PMIX_NAME_PRINT(&pmix_globals.myid), __FILE__, __LINE__);
692698
goto complete;
693699
}
@@ -1089,6 +1095,10 @@ static void _notify_client_event(int sd, short args, void *cbdata)
10891095
if (PMIX_CHECK_PROCID(&cd->source, &pr->peer->info->pname)) {
10901096
continue;
10911097
}
1098+
/* don't notify ourselves - we handle this internally */
1099+
if (PMIX_CHECK_PROCID(&pmix_globals.myid, &pr->peer->info->pname)) {
1100+
continue;
1101+
}
10921102
/* if we have already notified this client, then don't do it again */
10931103
matched = false;
10941104
PMIX_LIST_FOREACH (nm, &trk, pmix_namelist_t) {
@@ -1353,8 +1363,10 @@ void pmix_event_timeout_cb(int fd, short flags, void *arg)
13531363
PMIX_RETAIN(ch);
13541364

13551365
/* process this event thru the regular channels */
1356-
if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) && !PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) {
1357-
pmix_server_notify_client_of_event(ch->status, &ch->source, ch->range, ch->info, ch->ninfo,
1366+
if (PMIX_PEER_IS_SERVER(pmix_globals.mypeer) &&
1367+
!PMIX_PEER_IS_LAUNCHER(pmix_globals.mypeer)) {
1368+
pmix_server_notify_client_of_event(ch->status, &ch->source, ch->range,
1369+
ch->info, ch->ninfo,
13581370
ch->final_cbfunc, ch->final_cbdata);
13591371
} else {
13601372
pmix_invoke_local_event_hdlr(ch);
@@ -1427,6 +1439,7 @@ static void sevcon(pmix_event_hdlr_t *p)
14271439
p->name = NULL;
14281440
p->index = UINT_MAX;
14291441
p->precedence = PMIX_EVENT_ORDER_NONE;
1442+
p->oneshot = false;
14301443
p->locator = NULL;
14311444
p->rng.range = PMIX_RANGE_UNDEF;
14321445
p->rng.procs = NULL;

0 commit comments

Comments
 (0)