Skip to content

Commit 34fc7dc

Browse files
author
Ralph Castain
committed
Transfer Anandhi's OFI RML component work to the OMPI tree
1 parent a55d574 commit 34fc7dc

22 files changed

+2895
-60
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,8 @@ orte/test/system/pmi_abort
450450
orte/test/system/opal_hwloc
451451
orte/test/system/opal_db
452452
orte/test/system/ulfm
453+
orte/test/system/ofi_query_test
454+
orte/test/system/ofi_stress
453455
orte/test/system/pmixtool
454456

455457
orte/tools/orte-checkpoint/orte-checkpoint

orte/include/orte/constants.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,8 @@ enum {
149149
ORTE_ERR_OUT_OF_ORDER_MSG = (ORTE_ERR_BASE - 55),
150150
ORTE_ERR_OPEN_CHANNEL_DUPLICATE = (ORTE_ERR_BASE - 56),
151151
ORTE_ERR_FORCE_SELECT = (ORTE_ERR_BASE - 57),
152-
ORTE_ERR_JOB_CANCELLED = (ORTE_ERR_BASE - 58)
152+
ORTE_ERR_JOB_CANCELLED = (ORTE_ERR_BASE - 58),
153+
ORTE_ERR_CONDUIT_SEND_FAIL = (ORTE_ERR_BASE - 59)
153154
};
154155

155156
#define ORTE_ERR_MAX (ORTE_ERR_BASE - 100)

orte/mca/ess/base/ess_base_std_orted.c

Lines changed: 46 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,26 @@ int orte_ess_base_orted_setup(char **hosts)
365365
/* obviously, we have "reported" */
366366
jdata->num_reported = 1;
367367

368+
369+
//[A]
370+
/* setup the PMIx framework - ensure it skips all non-PMIx components,
371+
* but do not override anything we were given */
372+
opal_setenv("OMPI_MCA_pmix", "^s1,s2,cray,isolated", false, &environ);
373+
if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
374+
ORTE_ERROR_LOG(ret);
375+
error = "orte_pmix_base_open";
376+
goto error;
377+
}
378+
379+
if (ORTE_SUCCESS != (ret = opal_pmix_base_select())) {
380+
ORTE_ERROR_LOG(ret);
381+
error = "opal_pmix_base_select";
382+
goto error;
383+
}
384+
/* set the event base */
385+
opal_pmix_base_set_evbase(orte_event_base);
386+
//[A]
387+
368388
/* Setup the communication infrastructure */
369389
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
370390
ORTE_ERROR_LOG(ret);
@@ -389,6 +409,13 @@ int orte_ess_base_orted_setup(char **hosts)
389409
/* add our contact info */
390410
proc->rml_uri = orte_rml.get_contact_info();
391411

412+
/* setup the PMIx server */
413+
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
414+
ORTE_ERROR_LOG(ret);
415+
error = "pmix server init";
416+
goto error;
417+
}
418+
392419
/* select the errmgr */
393420
if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
394421
ORTE_ERROR_LOG(ret);
@@ -499,26 +526,27 @@ int orte_ess_base_orted_setup(char **hosts)
499526

500527
/* setup the PMIx framework - ensure it skips all non-PMIx components,
501528
* but do not override anything we were given */
502-
opal_setenv("OMPI_MCA_pmix", "^s1,s2,cray,isolated", false, &environ);
503-
if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
504-
ORTE_ERROR_LOG(ret);
505-
error = "orte_pmix_base_open";
506-
goto error;
507-
}
508-
if (ORTE_SUCCESS != (ret = opal_pmix_base_select())) {
509-
ORTE_ERROR_LOG(ret);
510-
error = "opal_pmix_base_select";
511-
goto error;
512-
}
529+
//[A]
530+
//[A] opal_setenv("OMPI_MCA_pmix", "^s1,s2,cray,isolated", false, &environ);
531+
//[A] if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
532+
//[A] ORTE_ERROR_LOG(ret);
533+
//[A] error = "orte_pmix_base_open";
534+
//[A] goto error;
535+
//[A] }
536+
537+
//[A] if (ORTE_SUCCESS != (ret = opal_pmix_base_select())) {
538+
//[A]ORTE_ERROR_LOG(ret);
539+
//[A] error = "opal_pmix_base_select";
540+
//[A] goto error;
541+
//[A] }
513542
/* set the event base */
514-
opal_pmix_base_set_evbase(orte_event_base);
543+
//[A] opal_pmix_base_set_evbase(orte_event_base);
515544
/* setup the PMIx server */
516-
if (ORTE_SUCCESS != (ret = pmix_server_init())) {
517-
/* the server code already barked, so let's be quiet */
518-
ret = ORTE_ERR_SILENT;
519-
error = "pmix_server_init";
520-
goto error;
521-
}
545+
//[A] if (ORTE_SUCCESS != (ret = pmix_server_init())) {
546+
//[A] ORTE_ERROR_LOG(ret);
547+
//[A] error = "pmix server init";
548+
//[A] goto error;
549+
//[A] }
522550

523551
/* setup the routed info - the selected routed component
524552
* will know what to do.

orte/mca/ess/hnp/ess_hnp_module.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,8 @@ static int rte_init(void)
316316
}
317317
}
318318

319+
320+
319321
/* Setup the communication infrastructure */
320322
/*
321323
* OOB Layer
@@ -621,12 +623,12 @@ static int rte_init(void)
621623
goto error;
622624
}
623625
/* set the event base */
624-
opal_pmix_base_set_evbase(orte_event_base);
626+
opal_pmix_base_set_evbase(orte_event_base);
625627

626628
/* setup the routed info - the selected routed component
627629
* will know what to do.
628630
*/
629-
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
631+
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
630632
ORTE_ERROR_LOG(ret);
631633
error = "orte_routed.init_routes";
632634
goto error;

orte/mca/ess/pmi/ess_pmi_module.c

100644100755
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -423,7 +423,7 @@ static int rte_init(void)
423423
* in the job won't be executing this step, so we would hang
424424
*/
425425
if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
426-
opal_pmix.fence(NULL, 0);
426+
opal_pmix.fence(NULL, true);
427427
}
428428

429429
return ORTE_SUCCESS;

orte/mca/rml/base/base.h

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
*
2828
* RML Framework maintenance interface
2929
*
30-
* Interface for starting / stopping / controlling the RML framework,
30+
* Interface for starting / stopping / controlling the RML framework,
3131
* as well as support for modifying RML datatypes.
3232
*
3333
* @note The only RML datatype exposed to the user is the RML tag.
@@ -131,9 +131,11 @@ typedef struct {
131131
union {
132132
orte_rml_callback_fn_t iov;
133133
orte_rml_buffer_callback_fn_t buffer;
134+
/* for the conduits (ofi) */
135+
orte_rml_transport_callback_fn_t iov_transport;
136+
orte_rml_buffer_transport_callback_fn_t buf_transport;
134137
} cbfunc;
135-
void *cbdata;
136-
138+
void *cbdata;
137139
/* pointer to the user's iovec array */
138140
struct iovec *iov;
139141
int count;
@@ -153,6 +155,8 @@ typedef struct {
153155
opal_object_t super;
154156
opal_event_t ev;
155157
orte_rml_send_t send;
158+
/* conduit_id */
159+
uint8_t conduit_id;
156160
} orte_rml_send_request_t;
157161
OBJ_CLASS_DECLARATION(orte_rml_send_request_t);
158162

@@ -305,6 +309,18 @@ ORTE_DECLSPEC int orte_rml_API_ft_event(int state);
305309

306310
ORTE_DECLSPEC void orte_rml_API_purge(orte_process_name_t *peer);
307311

312+
ORTE_DECLSPEC int orte_rml_API_query_transports(opal_value_t **providers);
313+
314+
ORTE_DECLSPEC int orte_rml_API_send_transport_nb(int conduit_id,orte_process_name_t* peer, struct iovec* msg,
315+
int count, orte_rml_tag_t tag,
316+
orte_rml_callback_fn_t cbfunc, void* cbdata);
317+
ORTE_DECLSPEC int orte_rml_API_send_buffer_transport_nb(int conduit_id,
318+
orte_process_name_t* peer,
319+
struct opal_buffer_t* buffer,
320+
orte_rml_tag_t tag,
321+
orte_rml_buffer_callback_fn_t cbfunc,
322+
void* cbdata);
323+
308324
END_C_DECLS
309325

310326
#endif /* MCA_RML_BASE_H */

orte/mca/rml/base/rml_base_frame.c

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,10 @@ orte_rml_base_module_t orte_rml = {
5252
orte_rml_API_add_exception_handler,
5353
orte_rml_API_del_exception_handler,
5454
orte_rml_API_ft_event,
55-
orte_rml_API_purge
55+
orte_rml_API_purge,
56+
orte_rml_API_query_transports,
57+
orte_rml_API_send_transport_nb,
58+
orte_rml_API_send_buffer_transport_nb
5659
};
5760

5861
orte_rml_base_t orte_rml_base = {{{0}}};
@@ -173,12 +176,14 @@ int orte_rml_base_select(void)
173176

174177
if (NULL == ((orte_rml_component_t *)component)->rml_init) {
175178
opal_output_verbose(10, orte_rml_base_framework.framework_output,
176-
"orte_rml_base_select: no init function; ignoring component [%s]",component->mca_component_name);
179+
"orte_rml_base_select: no init function; ignoring component [%s]",
180+
component->mca_component_name);
177181
} else {
178182
module = (mca_base_module_t *) ((orte_rml_component_t *)component)->rml_init(&priority);
179183
if (NULL == module) {
180184
opal_output_verbose(10, orte_rml_base_framework.framework_output,
181-
"orte_rml_base_select: init returned failure [%s]",component->mca_component_name);
185+
"orte_rml_base_select: init returned failure [%s]",
186+
component->mca_component_name);
182187
continue;
183188
}
184189

@@ -213,7 +218,7 @@ int orte_rml_base_select(void)
213218
opal_output(0, "\tComponent: %s Priority: %d", mod->component->mca_component_name, mod->pri);
214219
}
215220
}
216-
221+
217222
return ORTE_SUCCESS;
218223
}
219224

0 commit comments

Comments
 (0)