Skip to content

Fix grpcomm errors (bug 1215) #1254

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits (branch names lost in page extraction)
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 72 additions & 1 deletion orte/mca/ess/base/ess_base_std_orted.c
Original file line number Diff line number Diff line change
Expand Up @@ -300,8 +300,79 @@ int orte_ess_base_orted_setup(char **hosts)
}
}
}
/* setup the global job and node arrays */
orte_job_data = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
1,
ORTE_GLOBAL_ARRAY_MAX_SIZE,
1))) {
ORTE_ERROR_LOG(ret);
error = "setup job array";
goto error;
}
orte_node_pool = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
ORTE_GLOBAL_ARRAY_MAX_SIZE,
ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
ORTE_ERROR_LOG(ret);
error = "setup node array";
goto error;
}
orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
ORTE_GLOBAL_ARRAY_MAX_SIZE,
ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
ORTE_ERROR_LOG(ret);
error = "setup node topologies array";
goto error;
}
/* Setup the job data object for the daemons */
/* create and store the job data object */
jdata = OBJ_NEW(orte_job_t);
jdata->jobid = ORTE_PROC_MY_NAME->jobid;
opal_pointer_array_set_item(orte_job_data, 0, jdata);
/* every job requires at least one app */
app = OBJ_NEW(orte_app_context_t);
opal_pointer_array_set_item(jdata->apps, 0, app);
jdata->num_apps++;
/* create and store a node object where we are */
node = OBJ_NEW(orte_node_t);
node->name = strdup(orte_process_info.nodename);
node->index = opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
/* point our topology to the one detected locally */
node->topology = opal_hwloc_topology;

/* create and store a proc object for us */
proc = OBJ_NEW(orte_proc_t);
proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
proc->pid = orte_process_info.pid;
proc->rml_uri = orte_rml.get_contact_info();
proc->state = ORTE_PROC_STATE_RUNNING;
opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);
/* record that the daemon (i.e., us) is on this node
* NOTE: we do not add the proc object to the node's
* proc array because we are not an application proc.
* Instead, we record it in the daemon field of the
* node object
*/
OBJ_RETAIN(proc); /* keep accounting straight */
node->daemon = proc;
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
node->state = ORTE_NODE_STATE_UP;
/* now point our proc node field to the node */
OBJ_RETAIN(node); /* keep accounting straight */
proc->node = node;
/* record that the daemon job is running */
jdata->num_procs = 1;
jdata->state = ORTE_JOB_STATE_RUNNING;
/* obviously, we have "reported" */
jdata->num_reported = 1;

/* Setup the communication infrastructure */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_oob_base_open";
goto error;
Expand Down