Skip to content

Commit dad72d3

Browse files
committed
osc/rdma: Fix some bugs running with btl/tcp.
- Make sure peer->state_endpoint is set correctly.
- Fix double free of pending_op in ompi_osc_rdma_btl_fop() and ompi_osc_rdma_btl_op().

Cleanup/leaks:
- Don't parse ompi_osc_rdma_btl_alternate_names twice.
- free temp in allocate_state_shared().

Signed-off-by: Austen Lauria <[email protected]>
1 parent f455578 commit dad72d3

File tree

2 files changed

+9
-14
lines changed

2 files changed

+9
-14
lines changed

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
529529
my_peer->state_handle = module->state_handle;
530530
my_peer->state_btl_index = my_peer->data_btl_index;
531531
my_peer->state_endpoint = my_peer->data_endpoint;
532+
assert(my_peer -> state_endpoint != NULL);
532533
}
533534

534535
if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) {
@@ -581,7 +582,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
581582
int my_rank = ompi_comm_rank (module->comm);
582583
int global_size = ompi_comm_size (module->comm);
583584
ompi_osc_rdma_region_t *state_region;
584-
struct _local_data *temp;
585+
struct _local_data *temp = NULL;
585586
char *data_file;
586587
int page_size = opal_getpagesize();
587588

@@ -625,6 +626,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
625626
}
626627

627628
do {
629+
free(temp); // free from prior iteration. Should be initialized to NULL.
628630
temp = calloc (local_size, sizeof (temp[0]));
629631
if (NULL == temp) {
630632
ret = OMPI_ERR_OUT_OF_RESOURCE;
@@ -788,10 +790,9 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
788790
peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data;
789791
}
790792
peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i);
791-
if (i > 0) {
792-
peer->state_endpoint = local_leader->state_endpoint;
793-
peer->state_btl_index = local_leader->state_btl_index;
794-
}
793+
peer->state_endpoint = local_leader->data_endpoint; // data_endpoint initialized in ompi_osc_rdma_new_peer();
794+
peer->state_btl_index = local_leader->data_btl_index;
795+
assert(peer->state_endpoint != NULL);
795796
}
796797

797798
if (my_rank == peer_rank) {
@@ -914,10 +915,8 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
914915
static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module)
915916
{
916917
mca_btl_base_selected_module_t *item;
917-
char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
918918
int btls_found = 0;
919-
920-
btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
919+
char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
921920
if (NULL == btls_to_use) {
922921
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names);
923922
return OMPI_ERR_UNREACH;

ompi/mca/osc/rdma/osc_rdma_lock.h

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,7 @@ static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, uint8_t
8989
*result = ((int64_t *) pending_op->op_buffer)[0];
9090
ret = OMPI_SUCCESS;
9191
ompi_osc_rdma_atomic_complete (selected_btl, endpoint, pending_op->op_buffer,
92-
pending_op->op_frag->handle, (void *) pending_op, NULL, OPAL_SUCCESS);
93-
} else {
94-
/* need to release here because ompi_osc_rdma_atomic_complete was not called */
95-
OBJ_RELEASE(pending_op);
92+
NULL, (void *) pending_op, NULL, OPAL_SUCCESS);
9693
}
9794
} else if (wait_for_completion) {
9895
while (!pending_op->op_complete) {
@@ -193,6 +190,7 @@ static inline int ompi_osc_rdma_btl_cswap (ompi_osc_rdma_module_t *module, uint8
193190
{
194191
ompi_osc_rdma_pending_op_t *pending_op;
195192
mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index);
193+
assert(selected_btl != NULL);
196194
int ret;
197195

198196
pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t);
@@ -227,8 +225,6 @@ static inline int ompi_osc_rdma_btl_cswap (ompi_osc_rdma_module_t *module, uint8
227225
ret = OMPI_SUCCESS;
228226
}
229227

230-
/* need to release here because ompi_osc_rdma_atomic_complete was not called */
231-
OBJ_RELEASE(pending_op);
232228
} else {
233229
while (!pending_op->op_complete) {
234230
ompi_osc_rdma_progress (module);

0 commit comments

Comments
 (0)