Skip to content

Commit 23fd874

Browse files
committed
osc/rdma: Fix some bugs running with btl/tcp.
- Make sure peer->state_endpoint is set correctly.
- Fix double free of pending_op in ompi_osc_rdma_btl_fop() and ompi_osc_rdma_btl_op().

Cleanup/leaks:
- Don't parse ompi_osc_rdma_btl_alternate_names twice.
- Free temp in allocate_state_shared().

Signed-off-by: Austen Lauria <[email protected]>
1 parent ff1ba01 commit 23fd874

File tree

2 files changed

+12
-20
lines changed

2 files changed

+12
-20
lines changed

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -581,7 +581,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
581581
int my_rank = ompi_comm_rank (module->comm);
582582
int global_size = ompi_comm_size (module->comm);
583583
ompi_osc_rdma_region_t *state_region;
584-
struct _local_data *temp;
584+
struct _local_data *temp = NULL;
585585
char *data_file;
586586
int page_size = opal_getpagesize();
587587

@@ -624,13 +624,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
624624
size += OPAL_ALIGN_PAD_AMOUNT(size, page_size);
625625
}
626626

627-
do {
628-
temp = calloc (local_size, sizeof (temp[0]));
629-
if (NULL == temp) {
630-
ret = OMPI_ERR_OUT_OF_RESOURCE;
631-
break;
632-
}
627+
temp = calloc (local_size, sizeof (temp[0]));
628+
if (NULL == temp) {
629+
return OMPI_ERR_OUT_OF_RESOURCE;
630+
}
633631

632+
do {
634633
temp[local_rank].rank = my_rank;
635634
temp[local_rank].size = size;
636635

@@ -788,10 +787,9 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
788787
peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data;
789788
}
790789
peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i);
791-
if (i > 0) {
792-
peer->state_endpoint = local_leader->state_endpoint;
793-
peer->state_btl_index = local_leader->state_btl_index;
794-
}
790+
peer->state_endpoint = local_leader->data_endpoint; // data_endpoint initialized in ompi_osc_rdma_new_peer();
791+
peer->state_btl_index = local_leader->data_btl_index;
792+
assert(peer->state_endpoint != NULL);
795793
}
796794

797795
if (my_rank == peer_rank) {
@@ -914,10 +912,8 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
914912
static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module)
915913
{
916914
mca_btl_base_selected_module_t *item;
917-
char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
918915
int btls_found = 0;
919-
920-
btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
916+
char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
921917
if (NULL == btls_to_use) {
922918
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names);
923919
return OMPI_ERR_UNREACH;

ompi/mca/osc/rdma/osc_rdma_lock.h

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,7 @@ static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, uint8_t
8989
*result = ((int64_t *) pending_op->op_buffer)[0];
9090
ret = OMPI_SUCCESS;
9191
ompi_osc_rdma_atomic_complete (selected_btl, endpoint, pending_op->op_buffer,
92-
pending_op->op_frag->handle, (void *) pending_op, NULL, OPAL_SUCCESS);
93-
} else {
94-
/* need to release here because ompi_osc_rdma_atomic_complete was not called */
95-
OBJ_RELEASE(pending_op);
92+
NULL, (void *) pending_op, NULL, OPAL_SUCCESS);
9693
}
9794
} else if (wait_for_completion) {
9895
while (!pending_op->op_complete) {
@@ -193,6 +190,7 @@ static inline int ompi_osc_rdma_btl_cswap (ompi_osc_rdma_module_t *module, uint8
193190
{
194191
ompi_osc_rdma_pending_op_t *pending_op;
195192
mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index);
193+
assert(selected_btl != NULL);
196194
int ret;
197195

198196
pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t);
@@ -227,8 +225,6 @@ static inline int ompi_osc_rdma_btl_cswap (ompi_osc_rdma_module_t *module, uint8
227225
ret = OMPI_SUCCESS;
228226
}
229227

230-
/* need to release here because ompi_osc_rdma_atomic_complete was not called */
231-
OBJ_RELEASE(pending_op);
232228
} else {
233229
while (!pending_op->op_complete) {
234230
ompi_osc_rdma_progress (module);

0 commit comments

Comments
 (0)