Skip to content

Commit 72cd4ea

Browse files
Mamzi Bayatpour <mbayatpour@nvidia.com> and janjust authored; Mamzi Bayatpour <mbayatpour@nvidia.com> committed
Fixes in OSC/UCX:
- Failure in window Post and Complete routines
- Failure in Put with 1 byte for Dynamic windows
- Failure in Accumulate with noncontig dt
- Hang in PSCW
- Flush every attached win in fence for dynamic win

Signed-off-by: Mamzi Bayatpour <mbayatpour@nvidia.com>
Co-authored-by: Tomislav Janjusic <[email protected]>
1 parent 1b5dc73 commit 72cd4ea

File tree

3 files changed

+27
-9
lines changed

3 files changed

+27
-9
lines changed

ompi/mca/osc/ucx/osc_ucx_active_target.c

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,20 @@ int ompi_osc_ucx_fence(int mpi_assert, struct ompi_win_t *win) {
7474
}
7575

7676
if (!(mpi_assert & MPI_MODE_NOPRECEDE)) {
77-
ret = opal_common_ucx_wpmem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0/*ignore*/);
78-
if (ret != OMPI_SUCCESS) {
79-
return ret;
77+
if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) {
78+
int i;
79+
for (i = 0; i < OMPI_OSC_UCX_ATTACH_MAX; i++) {
80+
ret = opal_common_ucx_wpmem_flush(module->local_dynamic_win_info[i].mem,
81+
OPAL_COMMON_UCX_SCOPE_WORKER, 0/*ignore*/);
82+
if (ret != OMPI_SUCCESS) {
83+
return ret;
84+
}
85+
}
86+
} else {
87+
ret = opal_common_ucx_wpmem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0/*ignore*/);
88+
if (ret != OMPI_SUCCESS) {
89+
return ret;
90+
}
8091
}
8192
}
8293

@@ -181,14 +192,14 @@ int ompi_osc_ucx_complete(struct ompi_win_t *win) {
181192
for (i = 0; i < size; i++) {
182193
uint64_t remote_addr = module->state_addrs[module->start_grp_ranks[i]] + OSC_UCX_STATE_COMPLETE_COUNT_OFFSET; // write to state.complete_count on remote side
183194

184-
ret = opal_common_ucx_wpmem_post(module->mem, UCP_ATOMIC_POST_OP_ADD,
195+
ret = opal_common_ucx_wpmem_post(module->state_mem, UCP_ATOMIC_POST_OP_ADD,
185196
1, module->start_grp_ranks[i], sizeof(uint64_t),
186197
remote_addr);
187198
if (ret != OMPI_SUCCESS) {
188199
OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_post failed: %d", ret);
189200
}
190201

191-
ret = opal_common_ucx_wpmem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP,
202+
ret = opal_common_ucx_wpmem_flush(module->state_mem, OPAL_COMMON_UCX_SCOPE_EP,
192203
module->start_grp_ranks[i]);
193204
if (ret != OMPI_SUCCESS) {
194205
return ret;
@@ -243,10 +254,13 @@ int ompi_osc_ucx_post(struct ompi_group_t *group, int mpi_assert, struct ompi_wi
243254
uint64_t remote_addr = module->state_addrs[ranks_in_win_grp[i]] + OSC_UCX_STATE_POST_INDEX_OFFSET; // write to state.post_index on remote side
244255
uint64_t curr_idx = 0, result = 0;
245256

257+
258+
246259
/* do fop first to get an post index */
247-
ret = opal_common_ucx_wpmem_fetch(module->mem, UCP_ATOMIC_FETCH_OP_FADD,
260+
ret = opal_common_ucx_wpmem_fetch(module->state_mem, UCP_ATOMIC_FETCH_OP_FADD,
248261
1, ranks_in_win_grp[i], &result,
249262
sizeof(result), remote_addr);
263+
250264
if (ret != OMPI_SUCCESS) {
251265
ret = OMPI_ERROR;
252266
goto cleanup;
@@ -258,9 +272,12 @@ int ompi_osc_ucx_post(struct ompi_group_t *group, int mpi_assert, struct ompi_wi
258272

259273
/* do cas to send post message */
260274
do {
261-
ret = opal_common_ucx_wpmem_cmpswp(module->mem, 0, result,
262-
myrank + 1, &result, sizeof(result),
275+
276+
result = myrank + 1;
277+
ret = opal_common_ucx_wpmem_cmpswp(module->state_mem, 0, result,
278+
ranks_in_win_grp[i], &result, sizeof(result),
263279
remote_addr);
280+
264281
if (ret != OMPI_SUCCESS) {
265282
ret = OMPI_ERROR;
266283
goto cleanup;

ompi/mca/osc/ucx/osc_ucx_comm.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -656,6 +656,7 @@ int accumulate_req(const void *origin_addr, int origin_count,
656656
if (ret != OMPI_SUCCESS) {
657657
return ret;
658658
}
659+
temp_count *= target_count;
659660
}
660661
ompi_datatype_get_true_extent(temp_dt, &temp_lb, &temp_extent);
661662
temp_addr = free_ptr = malloc(temp_extent * temp_count);

ompi/mca/osc/ucx/osc_ucx_component.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -584,7 +584,7 @@ int ompi_osc_find_attached_region_position(ompi_osc_dynamic_win_info_t *dynamic_
584584
if (dynamic_wins[mid_index].base > base) {
585585
return ompi_osc_find_attached_region_position(dynamic_wins, min_index, mid_index-1,
586586
base, len, insert);
587-
} else if (base + len < dynamic_wins[mid_index].base + dynamic_wins[mid_index].size) {
587+
} else if (base + len <= dynamic_wins[mid_index].base + dynamic_wins[mid_index].size) {
588588
return mid_index;
589589
} else {
590590
return ompi_osc_find_attached_region_position(dynamic_wins, mid_index+1, max_index,

0 commit comments

Comments
 (0)