Skip to content

Commit 9cddb09

Browse files
committed
Correct PML UCX setting values in status MPI_ERROR
Signed-off-by: Aurelien Bouteiller <[email protected]> (cherry picked from commit 9bcf213)
1 parent da13d4c commit 9cddb09

File tree

2 files changed

+51
-11
lines changed

2 files changed

+51
-11
lines changed

ompi/mca/pml/ucx/pml_ucx.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright (C) 2001-2011 Mellanox Technologies Ltd. 2001-2011. ALL RIGHTS RESERVED.
3-
* Copyright (c) 2016 The University of Tennessee and The University
3+
* Copyright (c) 2016-2021 The University of Tennessee and The University
44
* of Tennessee Research Foundation. All rights
55
* reserved.
66
* Copyright (c) 2018 Research Organization for Information Science
@@ -644,7 +644,7 @@ int mca_pml_ucx_recv(void *buf, size_t count, ompi_datatype_t *datatype, int src
644644
MCA_COMMON_UCX_PROGRESS_LOOP(ompi_pml_ucx.ucp_worker) {
645645
status = ucp_request_test(req, &info);
646646
if (status != UCS_INPROGRESS) {
647-
return mca_pml_ucx_set_recv_status_safe(mpi_status, status, &info);
647+
return mca_pml_ucx_set_recv_status_public(mpi_status, status, &info);
648648
}
649649
}
650650
}
@@ -965,7 +965,7 @@ int mca_pml_ucx_iprobe(int src, int tag, struct ompi_communicator_t* comm,
965965
0, &info);
966966
if (ucp_msg != NULL) {
967967
*matched = 1;
968-
mca_pml_ucx_set_recv_status_safe(mpi_status, UCS_OK, &info);
968+
mca_pml_ucx_set_recv_status_public(mpi_status, UCS_OK, &info);
969969
} else {
970970
(++progress_count % opal_common_ucx.progress_iterations) ?
971971
(void)ucp_worker_progress(ompi_pml_ucx.ucp_worker) : opal_progress();
@@ -989,7 +989,7 @@ int mca_pml_ucx_probe(int src, int tag, struct ompi_communicator_t* comm,
989989
ucp_msg = ucp_tag_probe_nb(ompi_pml_ucx.ucp_worker, ucp_tag,
990990
ucp_tag_mask, 0, &info);
991991
if (ucp_msg != NULL) {
992-
mca_pml_ucx_set_recv_status_safe(mpi_status, UCS_OK, &info);
992+
mca_pml_ucx_set_recv_status_public(mpi_status, UCS_OK, &info);
993993
return OMPI_SUCCESS;
994994
}
995995
}
@@ -1014,7 +1014,7 @@ int mca_pml_ucx_improbe(int src, int tag, struct ompi_communicator_t* comm,
10141014
PML_UCX_MESSAGE_NEW(comm, ucp_msg, &info, message);
10151015
PML_UCX_VERBOSE(8, "got message %p (%p)", (void*)*message, (void*)ucp_msg);
10161016
*matched = 1;
1017-
mca_pml_ucx_set_recv_status_safe(mpi_status, UCS_OK, &info);
1017+
mca_pml_ucx_set_recv_status_public(mpi_status, UCS_OK, &info);
10181018
} else {
10191019
(++progress_count % opal_common_ucx.progress_iterations) ?
10201020
(void)ucp_worker_progress(ompi_pml_ucx.ucp_worker) : opal_progress();
@@ -1040,7 +1040,7 @@ int mca_pml_ucx_mprobe(int src, int tag, struct ompi_communicator_t* comm,
10401040
if (ucp_msg != NULL) {
10411041
PML_UCX_MESSAGE_NEW(comm, ucp_msg, &info, message);
10421042
PML_UCX_VERBOSE(8, "got message %p (%p)", (void*)*message, (void*)ucp_msg);
1043-
mca_pml_ucx_set_recv_status_safe(mpi_status, UCS_OK, &info);
1043+
mca_pml_ucx_set_recv_status_public(mpi_status, UCS_OK, &info);
10441044
return OMPI_SUCCESS;
10451045
}
10461046
}

ompi/mca/pml/ucx/pml_ucx_request.h

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
3-
* Copyright (c) 2016 The University of Tennessee and The University
3+
* Copyright (c) 2016-2021 The University of Tennessee and The University
44
* of Tennessee Research Foundation. All rights
55
* reserved.
66
* $COPYRIGHT$
@@ -151,6 +151,12 @@ static inline void mca_pml_ucx_request_reset(ompi_request_t *req)
151151
req->req_complete = REQUEST_PENDING;
152152
}
153153

154+
/* Use when setting a request's status field.
155+
* Note that a new function 'mca_mpl_ucx_set_send_status_public' shall
156+
* be created and used instead if updating a publicly visible status becomes
157+
* necessary (i.e., the status argument in an user-visible procedure), see the
158+
* recv_status case below for rationale.
159+
*/
154160
__opal_attribute_always_inline__
155161
static inline void mca_pml_ucx_set_send_status(ompi_status_public_t* mpi_status,
156162
ucs_status_t status)
@@ -165,6 +171,11 @@ static inline void mca_pml_ucx_set_send_status(ompi_status_public_t* mpi_status,
165171
}
166172
}
167173

174+
/* Use when setting a request's status field.
175+
* Note that the next function 'mca_mpl_ucx_set_recv_status_public' shall
176+
* be used instead when updating a publicly visible status (i.e., the
177+
* status argument in an user-visible procedure).
178+
*/
168179
static inline int mca_pml_ucx_set_recv_status(ompi_status_public_t* mpi_status,
169180
ucs_status_t ucp_status,
170181
const ucp_tag_recv_info_t *info)
@@ -180,6 +191,10 @@ static inline int mca_pml_ucx_set_recv_status(ompi_status_public_t* mpi_status,
180191
mpi_status->_ucount = info->length;
181192
} else if (ucp_status == UCS_ERR_MESSAGE_TRUNCATED) {
182193
mpi_status->MPI_ERROR = MPI_ERR_TRUNCATE;
194+
mpi_status->MPI_SOURCE = PML_UCX_TAG_GET_SOURCE(tag);
195+
mpi_status->MPI_TAG = PML_UCX_TAG_GET_MPI_TAG(tag);
196+
mpi_status->_cancelled = false;
197+
mpi_status->_ucount = info->length;
183198
} else if (ucp_status == UCS_ERR_CANCELED) {
184199
mpi_status->MPI_ERROR = MPI_SUCCESS;
185200
mpi_status->_cancelled = true;
@@ -190,16 +205,41 @@ static inline int mca_pml_ucx_set_recv_status(ompi_status_public_t* mpi_status,
190205
return mpi_status->MPI_ERROR;
191206
}
192207

193-
static inline int mca_pml_ucx_set_recv_status_safe(ompi_status_public_t* mpi_status,
208+
/* Use when setting a publicly visible status (i.e., the status argument in an
209+
* user-visible procedure).
210+
* Except in procedures that return MPI_ERR_IN_STATUS, the MPI_ERROR
211+
* field of a status object shall never be modified
212+
* See MPI-1.1 doc, sec 3.2.5, p.22
213+
*/
214+
static inline int mca_pml_ucx_set_recv_status_public(ompi_status_public_t* mpi_status,
194215
ucs_status_t ucp_status,
195216
const ucp_tag_recv_info_t *info)
196217
{
197218
if (mpi_status != MPI_STATUS_IGNORE) {
198-
return mca_pml_ucx_set_recv_status(mpi_status, ucp_status, info);
199-
} else if (OPAL_LIKELY(ucp_status == UCS_OK) || (ucp_status == UCS_ERR_CANCELED)) {
200-
return UCS_OK;
219+
if (OPAL_LIKELY(ucp_status == UCS_OK)) {
220+
uint64_t tag = info->sender_tag;
221+
mpi_status->MPI_SOURCE = PML_UCX_TAG_GET_SOURCE(tag);
222+
mpi_status->MPI_TAG = PML_UCX_TAG_GET_MPI_TAG(tag);
223+
mpi_status->_cancelled = false;
224+
mpi_status->_ucount = info->length;
225+
return MPI_SUCCESS;
226+
} else if (ucp_status == UCS_ERR_MESSAGE_TRUNCATED) {
227+
uint64_t tag = info->sender_tag;
228+
mpi_status->MPI_SOURCE = PML_UCX_TAG_GET_SOURCE(tag);
229+
mpi_status->MPI_TAG = PML_UCX_TAG_GET_MPI_TAG(tag);
230+
mpi_status->_cancelled = false;
231+
mpi_status->_ucount = info->length;
232+
return MPI_ERR_TRUNCATE;
233+
} else if (ucp_status == UCS_ERR_CANCELED) {
234+
mpi_status->_cancelled = true;
235+
return MPI_SUCCESS;
236+
} else {
237+
return MPI_ERR_INTERN;
238+
}
201239
} else if (ucp_status == UCS_ERR_MESSAGE_TRUNCATED) {
202240
return MPI_ERR_TRUNCATE;
241+
} else if (OPAL_LIKELY(ucp_status == UCS_OK) || (ucp_status == UCS_ERR_CANCELED)) {
242+
return MPI_SUCCESS;
203243
}
204244

205245
return MPI_ERR_INTERN;

0 commit comments

Comments
 (0)