Skip to content

Commit 0ce8590

Browse files
authored
Merge pull request #3837 from tkordenbrock/topic/master/get.retry.timeout
master: mtl-portals4: add timeout to rendezvous get fragments
2 parents 6fb81f2 + 5ecd905 commit 0ce8590

File tree

4 files changed: 41 additions & 1 deletion

ompi/mca/mtl/portals4/mtl_portals4.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -73,6 +73,7 @@ struct mca_mtl_portals4_module_t {
7373

7474
/* free list of rendezvous get fragments */
7575
opal_free_list_t fl_rndv_get_frag;
76+
int get_retransmit_timeout;
7677

7778
/** Network interface handle for matched interface */
7879
ptl_handle_ni_t ni_h;

ompi/mca/mtl/portals4/mtl_portals4_component.c

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -202,6 +202,16 @@ ompi_mtl_portals4_component_register(void)
202202
MCA_BASE_VAR_SCOPE_READONLY,
203203
&ompi_mtl_portals4.max_msg_size_mtl);
204204

205+
ompi_mtl_portals4.get_retransmit_timeout=10000;
206+
(void) mca_base_component_var_register(&mca_mtl_portals4_component.mtl_version,
207+
"get_retransmit_timeout",
208+
"PtlGET retransmission timeout in usec",
209+
MCA_BASE_VAR_TYPE_INT,
210+
NULL, 0, 0,
211+
OPAL_INFO_LVL_5,
212+
MCA_BASE_VAR_SCOPE_READONLY,
213+
&ompi_mtl_portals4.get_retransmit_timeout);
214+
205215
OBJ_RELEASE(new_enum);
206216
if (0 > ret) {
207217
return OMPI_ERR_NOT_SUPPORTED;

ompi/mca/mtl/portals4/mtl_portals4_recv.c

Lines changed: 27 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
#include "ompi/mca/mtl/base/base.h"
2828
#include "ompi/mca/mtl/base/mtl_base_datatype.h"
2929
#include "ompi/message/message.h"
30+
#include "opal/mca/timer/base/base.h"
3031

3132
#include "mtl_portals4.h"
3233
#include "mtl_portals4_endpoint.h"
@@ -81,6 +82,7 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target,
8182
frag->frag_remote_offset = remote_offset + i * ompi_mtl_portals4.max_msg_size_mtl;
8283

8384
frag->event_callback = ompi_mtl_portals4_rndv_get_frag_progress;
85+
frag->frag_abs_timeout_usec = 0;
8486

8587
OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "GET (fragment %d/%d, size %ld) send",
8688
i + 1, frag_count, frag->frag_length));
@@ -322,17 +324,41 @@ ompi_mtl_portals4_rndv_get_frag_progress(ptl_event_t *ev,
322324
ompi_mtl_portals4_recv_request_t* ptl_request =
323325
(ompi_mtl_portals4_recv_request_t*) rndv_get_frag->request;
324326

325-
assert(ev->type==PTL_EVENT_REPLY);
327+
assert(PTL_EVENT_REPLY == ev->type);
326328

327329
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
328330
"Recv %lu (0x%lx) got reply event",
329331
ptl_request->opcount, ptl_request->hdr_data));
330332

333+
331334
if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_OK)) {
332335
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
333336
"%s:%d: PTL_EVENT_REPLY with ni_fail_type: %d",
334337
__FILE__, __LINE__, ev->ni_fail_type);
335338

339+
if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_DROPPED)) {
340+
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
341+
"PTL_EVENT_REPLY with ni_fail_type: %u => cannot retry",
342+
(uint32_t)ev->ni_fail_type);
343+
ret = PTL_FAIL;
344+
goto callback_error;
345+
}
346+
347+
if (0 == rndv_get_frag->frag_abs_timeout_usec) {
348+
/* this is the first retry of the frag. start the timer. */
349+
/* instead of recording the start time, record the end time
350+
* and avoid addition on each retry. */
351+
rndv_get_frag->frag_abs_timeout_usec = opal_timer_base_get_usec() + ompi_mtl_portals4.get_retransmit_timeout;
352+
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
353+
"setting frag timeout at %lu",
354+
rndv_get_frag->frag_abs_timeout_usec);
355+
} else if (opal_timer_base_get_usec() >= rndv_get_frag->frag_abs_timeout_usec) {
356+
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
357+
"timeout retrying GET");
358+
ret = PTL_FAIL;
359+
goto callback_error;
360+
}
361+
336362
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
337363
"Rendezvous Get Failed: Reissuing frag #%u", rndv_get_frag->frag_num));
338364

ompi/mca/mtl/portals4/mtl_portals4_request.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222

2323
#include "opal/datatype/opal_convertor.h"
2424
#include "ompi/mca/mtl/mtl.h"
25+
#include "opal/mca/timer/base/base.h"
2526

2627
struct ompi_mtl_portals4_message_t;
2728
struct ompi_mtl_portals4_pending_request_t;
@@ -93,6 +94,8 @@ struct ompi_mtl_portals4_rndv_get_frag_t {
9394
ptl_process_t frag_target;
9495
ptl_hdr_data_t frag_match_bits;
9596
ptl_size_t frag_remote_offset;
97+
/* the absolute time at which this frag times out */
98+
opal_timer_t frag_abs_timeout_usec;
9699

97100
int (*event_callback)(ptl_event_t *ev, struct ompi_mtl_portals4_rndv_get_frag_t*);
98101

0 commit comments

Comments
 (0)