|
27 | 27 | #include "ompi/mca/mtl/base/base.h"
|
28 | 28 | #include "ompi/mca/mtl/base/mtl_base_datatype.h"
|
29 | 29 | #include "ompi/message/message.h"
|
| 30 | +#include "opal/mca/timer/base/base.h" |
30 | 31 |
|
31 | 32 | #include "mtl_portals4.h"
|
32 | 33 | #include "mtl_portals4_endpoint.h"
|
@@ -81,6 +82,7 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target,
|
81 | 82 | frag->frag_remote_offset = remote_offset + i * ompi_mtl_portals4.max_msg_size_mtl;
|
82 | 83 |
|
83 | 84 | frag->event_callback = ompi_mtl_portals4_rndv_get_frag_progress;
|
| 85 | + frag->frag_abs_timeout_usec = 0; |
84 | 86 |
|
85 | 87 | OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "GET (fragment %d/%d, size %ld) send",
|
86 | 88 | i + 1, frag_count, frag->frag_length));
|
@@ -322,17 +324,41 @@ ompi_mtl_portals4_rndv_get_frag_progress(ptl_event_t *ev,
|
322 | 324 | ompi_mtl_portals4_recv_request_t* ptl_request =
|
323 | 325 | (ompi_mtl_portals4_recv_request_t*) rndv_get_frag->request;
|
324 | 326 |
|
325 |
| - assert(ev->type==PTL_EVENT_REPLY); |
| 327 | + assert(PTL_EVENT_REPLY == ev->type); |
326 | 328 |
|
327 | 329 | OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
|
328 | 330 | "Recv %lu (0x%lx) got reply event",
|
329 | 331 | ptl_request->opcount, ptl_request->hdr_data));
|
330 | 332 |
|
| 333 | + |
331 | 334 | if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_OK)) {
|
332 | 335 | opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
|
333 | 336 | "%s:%d: PTL_EVENT_REPLY with ni_fail_type: %d",
|
334 | 337 | __FILE__, __LINE__, ev->ni_fail_type);
|
335 | 338 |
|
| 339 | + if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_DROPPED)) { |
| 340 | + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, |
| 341 | + "PTL_EVENT_REPLY with ni_fail_type: %u => cannot retry", |
| 342 | + (uint32_t)ev->ni_fail_type); |
| 343 | + ret = PTL_FAIL; |
| 344 | + goto callback_error; |
| 345 | + } |
| 346 | + |
| 347 | + if (0 == rndv_get_frag->frag_abs_timeout_usec) { |
| 348 | + /* this is the first retry of the frag. start the timer. */ |
| 349 | + /* instead of recording the start time, record the end time |
| 350 | + * and avoid addition on each retry. */ |
| 351 | + rndv_get_frag->frag_abs_timeout_usec = opal_timer_base_get_usec() + ompi_mtl_portals4.get_retransmit_timeout; |
| 352 | + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, |
| 353 | + "setting frag timeout at %lu", |
| 354 | + rndv_get_frag->frag_abs_timeout_usec); |
| 355 | + } else if (opal_timer_base_get_usec() >= rndv_get_frag->frag_abs_timeout_usec) { |
| 356 | + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, |
| 357 | + "timeout retrying GET"); |
| 358 | + ret = PTL_FAIL; |
| 359 | + goto callback_error; |
| 360 | + } |
| 361 | + |
336 | 362 | OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
|
337 | 363 | "Rendezvous Get Failed: Reissuing frag #%u", rndv_get_frag->frag_num));
|
338 | 364 |
|
|
0 commit comments