|
4 | 4 | * reserved.
|
5 | 5 | * Copyright (c) 2018 Triad National Security, LLC. All rights
|
6 | 6 | * reserved.
|
7 |
| - * Copyright (c) 2019 Google, LLC. All rights reserved. |
| 7 | + * Copyright (c) 2019-2025 Google, LLC. All rights reserved. |
8 | 8 | * $COPYRIGHT$
|
9 | 9 | *
|
10 | 10 | * Additional copyrights may follow
|
|
16 | 16 | #include "btl_uct.h"
|
17 | 17 | #include "btl_uct_am.h"
|
18 | 18 | #include "btl_uct_device_context.h"
|
| 19 | +#include "opal/mca/timer/base/base.h" |
19 | 20 | #include "opal/util/proc.h"
|
20 | 21 |
|
21 | 22 | static void mca_btl_uct_endpoint_construct(mca_btl_uct_endpoint_t *endpoint)
|
@@ -257,21 +258,17 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl,
|
257 | 258 | return OPAL_SUCCESS;
|
258 | 259 | }
|
259 | 260 |
|
260 |
| -static int mca_btl_uct_endpoint_connect_endpoint( |
| 261 | +static int mca_btl_uct_endpoint_send_connection_data( |
261 | 262 | mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl,
|
262 | 263 | mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint,
|
263 |
| - uint8_t *tl_data, uint8_t *conn_tl_data, void *ep_addr) |
| 264 | + uint8_t *conn_tl_data, int request_type) |
264 | 265 | {
|
265 |
| - size_t request_length = sizeof(mca_btl_uct_conn_req_t) |
266 |
| - + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len; |
267 |
| - mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep; |
268 | 266 | mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl;
|
269 | 267 | mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0];
|
270 |
| - mca_btl_uct_conn_req_t *request = alloca(request_length); |
| 268 | + mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep; |
271 | 269 | uct_device_addr_t *device_addr = NULL;
|
272 | 270 | uct_iface_addr_t *iface_addr;
|
273 | 271 | ucs_status_t ucs_status;
|
274 |
| - int rc; |
275 | 272 |
|
276 | 273 | assert(NULL != conn_tl);
|
277 | 274 |
|
@@ -302,15 +299,50 @@ static int mca_btl_uct_endpoint_connect_endpoint(
|
302 | 299 | ucs_status));
|
303 | 300 | return OPAL_ERROR;
|
304 | 301 | }
|
305 |
| - } else { |
306 |
| - OBJ_RETAIN(conn_ep); |
307 | 302 | }
|
308 | 303 |
|
| 304 | + size_t request_length = sizeof(mca_btl_uct_conn_req_t) |
| 305 | + + MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len; |
| 306 | + mca_btl_uct_conn_req_t *request = alloca(request_length); |
| 307 | + |
309 | 308 | /* fill in common request parameters */
|
310 | 309 | request->proc_name = OPAL_PROC_MY_NAME;
|
311 | 310 | request->context_id = tl_context->context_id;
|
312 | 311 | request->tl_index = tl->tl_index;
|
313 |
| - request->type = !!(ep_addr); |
| 312 | + request->type = request_type; |
| 313 | + |
| 314 | + /* fill in connection request */ |
| 315 | + ucs_status = uct_ep_get_address(tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr); |
| 316 | + if (UCS_OK != ucs_status) { |
| 317 | + /* this is a fatal error */ |
| 318 | + OBJ_RELEASE(endpoint->conn_ep); |
| 319 | + uct_ep_destroy(tl_endpoint->uct_ep); |
| 320 | + tl_endpoint->uct_ep = NULL; |
| 321 | + return OPAL_ERROR; |
| 322 | + } |
| 323 | + |
| 324 | + /* let the remote side know that the connection has been established and |
| 325 | + * wait for the message to be sent */ |
| 326 | + int rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, conn_tl_context, request, |
| 327 | + request_length); |
| 328 | + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { |
| 329 | + OBJ_RELEASE(endpoint->conn_ep); |
| 330 | + uct_ep_destroy(tl_endpoint->uct_ep); |
| 331 | + tl_endpoint->uct_ep = NULL; |
| 332 | + return OPAL_ERROR; |
| 333 | + } |
| 334 | + |
| 335 | + tl_endpoint->last_connection_req = opal_timer_base_get_usec(); |
| 336 | + |
| 337 | + return OPAL_SUCCESS; |
| 338 | +} |
| 339 | + |
| 340 | +static int mca_btl_uct_endpoint_connect_endpoint( |
| 341 | + mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl, |
| 342 | + mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint, |
| 343 | + uint8_t *tl_data, uint8_t *conn_tl_data, void *ep_addr) |
| 344 | +{ |
| 345 | + ucs_status_t ucs_status; |
314 | 346 |
|
315 | 347 | if (NULL == tl_endpoint->uct_ep) {
|
316 | 348 | BTL_VERBOSE(("allocating endpoint for peer %s and sending connection data",
|
@@ -338,29 +370,15 @@ static int mca_btl_uct_endpoint_connect_endpoint(
|
338 | 370 | }
|
339 | 371 | }
|
340 | 372 |
|
341 |
| - /* fill in connection request */ |
342 |
| - ucs_status = uct_ep_get_address(tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr); |
343 |
| - if (UCS_OK != ucs_status) { |
344 |
| - /* this is a fatal a fatal error */ |
345 |
| - OBJ_RELEASE(endpoint->conn_ep); |
346 |
| - uct_ep_destroy(tl_endpoint->uct_ep); |
347 |
| - tl_endpoint->uct_ep = NULL; |
348 |
| - return OPAL_ERROR; |
349 |
| - } |
350 |
| - |
351 |
| - /* let the remote side know that the connection has been established and |
352 |
| - * wait for the message to be sent */ |
353 |
| - rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, conn_tl_context, request, |
354 |
| - request_length); |
355 |
| - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { |
356 |
| - OBJ_RELEASE(endpoint->conn_ep); |
357 |
| - uct_ep_destroy(tl_endpoint->uct_ep); |
358 |
| - tl_endpoint->uct_ep = NULL; |
359 |
| - return OPAL_ERROR; |
| 373 | + opal_timer_t now = opal_timer_base_get_usec(); |
| 374 | + if ((now - tl_endpoint->last_connection_req) < mca_btl_uct_component.connection_retry_timeout && !ep_addr) { |
| 375 | + return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? OPAL_SUCCESS |
| 376 | + : OPAL_ERR_OUT_OF_RESOURCE; |
360 | 377 | }
|
361 | 378 |
|
362 |
| - return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? OPAL_SUCCESS |
363 |
| - : OPAL_ERR_OUT_OF_RESOURCE; |
| 379 | + int rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, endpoint, tl, tl_context, tl_endpoint, |
| 380 | + conn_tl_data, /*request_type=*/!!ep_addr); |
| 381 | + return (OPAL_SUCCESS == rc) ? OPAL_ERR_OUT_OF_RESOURCE : rc; |
364 | 382 | }
|
365 | 383 |
|
366 | 384 | int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endpoint_t *endpoint,
|
|
0 commit comments