From 038291a399474a0f0061d914d667e7b4ed66d643 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Thu, 13 May 2021 11:41:19 -0600 Subject: [PATCH] PMIx_Connect usage: add optional timeout Add an MCA parameter that can be used to set a timeout on the PMIx_Connect operation used to support MPI_Comm_accept/connect and relatives. Related to #8958 Signed-off-by: Howard Pritchard --- ompi/dpm/dpm.c | 9 +++++++-- ompi/runtime/ompi_mpi_params.c | 10 ++++++++++ ompi/runtime/params.h | 6 ++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c index b73e2eaa7d3..f06a9f4cad1 100644 --- a/ompi/dpm/dpm.c +++ b/ompi/dpm/dpm.c @@ -21,6 +21,8 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2021 Nanook Consulting. All rights reserved. + * Copyright (c) 2021 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -104,7 +106,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, bool dense, isnew; opal_process_name_t pname; opal_list_t ilist, mlist, rlist; - pmix_info_t info; + pmix_info_t info, tinfo; pmix_value_t pval; pmix_pdata_t pdat; pmix_proc_t *procs, pxproc; @@ -373,7 +375,10 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, /* tell the host RTE to connect us - this will download * all known data for the nspace's of participating procs * so that add_procs will not result in a slew of lookups */ - pret = PMIx_Connect(procs, nprocs, NULL, 0); + PMIX_INFO_CONSTRUCT(&tinfo); + PMIX_INFO_LOAD(&tinfo, PMIX_TIMEOUT, &ompi_pmix_connect_timeout, PMIX_UINT32); + pret = PMIx_Connect(procs, nprocs, &tinfo, 1); + PMIX_INFO_DESTRUCT(&tinfo); PMIX_PROC_FREE(procs, nprocs); rc = opal_pmix_convert_status(pret); if (OPAL_SUCCESS != rc) { diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index 0371228ed97..52a3c7e0257 100644 --- 
a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -20,6 +20,8 @@ * All rights reserved. * Copyright (c) 2016-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2021 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -83,6 +85,7 @@ bool ompi_mpi_compat_mpi3 = false; char *ompi_mpi_spc_attach_string = NULL; bool ompi_mpi_spc_dump_enabled = false; +uint32_t ompi_pmix_connect_timeout; static bool show_default_mca_params = false; static bool show_file_mca_params = false; @@ -391,6 +394,13 @@ int ompi_mpi_register_params(void) &ompi_mpi_spc_dump_enabled); #endif // SPC_ENABLE + ompi_pmix_connect_timeout = 0; /* infinite timeout - see PMIx standard */ + (void) mca_base_var_register ("ompi", "mpi", NULL, "pmix_connect_timeout", + "Timeout(secs) for calls to PMIx_Connect. Default is no timeout.", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, + 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, + &ompi_pmix_connect_timeout); + return OMPI_SUCCESS; } diff --git a/ompi/runtime/params.h b/ompi/runtime/params.h index 91a9120638a..9e3e9b6d086 100644 --- a/ompi/runtime/params.h +++ b/ompi/runtime/params.h @@ -16,6 +16,8 @@ * Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Intel, Inc. All rights reserved + * Copyright (c) 2021 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -172,6 +174,10 @@ OMPI_DECLSPEC extern char * ompi_mpi_spc_attach_string; */ OMPI_DECLSPEC extern bool ompi_mpi_spc_dump_enabled; +/** + * Timeout for calls to PMIx_Connect(default 0, no timeout) + */ +OMPI_DECLSPEC extern uint32_t ompi_pmix_connect_timeout; /** * Register MCA parameters used by the MPI layer.