From 1cf972dcaf64d31e2892360cf8435bb88b19c566 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 25 Mar 2020 15:43:27 -0700 Subject: [PATCH 1/4] Update PMIx and PRRTE Deprecate --am and --amca options Avoid default param files on backend nodes Any parameters in the PRRTE default or user param files will have been picked up by prte and included in the environment sent to the prted, so don't open those files on the backend. Avoid picking up MCA param file info on backend Avoid the scaling problem at PRRTE startup by only reading the system and user param files on the frontend. Complete revisions to cmd line parser for OMPI Per specification, enforce the following precedence order: 1. system-level default parameter file 2. user-level default parameter file 3. Anything found in the environment 4. "--tune" files. Note that "--amca" goes away and becomes equivalent to "--tune". Okay if it is provided more than once on a cmd line (we will aggregate the list of files, retaining order), but an error if a parameter is referenced in more than one file with a different value 5. "--mca" options. Again, error if the same option appears more than once with a different value. Allowed to override a parameter referenced in a "tune" file 6. "-x" options. Allowed to overwrite options given in a "tune" file, but cannot conflict with an explicit "--mca" option 7. 
all other options Fix special handling of "-np" Get agreement on jobid across the layers Need all three pieces (PRRTE, PMIx, and OPAL) to agree on the nspace conversion to jobid method Ensure prte show_help messages get output Print abnormal termination messages Clean up error reporting in persistent operations Signed-off-by: Ralph Castain Signed-off-by: Ralph Castain --- opal/mca/pmix/base/pmix_base_fns.c | 76 +++++++++++++++++++----------- opal/mca/pmix/pmix-internal.h | 6 +++ opal/mca/pmix/pmix4x/openpmix | 2 +- prrte | 2 +- 4 files changed, 57 insertions(+), 29 deletions(-) diff --git a/opal/mca/pmix/base/pmix_base_fns.c b/opal/mca/pmix/base/pmix_base_fns.c index 7eeac64e9ba..fa9b446a8f7 100644 --- a/opal/mca/pmix/base/pmix_base_fns.c +++ b/opal/mca/pmix/base/pmix_base_fns.c @@ -110,18 +110,14 @@ int opal_pmix_convert_jobid(pmix_nspace_t nspace, opal_jobid_t jobid) /* zero out the nspace */ PMIX_LOAD_NSPACE(nspace, NULL); - if (opal_process_info.nativelaunch) { - opal_snprintf_jobid(nspace, PMIX_MAX_NSLEN, jobid); - return OPAL_SUCCESS; - } else { - /* cycle across our list of known jobids */ - OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) { - if (jobid == nptr->jobid) { - PMIX_LOAD_NSPACE(nspace, nptr->nspace); - return OPAL_SUCCESS; - } + /* cycle across our list of known jobids */ + OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) { + if (jobid == nptr->jobid) { + PMIX_LOAD_NSPACE(nspace, nptr->nspace); + return OPAL_SUCCESS; } } + return OPAL_ERR_NOT_FOUND; } @@ -129,29 +125,55 @@ int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t nspace) { opal_nptr_t *nptr; opal_jobid_t jid; + uint16_t jobfam; + uint32_t hash32, localjob = 0; + char *p = NULL; /* set a default */ *jobid = OPAL_JOBID_INVALID; - if (opal_process_info.nativelaunch) { - return opal_convert_string_to_jobid(jobid, nspace); - } else { - /* cycle across our list of known jobids */ - OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) { - if (PMIX_CHECK_NSPACE(nspace, 
nptr->nspace)) { - *jobid = nptr->jobid; - return OPAL_SUCCESS; - } + /* if the nspace is empty, there is nothing more to do */ + if (0 == strlen(nspace)) { + return OPAL_SUCCESS; + } + if (NULL != strstr(nspace, "JOBID_WILDCARD")) { + *jobid = OPAL_JOBID_WILDCARD; + return OPAL_SUCCESS; + } + if (NULL != strstr(nspace, "JOBID_INVALID")) { + *jobid = OPAL_JOBID_INVALID; + return OPAL_SUCCESS; + } + + /* cycle across our list of known jobids */ + OPAL_LIST_FOREACH(nptr, &localnspaces, opal_nptr_t) { + if (PMIX_CHECK_NSPACE(nspace, nptr->nspace)) { + *jobid = nptr->jobid; + return OPAL_SUCCESS; } - /* if we get here, we don't know this nspace */ - OPAL_HASH_STR(nspace, jid); - jid &= ~(0x8000); - *jobid = jid; - nptr = OBJ_NEW(opal_nptr_t); - nptr->jobid = jid; - PMIX_LOAD_NSPACE(nptr->nspace, nspace); - opal_list_append(&localnspaces, &nptr->super); } + /* if we get here, we don't know this nspace */ + /* find the "." at the end that indicates the child job */ + if (NULL != (p = strrchr(nspace, '.'))) { + *p = '\0'; + } + OPAL_HASH_STR(nspace, hash32); + if (NULL != p) { + *p = '.'; + ++p; + localjob = strtoul(p, NULL, 10); + } + + /* now compress to 16-bits */ + jobfam = (uint16_t)(((0x0000ffff & (0xffff0000 & hash32) >> 16)) ^ (0x0000ffff & hash32)); + jid = (0xffff0000 & ((uint32_t)jobfam << 16)) | (0x0000ffff & localjob); + *jobid = jid; + /* save this jobid/nspace pair */ + nptr = OBJ_NEW(opal_nptr_t); + nptr->jobid = jid; + PMIX_LOAD_NSPACE(nptr->nspace, nspace); + opal_list_append(&localnspaces, &nptr->super); + return OPAL_SUCCESS; } diff --git a/opal/mca/pmix/pmix-internal.h b/opal/mca/pmix/pmix-internal.h index d8f8dd0cfb7..12a7d670c41 100644 --- a/opal/mca/pmix/pmix-internal.h +++ b/opal/mca/pmix/pmix-internal.h @@ -595,9 +595,11 @@ OPAL_DECLSPEC int opal_pmix_convert_nspace(opal_jobid_t *jobid, pmix_nspace_t ns OPAL_DECLSPEC void opal_pmix_setup_nspace_tracker(void); OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void); +/* convert jobid to nspace 
*/ #define OPAL_PMIX_CONVERT_JOBID(n, j) \ opal_pmix_convert_jobid((n), (j)) +/* convert vpid to rank */ #define OPAL_PMIX_CONVERT_VPID(r, v) \ do { \ if (OPAL_VPID_WILDCARD == (v)) { \ @@ -607,6 +609,7 @@ OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void); } \ } while(0) +/* convert opal_process_name_t to pmix_proc_t */ #define OPAL_PMIX_CONVERT_NAME(p, n) \ do { \ OPAL_PMIX_CONVERT_JOBID((p)->nspace, (n)->jobid); \ @@ -614,9 +617,11 @@ OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void); } while(0) +/* convert nspace to jobid */ #define OPAL_PMIX_CONVERT_NSPACE(r, j, n) \ (r) = opal_pmix_convert_nspace((j), (n)) +/* convert pmix rank to opal vpid */ #define OPAL_PMIX_CONVERT_RANK(v, r) \ do { \ if (PMIX_RANK_WILDCARD == (r)) { \ @@ -628,6 +633,7 @@ OPAL_DECLSPEC void opal_pmix_finalize_nspace_tracker(void); } \ } while(0) +/* convert pmix_proc_t to opal_process_name_t */ #define OPAL_PMIX_CONVERT_PROCT(r, n, p) \ do { \ OPAL_PMIX_CONVERT_NSPACE((r), &(n)->jobid, (p)->nspace); \ diff --git a/opal/mca/pmix/pmix4x/openpmix b/opal/mca/pmix/pmix4x/openpmix index a18e5313829..4c62a26b319 160000 --- a/opal/mca/pmix/pmix4x/openpmix +++ b/opal/mca/pmix/pmix4x/openpmix @@ -1 +1 @@ -Subproject commit a18e53138298d61a01fec4471518140304539e8c +Subproject commit 4c62a26b319ba78feadc42679200e93041f611a2 diff --git a/prrte b/prrte index cdea5231171..8d673047b32 160000 --- a/prrte +++ b/prrte @@ -1 +1 @@ -Subproject commit cdea5231171b2fdea11269033de9e265fc7f3a63 +Subproject commit 8d673047b325a148f55c65e049aab67f1de1d318 From f88f2710541a9fa089c3d2ed518014e083ecc89a Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sun, 29 Mar 2020 11:58:43 -0700 Subject: [PATCH 2/4] Cleanup few errors associated with tool support Properly mark/detect that a daemon sourced the event broadcast to avoid reinjecting it into the PMIx server library. Correct the source field for the event notify call on launcher ready. 
Update event notification for tool support Deal with a variety of race conditions related to tool reconnection to a different server. Signed-off-by: Ralph Castain --- opal/mca/pmix/pmix4x/openpmix | 2 +- prrte | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/opal/mca/pmix/pmix4x/openpmix b/opal/mca/pmix/pmix4x/openpmix index 4c62a26b319..8c565209c21 160000 --- a/opal/mca/pmix/pmix4x/openpmix +++ b/opal/mca/pmix/pmix4x/openpmix @@ -1 +1 @@ -Subproject commit 4c62a26b319ba78feadc42679200e93041f611a2 +Subproject commit 8c565209c21f93d11e2156c0d53d73c3f6f9aaab diff --git a/prrte b/prrte index 8d673047b32..7f82facd41f 160000 --- a/prrte +++ b/prrte @@ -1 +1 @@ -Subproject commit 8d673047b325a148f55c65e049aab67f1de1d318 +Subproject commit 7f82facd41f55f49a70dc7096c668b4f38497241 From 1aabbe456d5de8fc3c3d6f91eabb8851db7d63eb Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 30 Mar 2020 16:06:40 -0700 Subject: [PATCH 3/4] Add extra libs to PRRTE binaries for external deps libevent, hwloc, and pmix can be external and may require that their libs be explicitly linked into the PRRTE binaries Signed-off-by: Ralph Castain --- config/ompi_setup_prrte.m4 | 40 +++++++++++++++++++------------------- prrte | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/config/ompi_setup_prrte.m4 b/config/ompi_setup_prrte.m4 index a83bd618f71..b814bde1b16 100644 --- a/config/ompi_setup_prrte.m4 +++ b/config/ompi_setup_prrte.m4 @@ -24,7 +24,7 @@ # AC_DEFUN([OMPI_SETUP_PRRTE],[ - OPAL_VAR_SCOPE_PUSH([opal_prrte_save_CPPFLAGS opal_prrte_save_CFLAGS opal_prrte_save_LDFLAGS opal_prrte_save_LIBS opal_prrte_args opal_prrte_save_enable_dlopen opal_prrte_save_enable_mca_dso opal_prrte_save_enable_mca_static]) + OPAL_VAR_SCOPE_PUSH([opal_prrte_save_CPPFLAGS opal_prrte_save_CFLAGS opal_prrte_save_LDFLAGS opal_prrte_save_LIBS opal_prrte_args opal_prrte_save_enable_dlopen opal_prrte_save_enable_mca_dso opal_prrte_save_enable_mca_static opal_prrte_extra_libs 
opal_prrte_extra_ltlibs opal_prrte_extra_ldflags]) opal_prrte_save_CFLAGS=$CFLAGS opal_prrte_save_CPPFLAGS=$CPPFLAGS @@ -59,29 +59,29 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[ if test "$enable_internal_rte" != "no"; then AC_MSG_RESULT([yes]) ompi_want_prrte=yes - if test -z $with_libevent || test "$with_libevent" = "internal" || test "$with_libevent" = "yes"; then - opal_prrte_libevent_arg="--with-libevent-header=$OMPI_TOP_SRCDIR/opal/mca/event/event.h" - elif test "$with_libevent" = "external"; then - opal_prrte_libevent_arg="" - else - opal_prrte_libevent_arg="--with-libevent=$with_libevent" + opal_prrte_extra_libs=$OMPI_TOP_BUILDDIR/opal/libopen-pal.la + opal_prrte_extra_ltlibs=$OMPI_TOP_BUILDDIR/opal/libopen-pal.la + + if test "$opal_event_external_support" = "yes"; then + opal_prrte_extra_libs="$opal_prrte_extra_libs $opal_event_external_LIBS" + opal_prrte_extra_ltlibs="$opal_prrte_extra_ltlibs $opal_event_external_LIBS" fi + # specifying --with-libevent-header causes prrte to ignore the with_libevent and with_libevent_libdir options + opal_prrte_libevent_arg="--with-libevent-header=$OMPI_TOP_SRCDIR/opal/mca/event/event.h" - if test -z $with_hwloc || test "$with_hwloc" = "internal" || test "$with_hwloc" = "yes"; then - opal_prrte_hwloc_arg="--with-hwloc-header=$OMPI_TOP_SRCDIR/opal/mca/hwloc/hwloc-internal.h" - elif test "$with_hwloc" = "external"; then - opal_prrte_hwloc_arg="" - else - opal_prrte_hwloc_arg="--with-hwloc=$with_hwloc" + if test "$opal_hwloc_external_support" = "yes"; then + opal_prrte_extra_libs="$opal_prrte_extra_libs $opal_hwloc_external_LIBS" + opal_prrte_extra_ltlibs="$opal_prrte_extra_ltlibs $opal_hwloc_external_LIBS" fi + # specifying --with-hwloc-header causes prrte to ignore the with_hwloc and with_hwloc_libdir options + opal_prrte_hwloc_arg="--with-hwloc-header=$OMPI_TOP_SRCDIR/opal/mca/hwloc/hwloc-internal.h" - if test -z $with_pmix || test "$with_pmix" = "internal" || test "$with_pmix" = "yes"; then - 
opal_prrte_pmix_arg="--with-pmix-header=$OMPI_TOP_SRCDIR/opal/mca/pmix/pmix-internal.h" - elif test "$with_pmix" = "external"; then - opal_prrte_pmix_arg="" - else - opal_prrte_pmix_arg="--with-pmix=$with_pmix" + if test "$opal_external_pmix_happy" = "yes"; then + opal_prrte_extra_libs="$opal_prrte_extra_libs $opal_pmix_external_LIBS" + opal_prrte_extra_ltlibs="$opal_prrte_extra_ltlibs $opal_pmix_external_LIBS" fi + # specifying --with-pmix-header causes prrte to ignore the with_pmix and with_pmix_libdir options + opal_prrte_pmix_arg="--with-pmix-header=$OMPI_TOP_SRCDIR/opal/mca/pmix/pmix-internal.h" if test -z $enable_prte_prefix_by_default || test "$enable_prte_prefix_by_default" = "yes" || test "$enable_orterun_prefix_given" = "yes"; then @@ -102,7 +102,7 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[ opal_prrte_args="$opal_prrte_args --with-platform=$with_prrte_platform" fi # add the extra libs - opal_prrte_args="$opal_prrte_args --with-prrte-extra-lib=$OMPI_TOP_BUILDDIR/opal/libopen-pal.la --with-prrte-extra-ltlib=$OMPI_TOP_BUILDDIR/opal/libopen-pal.la" + opal_prrte_args="$opal_prrte_args --with-prrte-extra-lib=\"$opal_prrte_extra_libs\" --with-prrte-extra-ltlib=\"$opal_prrte_extra_ltlibs\"" AC_MSG_CHECKING([final prrte configure args]) AC_MSG_RESULT([$opal_prrte_args]) diff --git a/prrte b/prrte index 7f82facd41f..6bab23ee556 160000 --- a/prrte +++ b/prrte @@ -1 +1 @@ -Subproject commit 7f82facd41f55f49a70dc7096c668b4f38497241 +Subproject commit 6bab23ee556e7d30586c951808a340cd0f787989 From 556b3fcc00bade0ed34044929f2aca954a2adf2d Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 31 Mar 2020 07:03:40 -0700 Subject: [PATCH 4/4] PRRTE: Return non-zero status on timeout Signed-off-by: Ralph Castain --- prrte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prrte b/prrte index 6bab23ee556..d879d566937 160000 --- a/prrte +++ b/prrte @@ -1 +1 @@ -Subproject commit 6bab23ee556e7d30586c951808a340cd0f787989 +Subproject commit 
d879d5669379ffbe093d60b76b46cc9b2aae20e1