diff --git a/opal/Makefile.am b/opal/Makefile.am
index ca90f302860..e7163fca243 100644
--- a/opal/Makefile.am
+++ b/opal/Makefile.am
@@ -2,7 +2,7 @@
 # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 #                         University Research and Technology
 #                         Corporation.  All rights reserved.
-# Copyright (c) 2004-2009 The University of Tennessee and The University
+# Copyright (c) 2004-2022 The University of Tennessee and The University
 #                         of Tennessee Research Foundation.  All rights
 #                         reserved.
 # Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
@@ -70,30 +70,59 @@ DIST_SUBDIRS = \
         $(MCA_opal_FRAMEWORK_COMPONENT_ALL_SUBDIRS)
 endif
 
+noinst_LTLIBRARIES = lib@OPAL_LIB_NAME@_core.la lib@OPAL_LIB_NAME@_util.la
+
+# The first convenience library for the core functionality, everything that is shared
+# between the base and the util libraries. We need this in order to avoid duplicate
+# symbols when lib@OPAL_LIB_NAME@.la depends directly on lib@OPAL_LIB_NAME@_util.la
+# (because $MCA_opal_FRAMEWORK_LIBS already contains the _STATIC_LTLIBS added to the
+# lib@OPAL_LIB_NAME@_util.la library).
+lib@OPAL_LIB_NAME@_core_la_SOURCES =
+lib@OPAL_LIB_NAME@_core_la_LIBADD = \
+        datatype/libdatatype.la \
+        mca/base/libmca_base.la \
+        util/libopalutil.la
+
+lib@OPAL_LIB_NAME@_util_la_SOURCES =
+lib@OPAL_LIB_NAME@_util_la_LIBADD = \
+        lib@OPAL_LIB_NAME@_core.la \
+        mca/backtrace/libmca_backtrace.la \
+        $(MCA_opal_backtrace_STATIC_LTLIBS) \
+        mca/threads/libmca_threads.la \
+        $(MCA_opal_threads_STATIC_LTLIBS) \
+        mca/timer/libmca_timer.la \
+        $(MCA_opal_timer_STATIC_LTLIBS) \
+        mca/installdirs/libmca_installdirs.la \
+        $(MCA_opal_installdirs_STATIC_LTLIBS) \
+        mca/if/libmca_if.la \
+        $(MCA_opal_if_STATIC_LTLIBS) \
+        mca/dl/libmca_dl.la \
+        $(MCA_opal_dl_STATIC_LTLIBS) \
+        $(opal_libevent_LIBS)
+lib@OPAL_LIB_NAME@_util_la_DEPENDENCIES = \
+        lib@OPAL_LIB_NAME@_core.la
+
 # Build the main OPAL library
 
 lib_LTLIBRARIES = lib@OPAL_LIB_NAME@.la
 lib@OPAL_LIB_NAME@_la_SOURCES =
 lib@OPAL_LIB_NAME@_la_LIBADD = \
-        datatype/libdatatype.la \
-        mca/base/libmca_base.la \
-        util/libopalutil.la \
+        lib@OPAL_LIB_NAME@_core.la \
         $(MCA_opal_FRAMEWORK_LIBS) \
         $(opal_libevent_LIBS) \
         $(opal_hwloc_LIBS) \
         $(opal_pmix_LIBS)
 lib@OPAL_LIB_NAME@_la_DEPENDENCIES = \
-        datatype/libdatatype.la \
-        mca/base/libmca_base.la \
-        util/libopalutil.la \
+        lib@OPAL_LIB_NAME@_core.la \
         $(MCA_opal_FRAMEWORK_LIBS)
+
 lib@OPAL_LIB_NAME@_la_LDFLAGS = -version-info @libopen_pal_so_version@ \
+        $(lib@OPAL_LIB_NAME@_core_la_LDFLAGS) \
         $(opal_libevent_LDFLAGS) \
         $(opal_hwloc_LDFLAGS) \
         $(opal_pmix_LDFLAGS)
 
 # included subdirectory Makefile.am's and appended-to variables
 headers =
-noinst_LTLIBRARIES =
 dist_opaldata_DATA =
 lib@OPAL_LIB_NAME@_la_SOURCES += $(headers)
diff --git a/opal/class/Makefile.am b/opal/class/Makefile.am
index ed49054c8ea..69f8e8f6ece 100644
--- a/opal/class/Makefile.am
+++ b/opal/class/Makefile.am
@@ -3,7 +3,7 @@
 # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 #                         University Research and Technology
 #                         Corporation.  All rights reserved.
-# Copyright (c) 2004-2007 The University of Tennessee and The University
+# Copyright (c) 2004-2022 The University of Tennessee and The University
 #                         of Tennessee Research Foundation.  All rights
 #                         reserved.
 # Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -42,7 +42,7 @@ headers += \
         class/opal_rb_tree.h \
         class/opal_interval_tree.h
 
-lib@OPAL_LIB_NAME@_la_SOURCES += \
+lib@OPAL_LIB_NAME@_core_la_SOURCES += \
         class/opal_bitmap.c \
         class/opal_cstring.c \
         class/opal_free_list.c \
diff --git a/opal/mca/pmix/base/pmix_base_frame.c b/opal/mca/pmix/base/pmix_base_frame.c
index 319b21ccddc..cda84d77aba 100644
--- a/opal/mca/pmix/base/pmix_base_frame.c
+++ b/opal/mca/pmix/base/pmix_base_frame.c
@@ -1,6 +1,9 @@
 /*
  * Copyright (c) 2014-2019 Intel, Inc.  All rights reserved.
  * Copyright (c) 2015-2016 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2022      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -16,6 +19,7 @@
 #include "opal/mca/threads/thread_usage.h"
 #include "opal/util/argv.h"
 #include "opal/util/output.h"
+#include "opal/util/proc.h"
 
 #include "opal/mca/pmix/base/base.h"
 #include "opal/mca/pmix/pmix-internal.h"
@@ -59,6 +63,39 @@ static int opal_pmix_base_frame_register(mca_base_register_flag_t flags)
     return OPAL_SUCCESS;
 }
 
+/*
+ * PMIx-backed implementation installed into the opal_get_proc_hostname hook.
+ *
+ * Returns a strdup()'ed "unknown" when proc is NULL or the PMIX_HOSTNAME
+ * lookup fails, a strdup()'ed copy of opal_process_info.nodename for the
+ * local process, and otherwise the string stored by the modex lookup.
+ */
+static char*
+opal_get_proc_hostname_using_pmix(const opal_proc_t *proc)
+{
+    int ret;
+    char *hostname = NULL;
+
+    /* if the proc is NULL, then we can't know */
+    if (NULL == proc) {
+        return strdup("unknown");
+    }
+
+    /* if it is my own hostname we are after, then just hand back
+     * the value in opal_process_info */
+    if (proc == opal_proc_local_get()) {
+        return strdup(opal_process_info.nodename);
+    }
+    /* if we don't already have it, then try to get it */
+    OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &proc->proc_name, (char **) &hostname,
+                                   PMIX_STRING);
+    if (OPAL_SUCCESS != ret || NULL == hostname) {
+        return strdup("unknown"); // return something so the caller doesn't segfault
+    }
+    /* NOTE(review): every other path returns a strdup() copy; confirm who frees this one */
+    return hostname;
+}
+
 static int opal_pmix_base_frame_close(void)
 {
     int rc;
@@ -77,6 +114,8 @@ static int opal_pmix_base_frame_open(mca_base_open_flag_t flags)
     opal_pmix_base.evbase = opal_sync_event_base;
     /* pass across the verbosity */
     opal_pmix_verbose_output = opal_pmix_base_framework.framework_output;
+    /* Set the distributed name service via PMIx */
+    opal_get_proc_hostname = opal_get_proc_hostname_using_pmix;
     return rc;
 }
 
diff --git a/opal/mca/shmem/posix/shmem_posix_module.c b/opal/mca/shmem/posix/shmem_posix_module.c
index b6fbcb77da4..9ef16b2ce9f 100644
--- a/opal/mca/shmem/posix/shmem_posix_module.c
+++ b/opal/mca/shmem/posix/shmem_posix_module.c
@@ -205,7 +205,7 @@ static int segment_create(opal_shmem_ds_t *ds_buf, const char *file_name, size_t
      * already set for us :-)
      */
 
-    /* set "valid" bit because setment creation was successful */
+    /* set "valid" bit because segment creation was successful */
     OPAL_SHMEM_DS_SET_VALID(ds_buf);
 
     OPAL_OUTPUT_VERBOSE((70, opal_shmem_base_framework.framework_output,
diff --git a/opal/runtime/Makefile.am b/opal/runtime/Makefile.am
index 0f3abc64e06..b12f4b40cd2 100644
--- a/opal/runtime/Makefile.am
+++ b/opal/runtime/Makefile.am
@@ -3,7 +3,7 @@
 # Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
 #                         University Research and Technology
 #                         Corporation.  All rights reserved.
-# Copyright (c) 2004-2020 The University of Tennessee and The University
+# Copyright (c) 2004-2022 The University of Tennessee and The University
 #                         of Tennessee Research Foundation.  All rights
 #                         reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -37,10 +37,14 @@ headers += \ runtime/opal_params.h \ runtime/opal_progress_threads.h -lib@OPAL_LIB_NAME@_la_SOURCES += \ +lib@OPAL_LIB_NAME@_core_la_SOURCES += \ runtime/opal_progress.c \ - runtime/opal_finalize.c \ - runtime/opal_init.c \ runtime/opal_params.c \ + runtime/opal_util_finalize.c \ + runtime/opal_util_init.c \ runtime/opal_info_support.c \ runtime/opal_progress_threads.c + +lib@OPAL_LIB_NAME@_la_SOURCES += \ + runtime/opal_finalize.c \ + runtime/opal_init.c diff --git a/opal/runtime/opal_finalize.c b/opal/runtime/opal_finalize.c index 94cd775af5b..dd51b840616 100644 --- a/opal/runtime/opal_finalize.c +++ b/opal/runtime/opal_finalize.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University + * Copyright (c) 2004-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -43,121 +43,6 @@ #include "opal/util/show_help.h" extern int opal_initialized; -extern int opal_util_initialized; -extern bool opal_init_called; - -static opal_mutex_t opal_finalize_cleanup_fns_lock = OPAL_MUTEX_STATIC_INIT; -opal_list_t opal_finalize_cleanup_fns = {{0}}; - -struct opal_cleanup_fn_item_t { - opal_list_item_t super; - opal_cleanup_fn_t cleanup_fn; - void *user_data; -#if OPAL_ENABLE_DEBUG - char *cleanup_fn_name; -#endif -}; - -typedef struct opal_cleanup_fn_item_t opal_cleanup_fn_item_t; -OBJ_CLASS_DECLARATION(opal_cleanup_fn_item_t); - -static void opal_cleanup_fn_item_construct(opal_cleanup_fn_item_t *item) -{ -#if OPAL_ENABLE_DEBUG - item->cleanup_fn_name = NULL; -#endif -} - -static void opal_cleanup_fn_item_destruct(opal_cleanup_fn_item_t *item) -{ -#if OPAL_ENABLE_DEBUG - free(item->cleanup_fn_name); - item->cleanup_fn_name = NULL; -#endif -} - -OBJ_CLASS_INSTANCE(opal_cleanup_fn_item_t, opal_list_item_t, opal_cleanup_fn_item_construct, - opal_cleanup_fn_item_destruct); - -static void opal_finalize_domain_construct(opal_finalize_domain_t *domain) -{ - domain->domain_name = NULL; -} - -static void opal_finalize_domain_destruct(opal_finalize_domain_t *domain) -{ - free(domain->domain_name); - domain->domain_name = NULL; -} - -OBJ_CLASS_INSTANCE(opal_finalize_domain_t, opal_list_t, opal_finalize_domain_construct, - opal_finalize_domain_destruct); - -static opal_finalize_domain_t *current_finalize_domain; -opal_finalize_domain_t opal_init_util_domain = {{{0}}}; -opal_finalize_domain_t opal_init_domain = {{{0}}}; - -void opal_finalize_append_cleanup(opal_cleanup_fn_t cleanup_fn, const char *fn_name, - void *user_data) -{ - opal_cleanup_fn_item_t *cleanup_item = OBJ_NEW(opal_cleanup_fn_item_t); - assert(NULL != cleanup_item); - cleanup_item->cleanup_fn = cleanup_fn; - cleanup_item->user_data = user_data; -#if OPAL_ENABLE_DEBUG - cleanup_item->cleanup_fn_name = 
strdup(fn_name);
-    assert(NULL != cleanup_item->cleanup_fn_name);
-#else
-    (void) fn_name;
-#endif
-
-    opal_mutex_lock(&opal_finalize_cleanup_fns_lock);
-    opal_list_append(&current_finalize_domain->super, &cleanup_item->super);
-    opal_mutex_unlock(&opal_finalize_cleanup_fns_lock);
-}
-
-void opal_finalize_domain_init(opal_finalize_domain_t *domain, const char *domain_name)
-{
-    free(domain->domain_name);
-    domain->domain_name = domain_name ? strdup(domain_name) : NULL;
-}
-
-void opal_finalize_set_domain(opal_finalize_domain_t *domain)
-{
-    current_finalize_domain = domain;
-}
-
-void opal_finalize_cleanup_domain(opal_finalize_domain_t *domain)
-{
-    opal_cleanup_fn_item_t *cleanup_item, *next;
-    /* call any registered cleanup functions before tearing down OPAL */
-    OPAL_LIST_FOREACH_SAFE_REV (cleanup_item, next, &domain->super, opal_cleanup_fn_item_t) {
-        cleanup_item->cleanup_fn(cleanup_item->user_data);
-        opal_list_remove_item(&domain->super, &cleanup_item->super);
-        OBJ_RELEASE(cleanup_item);
-    }
-}
-
-int opal_finalize_util(void)
-{
-    if (--opal_util_initialized != 0) {
-        if (opal_util_initialized < 0) {
-            return OPAL_ERROR;
-        }
-        return OPAL_SUCCESS;
-    }
-
-    opal_finalize_cleanup_domain(&opal_init_util_domain);
-    OBJ_DESTRUCT(&opal_init_util_domain);
-
-    /* finalize the class/object system */
-    opal_class_finalize();
-
-    free(opal_process_info.nodename);
-    opal_process_info.nodename = NULL;
-
-    return OPAL_SUCCESS;
-}
 
 int opal_finalize(void)
 {
diff --git a/opal/runtime/opal_init.c b/opal/runtime/opal_init.c
index 169cd89af7a..dfca20d1fe3 100644
--- a/opal/runtime/opal_init.c
+++ b/opal/runtime/opal_init.c
@@ -3,7 +3,7 @@
  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
  *                         Corporation.  All rights reserved.
- * Copyright (c) 2004-2020 The University of Tennessee and The University
+ * Copyright (c) 2004-2022 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.
All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -83,339 +83,13 @@ const char opal_version_string[] = OPAL_IDENT_STRING; int opal_initialized = 0; -bool opal_init_called = false; -int opal_util_initialized = 0; /* We have to put a guess in here in case hwloc is not available. If hwloc is available, this value will be overwritten when the hwloc data is loaded. */ int opal_cache_line_size = 128; -bool opal_warn_on_fork = true; -/* If there is a preprocessor macro that redefined the call to - * gethostname, we undefine that here */ -#ifdef gethostname -# undef gethostname -#endif - -#define NUM_TRIES_FOR_NULL_HOSTNAME 8 - -/* - * This gethostname wrapper does not return the full-length hostname in - * those rare cases where it is too long for the buffer. It does, however, - * guarantee a null-terminated hostname is returned, even if it's - * truncated. It also tries again in the case where gethostname returns an - * error because the buffer is initially too short. - */ -int opal_init_gethostname(void) -{ - size_t count, length = OPAL_LOCAL_MAXHOSTNAMELEN; - int ret_val, num_tries = 0; - - char *buf = calloc(1, length); - if (NULL == buf) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - while (num_tries < NUM_TRIES_FOR_NULL_HOSTNAME) { - ++num_tries; - - /* - * Offer all but the last byte of the buffer to gethostname. - */ - ret_val = gethostname(buf, length - 1); - /* - * Terminate the buffer in the last position. - */ - buf[length - 1] = '\0'; - if (0 == ret_val) { - count = strlen(buf); - /* The result was not truncated */ - if (count > 0 && count < length - 1) { - /* - * If we got a good result, save it. This value may - * be longer than what callers to opal_gethostname() - * are expecting, so that should be checked by the - * caller. - */ - opal_process_info.nodename = buf; - return OPAL_SUCCESS; - } - /* - * "Good" cases: - * - * 0 == count: The buffer is empty. 
In some gethostname - * implementations, this can be because the - * buffer was too small. - * (length-1) == count: The result *may* be truncated. - * - * If it's one of these cases, we'll fall through to - * increase the length of the buffer and try again. - * - * If it's not one of these good cases, it's an error: - * return. - */ - else if (!(0 == count || count == length - 1)) { - free(buf); - return OPAL_ERR_IN_ERRNO; - } - } - /* - * "Good" cases: - * - * errno == EINVAL or ENAMETOOLONG: hostname was truncated and - * there was an error. Perhaps there is something - * in the buffer and perhaps not. - * - * If it's one of these cases, we'll fall through to - * increase the length of the buffer and try again. - * - * If it's not one of these good cases, it's an error: return. - */ - else if (!(EINVAL == errno || ENAMETOOLONG == errno)) { - free(buf); - return OPAL_ERR_IN_ERRNO; - } - - /* - * If we get here, it means we want to double the length of - * the buffer and try again. - */ - length *= 2; - buf = realloc(buf, length); - if (NULL == buf) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - } /* end while */ - - /* If we got here, it means that we tried too many times and are - * giving up. 
*/ - free(buf); - return OPAL_ERR_NOT_FOUND; -} - -static int opal_err2str(int errnum, const char **errmsg) -{ - const char *retval; - - switch (errnum) { - case OPAL_SUCCESS: - retval = "Success"; - break; - case OPAL_ERROR: - retval = "Error"; - break; - case OPAL_ERR_OUT_OF_RESOURCE: - retval = "Out of resource"; - break; - case OPAL_ERR_TEMP_OUT_OF_RESOURCE: - retval = "Temporarily out of resource"; - break; - case OPAL_ERR_RESOURCE_BUSY: - retval = "Resource busy"; - break; - case OPAL_ERR_BAD_PARAM: - retval = "Bad parameter"; - break; - case OPAL_ERR_FATAL: - retval = "Fatal"; - break; - case OPAL_ERR_NOT_IMPLEMENTED: - retval = "Not implemented"; - break; - case OPAL_ERR_NOT_SUPPORTED: - retval = "Not supported"; - break; - case OPAL_ERR_INTERRUPTED: - retval = "Interrupted"; - break; - case OPAL_ERR_WOULD_BLOCK: - retval = "Would block"; - break; - case OPAL_ERR_IN_ERRNO: - retval = "In errno"; - break; - case OPAL_ERR_UNREACH: - retval = "Unreachable"; - break; - case OPAL_ERR_NOT_FOUND: - retval = "Not found"; - break; - case OPAL_EXISTS: - retval = "Exists"; - break; - case OPAL_ERR_TIMEOUT: - retval = "Timeout"; - break; - case OPAL_ERR_NOT_AVAILABLE: - retval = "Not available"; - break; - case OPAL_ERR_PERM: - retval = "No permission"; - break; - case OPAL_ERR_VALUE_OUT_OF_BOUNDS: - retval = "Value out of bounds"; - break; - case OPAL_ERR_FILE_READ_FAILURE: - retval = "File read failure"; - break; - case OPAL_ERR_FILE_WRITE_FAILURE: - retval = "File write failure"; - break; - case OPAL_ERR_FILE_OPEN_FAILURE: - retval = "File open failure"; - break; - case OPAL_ERR_PACK_MISMATCH: - retval = "Pack data mismatch"; - break; - case OPAL_ERR_PACK_FAILURE: - retval = "Data pack failed"; - break; - case OPAL_ERR_UNPACK_FAILURE: - retval = "Data unpack failed"; - break; - case OPAL_ERR_UNPACK_INADEQUATE_SPACE: - retval = "Data unpack had inadequate space"; - break; - case OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER: - retval = "Data unpack would read past end of 
buffer"; - break; - case OPAL_ERR_OPERATION_UNSUPPORTED: - retval = "Requested operation is not supported on referenced data type"; - break; - case OPAL_ERR_UNKNOWN_DATA_TYPE: - retval = "Unknown data type"; - break; - case OPAL_ERR_BUFFER: - retval = "Buffer type (described vs non-described) mismatch - operation not allowed"; - break; - case OPAL_ERR_DATA_TYPE_REDEF: - retval = "Attempt to redefine an existing data type"; - break; - case OPAL_ERR_DATA_OVERWRITE_ATTEMPT: - retval = "Attempt to overwrite a data value"; - break; - case OPAL_ERR_MODULE_NOT_FOUND: - retval = "Framework requires at least one active module, but none found"; - break; - case OPAL_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED: - retval = "OS topology does not support slot_list process affinity"; - break; - case OPAL_ERR_TOPO_SOCKET_NOT_SUPPORTED: - retval = "Could not obtain socket topology information"; - break; - case OPAL_ERR_TOPO_CORE_NOT_SUPPORTED: - retval = "Could not obtain core topology information"; - break; - case OPAL_ERR_NOT_ENOUGH_SOCKETS: - retval = "Not enough sockets to meet request"; - break; - case OPAL_ERR_NOT_ENOUGH_CORES: - retval = "Not enough cores to meet request"; - break; - case OPAL_ERR_INVALID_PHYS_CPU: - retval = "Invalid physical cpu number returned"; - break; - case OPAL_ERR_MULTIPLE_AFFINITIES: - retval = "Multiple methods for assigning process affinity were specified"; - break; - case OPAL_ERR_SLOT_LIST_RANGE: - retval = "Provided slot_list range is invalid"; - break; - case OPAL_ERR_NETWORK_NOT_PARSEABLE: - retval = "Provided network specification is not parseable"; - break; - case OPAL_ERR_SILENT: - retval = NULL; - break; - case OPAL_ERR_NOT_INITIALIZED: - retval = "Not initialized"; - break; - case OPAL_ERR_NOT_BOUND: - retval = "Not bound"; - break; - case OPAL_ERR_TAKE_NEXT_OPTION: - retval = "Take next option"; - break; - case OPAL_ERR_PROC_ENTRY_NOT_FOUND: - retval = "Database entry not found"; - break; - case OPAL_ERR_DATA_VALUE_NOT_FOUND: - retval = "Data for 
specified key not found"; - break; - case OPAL_ERR_CONNECTION_FAILED: - retval = "Connection failed"; - break; - case OPAL_ERR_AUTHENTICATION_FAILED: - retval = "Authentication failed"; - break; - case OPAL_ERR_COMM_FAILURE: - retval = "Comm failure"; - break; - case OPAL_ERR_SERVER_NOT_AVAIL: - retval = "Server not available"; - break; - case OPAL_ERR_IN_PROCESS: - retval = "Operation in process"; - break; - case OPAL_ERR_DEBUGGER_RELEASE: - retval = "Release debugger"; - break; - case OPAL_ERR_HANDLERS_COMPLETE: - retval = "Event handlers complete"; - break; - case OPAL_ERR_PARTIAL_SUCCESS: - retval = "Partial success"; - break; - case OPAL_ERR_PROC_ABORTED: - retval = "Process abnormally terminated"; - break; - case OPAL_ERR_PROC_REQUESTED_ABORT: - retval = "Process requested abort"; - break; - case OPAL_ERR_PROC_ABORTING: - retval = "Process is aborting"; - break; - case OPAL_ERR_NODE_DOWN: - retval = "Node has gone down"; - break; - case OPAL_ERR_NODE_OFFLINE: - retval = "Node has gone offline"; - break; - case OPAL_ERR_JOB_TERMINATED: - retval = "Job terminated"; - break; - case OPAL_ERR_PROC_RESTART: - retval = "Process restarted"; - break; - case OPAL_ERR_PROC_CHECKPOINT: - retval = "Process checkpoint"; - break; - case OPAL_ERR_PROC_MIGRATE: - retval = "Process migrate"; - break; - case OPAL_ERR_EVENT_REGISTRATION: - retval = "Event registration"; - break; - case OPAL_ERR_HEARTBEAT_ALERT: - retval = "Heartbeat not received"; - break; - case OPAL_ERR_FILE_ALERT: - retval = "File alert - proc may have stalled"; - break; - case OPAL_ERR_MODEL_DECLARED: - retval = "Model declared"; - break; - case OPAL_PMIX_LAUNCH_DIRECTIVE: - retval = "Launch directive"; - break; - - default: - retval = "UNRECOGNIZED"; - } - - *errmsg = retval; - return OPAL_SUCCESS; -} +/* Defined in opal_util_init.c and part of the opal_util.la library */ +int opal_init_error(const char *error, int ret); int opal_init_psm(void) { @@ -444,172 +118,6 @@ int opal_init_psm(void) return 
OPAL_SUCCESS; } -static int opal_init_error(const char *error, int ret) -{ - if (OPAL_ERR_SILENT != ret) { - opal_show_help("help-opal-runtime.txt", "opal_init:startup:internal-failure", true, error, - ret); - } - return ret; -} - -static mca_base_framework_t *opal_init_util_frameworks[] = { - &opal_installdirs_base_framework, - &opal_if_base_framework, - NULL, -}; - -int opal_init_util(int *pargc, char ***pargv) -{ - int ret; - char *error = NULL; - OPAL_TIMING_ENV_INIT(otmng); - - if (opal_util_initialized != 0) { - if (opal_util_initialized < 0) { - return OPAL_ERROR; - } - ++opal_util_initialized; - return OPAL_SUCCESS; - } - - OBJ_CONSTRUCT(&opal_init_util_domain, opal_finalize_domain_t); - (void) opal_finalize_domain_init(&opal_init_util_domain, "opal_init_util"); - opal_finalize_set_domain(&opal_init_util_domain); - - opal_thread_set_main(); - - opal_init_called = true; - - /* set the nodename right away so anyone who needs it has it. Note - * that we don't bother with fqdn and prefix issues here - we let - * the RTE later replace this with a modified name if the user - * requests it */ - ret = opal_init_gethostname(); - if (OPAL_SUCCESS != ret) { - fprintf(stderr, - "opal_init_gethostname() failed -- process will likely abort (%s:%d, returned %d " - "instead of OPAL_SUCCESS)\n", - __FILE__, __LINE__, ret); - return ret; - } - - /* initialize the memory allocator */ - opal_malloc_init(); - - OPAL_TIMING_ENV_NEXT(otmng, "opal_malloc_init"); - - /* initialize the output system */ - opal_output_init(); - - /* initialize install dirs code */ - if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_installdirs_base_framework, 0))) { - fprintf(stderr, - "opal_installdirs_base_open() failed -- process will likely abort (%s:%d, returned " - "%d instead of OPAL_SUCCESS)\n", - __FILE__, __LINE__, ret); - return ret; - } - - /* initialize the help system */ - opal_show_help_init(); - - OPAL_TIMING_ENV_NEXT(otmng, "opal_show_help_init"); - - /* register handler for 
errnum -> string converstion */ - if (OPAL_SUCCESS - != (ret = opal_error_register("OPAL", OPAL_ERR_BASE, OPAL_ERR_MAX, opal_err2str))) { - return opal_init_error("opal_error_register", ret); - } - - /* keyval lex-based parser */ - if (OPAL_SUCCESS != (ret = opal_util_keyval_parse_init())) { - return opal_init_error("opal_util_keyval_parse_init", ret); - } - - // Disable PSM signal hijacking (see comment in function for more - // details) - opal_init_psm(); - - OPAL_TIMING_ENV_NEXT(otmng, "opal_init_psm"); - - /* Setup the parameter system */ - if (OPAL_SUCCESS != (ret = mca_base_var_init())) { - return opal_init_error("mca_base_var_init", ret); - } - OPAL_TIMING_ENV_NEXT(otmng, "opal_var_init"); - - /* read any param files that were provided */ - if (OPAL_SUCCESS != (ret = mca_base_var_cache_files(false))) { - return opal_init_error("failed to cache files", ret); - } - - OPAL_TIMING_ENV_NEXT(otmng, "opal_var_cache"); - - /* register params for opal */ - if (OPAL_SUCCESS != (ret = opal_register_params())) { - return opal_init_error("opal_register_params", ret); - } - - if (OPAL_SUCCESS != (ret = opal_net_init())) { - return opal_init_error("opal_net_init", ret); - } - - OPAL_TIMING_ENV_NEXT(otmng, "opal_net_init"); - - /* pretty-print stack handlers */ - if (OPAL_SUCCESS != (ret = opal_util_register_stackhandlers())) { - return opal_init_error("opal_util_register_stackhandlers", ret); - } - - /* set system resource limits - internally protected against - * doing so twice in cases where the launch agent did it for us - */ - if (OPAL_SUCCESS != (ret = opal_util_init_sys_limits(&error))) { - opal_show_help("help-opal-runtime.txt", "opal_init:syslimit", false, error); - return OPAL_ERR_SILENT; - } - - /* initialize the arch string */ - if (OPAL_SUCCESS != (ret = opal_arch_init())) { - return opal_init_error("opal_arch_init", ret); - } - - OPAL_TIMING_ENV_NEXT(otmng, "opal_arch_init"); - - /* initialize the datatype engine */ - if (OPAL_SUCCESS != (ret = 
opal_datatype_init())) { - return opal_init_error("opal_datatype_init", ret); - } - - OPAL_TIMING_ENV_NEXT(otmng, "opal_datatype_init"); - - /* initialize the mca */ - if (OPAL_SUCCESS != (ret = mca_base_open())) { - return opal_init_error("mca_base_open", ret); - } - - OPAL_TIMING_ENV_NEXT(otmng, "mca_base_open"); - - /* initialize if framework */ - if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_if_base_framework, 0))) { - fprintf(stderr, - "opal_if_base_open() failed -- process will likely abort (%s:%d, returned %d " - "instead of OPAL_SUCCESS)\n", - __FILE__, __LINE__, ret); - return ret; - } - - /* register for */ - opal_finalize_register_cleanup_arg(mca_base_framework_close_list, opal_init_util_frameworks); - - OPAL_TIMING_ENV_NEXT(otmng, "opal_if_init"); - - ++opal_util_initialized; - - return OPAL_SUCCESS; -} - /* the memcpy component should be one of the first who get * loaded in order to make sure we have all the available * versions of memcpy correctly configured. @@ -640,6 +148,12 @@ int opal_init(int *pargc, char ***pargv) return ret; } + // Disable PSM signal hijacking (see comment in function for more + // details) + opal_init_psm(); + + OPAL_TIMING_ENV_NEXT(otmng, "opal_init_psm"); + OBJ_CONSTRUCT(&opal_init_domain, opal_finalize_domain_t); (void) opal_finalize_domain_init(&opal_init_domain, "opal_init"); opal_finalize_set_domain(&opal_init_domain); diff --git a/opal/runtime/opal_util_finalize.c b/opal/runtime/opal_util_finalize.c new file mode 100644 index 00000000000..c7516477d44 --- /dev/null +++ b/opal/runtime/opal_util_finalize.c @@ -0,0 +1,154 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2022 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010-2015 Los Alamos National Security, LLC. + * All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2016-2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 Amazon.com, Inc. or its affiliates. + * All Rights reserved. + * Copyright (c) 2018 Triad National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** @file **/ + +#include "opal_config.h" + +#include "opal/class/opal_object.h" +#include "opal/constants.h" +#include "opal/runtime/opal.h" +#include "opal/util/output.h" +#include "opal/util/proc.h" + +extern int opal_util_initialized; + +static opal_mutex_t opal_finalize_cleanup_fns_lock = OPAL_MUTEX_STATIC_INIT; +opal_list_t opal_finalize_cleanup_fns = {{0}}; + +struct opal_cleanup_fn_item_t { + opal_list_item_t super; + opal_cleanup_fn_t cleanup_fn; + void *user_data; +#if OPAL_ENABLE_DEBUG + char *cleanup_fn_name; +#endif +}; + +typedef struct opal_cleanup_fn_item_t opal_cleanup_fn_item_t; +OBJ_CLASS_DECLARATION(opal_cleanup_fn_item_t); + +static void opal_cleanup_fn_item_construct(opal_cleanup_fn_item_t *item) +{ +#if OPAL_ENABLE_DEBUG + item->cleanup_fn_name = NULL; +#endif +} + +static void opal_cleanup_fn_item_destruct(opal_cleanup_fn_item_t *item) +{ +#if OPAL_ENABLE_DEBUG + free(item->cleanup_fn_name); + item->cleanup_fn_name = NULL; +#endif +} + +OBJ_CLASS_INSTANCE(opal_cleanup_fn_item_t, opal_list_item_t, opal_cleanup_fn_item_construct, + opal_cleanup_fn_item_destruct); + +static void opal_finalize_domain_construct(opal_finalize_domain_t *domain) +{ + 
domain->domain_name = NULL;
+}
+
+static void opal_finalize_domain_destruct(opal_finalize_domain_t *domain)
+{
+    free(domain->domain_name);
+    domain->domain_name = NULL;
+}
+
+OBJ_CLASS_INSTANCE(opal_finalize_domain_t, opal_list_t, opal_finalize_domain_construct,
+                   opal_finalize_domain_destruct);
+
+static opal_finalize_domain_t *current_finalize_domain;
+opal_finalize_domain_t opal_init_util_domain = {{{0}}};
+opal_finalize_domain_t opal_init_domain = {{{0}}};
+
+void opal_finalize_append_cleanup(opal_cleanup_fn_t cleanup_fn, const char *fn_name,
+                                  void *user_data)
+{
+    opal_cleanup_fn_item_t *cleanup_item = OBJ_NEW(opal_cleanup_fn_item_t);
+    assert(NULL != cleanup_item);
+    cleanup_item->cleanup_fn = cleanup_fn;
+    cleanup_item->user_data = user_data;
+#if OPAL_ENABLE_DEBUG
+    cleanup_item->cleanup_fn_name = strdup(fn_name);
+    assert(NULL != cleanup_item->cleanup_fn_name);
+#else
+    (void) fn_name;
+#endif
+
+    opal_mutex_lock(&opal_finalize_cleanup_fns_lock);
+    opal_list_append(&current_finalize_domain->super, &cleanup_item->super);
+    opal_mutex_unlock(&opal_finalize_cleanup_fns_lock);
+}
+
+void opal_finalize_domain_init(opal_finalize_domain_t *domain, const char *domain_name)
+{
+    free(domain->domain_name);
+    domain->domain_name = domain_name ?
strdup(domain_name) : NULL; +} + +void opal_finalize_set_domain(opal_finalize_domain_t *domain) +{ + current_finalize_domain = domain; +} + +void opal_finalize_cleanup_domain(opal_finalize_domain_t *domain) +{ + opal_cleanup_fn_item_t *cleanup_item, *next; + /* call any registered cleanup functions before tearing down OPAL */ + OPAL_LIST_FOREACH_SAFE_REV (cleanup_item, next, &domain->super, opal_cleanup_fn_item_t) { + cleanup_item->cleanup_fn(cleanup_item->user_data); + opal_list_remove_item(&domain->super, &cleanup_item->super); + OBJ_RELEASE(cleanup_item); + } +} + +int opal_finalize_util(void) +{ + if (--opal_util_initialized != 0) { + if (opal_util_initialized < 0) { + return OPAL_ERROR; + } + return OPAL_SUCCESS; + } + + opal_finalize_cleanup_domain(&opal_init_util_domain); + OBJ_DESTRUCT(&opal_init_util_domain); + + /* finalize the class/object system */ + opal_class_finalize(); + + free(opal_process_info.nodename); + opal_process_info.nodename = NULL; + + return OPAL_SUCCESS; +} + diff --git a/opal/runtime/opal_util_init.c b/opal/runtime/opal_util_init.c new file mode 100644 index 00000000000..0e8736931ce --- /dev/null +++ b/opal/runtime/opal_util_init.c @@ -0,0 +1,574 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2022 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007-2020 Cisco Systems, Inc. All rights reserved + * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. 
+ * Copyright (c) 2010-2015 Los Alamos National Security, LLC.
+ *                         All rights reserved.
+ * Copyright (c) 2013-2019 Intel, Inc. All rights reserved.
+ * Copyright (c) 2015-2017 Research Organization for Information Science
+ *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates.
+ *                         All Rights reserved.
+ * Copyright (c) 2018 Mellanox Technologies, Inc.
+ *                         All rights reserved.
+ * Copyright (c) 2018-2019 Triad National Security, LLC. All rights
+ *                         reserved.
+ * Copyright (c) 2020 FUJITSU LIMITED. All rights reserved.
+ * Copyright (c) 2021 Nanook Consulting. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/** @file **/
+
+/* The configuration header goes first: HAVE_UNISTD_H is a configure-time
+ * feature macro (presumably defined by opal_config.h), so testing it before
+ * this include would always see it undefined. */
+#include "opal/include/opal_config.h"
+
+/* errno/EINVAL, fprintf, calloc/realloc/free and strlen are used directly
+ * in this file */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef HAVE_UNISTD_H
+#    include <unistd.h>
+#endif
+
+#include "opal/datatype/opal_datatype.h"
+#include "opal/mca/base/base.h"
+#include "opal/mca/base/mca_base_var.h"
+#include "opal/mca/hwloc/base/base.h"
+#include "opal/mca/if/base/base.h"
+#include "opal/mca/installdirs/base/base.h"
+#include "opal/mca/memchecker/base/base.h"
+#include "opal/mca/memcpy/base/base.h"
+#include "opal/mca/memory/base/base.h"
+#include "opal/mca/patcher/base/base.h"
+#include "opal/mca/pmix/base/base.h"
+#include "opal/mca/reachable/base/base.h"
+#include "opal/mca/shmem/base/base.h"
+#include "opal/mca/smsc/base/base.h"
+#include "opal/mca/threads/threads.h"
+#include "opal/mca/threads/tsd.h"
+#include "opal/mca/timer/base/base.h"
+#include "opal/memoryhooks/memory.h"
+#include "opal/runtime/opal.h"
+#include "opal/util/arch.h"
+#include "opal/util/malloc.h"
+#include "opal/util/net.h"
+#include "opal/util/output.h"
+#include "opal/util/proc.h"
+#include "opal/util/show_help.h"
+
+#include "opal/mca/backtrace/base/base.h"
+#include "opal/mca/threads/base/base.h"
+#include "opal/runtime/opal_progress.h"
+#include "opal/util/opal_environ.h"
+
+#include "opal/constants.h"
+#include "opal/util/error.h"
+#include "opal/util/event.h"
+#include "opal/util/keyval_parse.h"
+#include "opal/util/stacktrace.h"
+#include "opal/util/sys_limits.h"
+#include "opal/util/timings.h"
+
+/* reference count of opal_init_util() calls; decremented by
+ * opal_finalize_util() */
+int opal_util_initialized = 0;
+bool opal_warn_on_fork = true;
+
+int opal_init_error(const char *error, int ret);
+
+/**
+ * Map an OPAL status code to a constant message string.
+ *
+ * @param errnum  OPAL status code to translate
+ * @param errmsg  receives the message: NULL for OPAL_ERR_SILENT,
+ *                "UNRECOGNIZED" for any code not listed below
+ *
+ * Registered with opal_error_register() by opal_init_util().
+ */
+static int opal_err2str(int errnum, const char **errmsg)
+{
+    const char *retval;
+
+    switch (errnum) {
+    case OPAL_SUCCESS:
+        retval = "Success";
+        break;
+    case OPAL_ERROR:
+        retval = "Error";
+        break;
+    case OPAL_ERR_OUT_OF_RESOURCE:
+        retval = "Out of resource";
+        break;
+    case OPAL_ERR_TEMP_OUT_OF_RESOURCE:
+        retval = "Temporarily out of resource";
+        break;
+    case OPAL_ERR_RESOURCE_BUSY:
+        retval = "Resource busy";
+        break;
+    case OPAL_ERR_BAD_PARAM:
+        retval = "Bad parameter";
+        break;
+    case OPAL_ERR_FATAL:
+        retval = "Fatal";
+        break;
+    case OPAL_ERR_NOT_IMPLEMENTED:
+        retval = "Not implemented";
+        break;
+    case OPAL_ERR_NOT_SUPPORTED:
+        retval = "Not supported";
+        break;
+    case OPAL_ERR_INTERRUPTED:
+        retval = "Interrupted";
+        break;
+    case OPAL_ERR_WOULD_BLOCK:
+        retval = "Would block";
+        break;
+    case OPAL_ERR_IN_ERRNO:
+        retval = "In errno";
+        break;
+    case OPAL_ERR_UNREACH:
+        retval = "Unreachable";
+        break;
+    case OPAL_ERR_NOT_FOUND:
+        retval = "Not found";
+        break;
+    case OPAL_EXISTS:
+        retval = "Exists";
+        break;
+    case OPAL_ERR_TIMEOUT:
+        retval = "Timeout";
+        break;
+    case OPAL_ERR_NOT_AVAILABLE:
+        retval = "Not available";
+        break;
+    case OPAL_ERR_PERM:
+        retval = "No permission";
+        break;
+    case OPAL_ERR_VALUE_OUT_OF_BOUNDS:
+        retval = "Value out of bounds";
+        break;
+    case OPAL_ERR_FILE_READ_FAILURE:
+        retval = "File read failure";
+        break;
+    case OPAL_ERR_FILE_WRITE_FAILURE:
+        retval = "File write failure";
+        break;
+    case OPAL_ERR_FILE_OPEN_FAILURE:
+        retval = "File open failure";
+        break;
+    case OPAL_ERR_PACK_MISMATCH:
+        retval = "Pack data mismatch";
+        break;
+    case OPAL_ERR_PACK_FAILURE:
+        retval = "Data pack failed";
+        break;
+    case OPAL_ERR_UNPACK_FAILURE:
+        retval = "Data unpack failed";
+        break;
+    case OPAL_ERR_UNPACK_INADEQUATE_SPACE:
+        retval = "Data unpack had inadequate space";
+        break;
+    case OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER:
+        retval = "Data unpack would read past end of buffer";
+        break;
+    case OPAL_ERR_OPERATION_UNSUPPORTED:
+        retval = "Requested operation is not supported on referenced data type";
+        break;
+    case OPAL_ERR_UNKNOWN_DATA_TYPE:
+        retval = "Unknown data type";
+        break;
+    case OPAL_ERR_BUFFER:
+        retval = "Buffer type (described vs non-described) mismatch - operation not allowed";
+        break;
+    case OPAL_ERR_DATA_TYPE_REDEF:
+        retval = "Attempt to redefine an existing data type";
+        break;
+    case OPAL_ERR_DATA_OVERWRITE_ATTEMPT:
+        retval = "Attempt to overwrite a data value";
+        break;
+    case OPAL_ERR_MODULE_NOT_FOUND:
+        retval = "Framework requires at least one active module, but none found";
+        break;
+    case OPAL_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
+        retval = "OS topology does not support slot_list process affinity";
+        break;
+    case OPAL_ERR_TOPO_SOCKET_NOT_SUPPORTED:
+        retval = "Could not obtain socket topology information";
+        break;
+    case OPAL_ERR_TOPO_CORE_NOT_SUPPORTED:
+        retval = "Could not obtain core topology information";
+        break;
+    case OPAL_ERR_NOT_ENOUGH_SOCKETS:
+        retval = "Not enough sockets to meet request";
+        break;
+    case OPAL_ERR_NOT_ENOUGH_CORES:
+        retval = "Not enough cores to meet request";
+        break;
+    case OPAL_ERR_INVALID_PHYS_CPU:
+        retval = "Invalid physical cpu number returned";
+        break;
+    case OPAL_ERR_MULTIPLE_AFFINITIES:
+        retval = "Multiple methods for assigning process affinity were specified";
+        break;
+    case OPAL_ERR_SLOT_LIST_RANGE:
+        retval = "Provided slot_list range is invalid";
+        break;
+    case OPAL_ERR_NETWORK_NOT_PARSEABLE:
+        retval = "Provided network specification is not parseable";
+        break;
+    case OPAL_ERR_SILENT:
+        retval = NULL;
+        break;
+    case OPAL_ERR_NOT_INITIALIZED:
+        retval = "Not initialized";
+        break;
+    case OPAL_ERR_NOT_BOUND:
+        retval = "Not bound";
+        break;
+    case OPAL_ERR_TAKE_NEXT_OPTION:
+        retval = "Take next option";
+        break;
+    case OPAL_ERR_PROC_ENTRY_NOT_FOUND:
+        retval = "Database entry not found";
+        break;
+    case OPAL_ERR_DATA_VALUE_NOT_FOUND:
+        retval = "Data for specified key not found";
+        break;
+    case OPAL_ERR_CONNECTION_FAILED:
+        retval = "Connection failed";
+        break;
+    case OPAL_ERR_AUTHENTICATION_FAILED:
+        retval = "Authentication failed";
+        break;
+    case OPAL_ERR_COMM_FAILURE:
+        retval = "Comm failure";
+        break;
+    case OPAL_ERR_SERVER_NOT_AVAIL:
+        retval = "Server not available";
+        break;
+    case OPAL_ERR_IN_PROCESS:
+        retval = "Operation in process";
+        break;
+    case OPAL_ERR_DEBUGGER_RELEASE:
+        retval = "Release debugger";
+        break;
+    case OPAL_ERR_HANDLERS_COMPLETE:
+        retval = "Event handlers complete";
+        break;
+    case OPAL_ERR_PARTIAL_SUCCESS:
+        retval = "Partial success";
+        break;
+    case OPAL_ERR_PROC_ABORTED:
+        retval = "Process abnormally terminated";
+        break;
+    case OPAL_ERR_PROC_REQUESTED_ABORT:
+        retval = "Process requested abort";
+        break;
+    case OPAL_ERR_PROC_ABORTING:
+        retval = "Process is aborting";
+        break;
+    case OPAL_ERR_NODE_DOWN:
+        retval = "Node has gone down";
+        break;
+    case OPAL_ERR_NODE_OFFLINE:
+        retval = "Node has gone offline";
+        break;
+    case OPAL_ERR_JOB_TERMINATED:
+        retval = "Job terminated";
+        break;
+    case OPAL_ERR_PROC_RESTART:
+        retval = "Process restarted";
+        break;
+    case OPAL_ERR_PROC_CHECKPOINT:
+        retval = "Process checkpoint";
+        break;
+    case OPAL_ERR_PROC_MIGRATE:
+        retval = "Process migrate";
+        break;
+    case OPAL_ERR_EVENT_REGISTRATION:
+        retval = "Event registration";
+        break;
+    case OPAL_ERR_HEARTBEAT_ALERT:
+        retval = "Heartbeat not received";
+        break;
+    case OPAL_ERR_FILE_ALERT:
+        retval = "File alert - proc may have stalled";
+        break;
+    case OPAL_ERR_MODEL_DECLARED:
+        retval = "Model declared";
+        break;
+    case OPAL_PMIX_LAUNCH_DIRECTIVE:
+        retval = "Launch directive";
+        break;
+
+    default:
+        retval = "UNRECOGNIZED";
+    }
+
+    *errmsg = retval;
+    return OPAL_SUCCESS;
+}
+
+/**
+ * Report an initialization failure and hand the error code back.
+ *
+ * @param error  text passed to the help message (callers pass the name of
+ *               the step that failed)
+ * @param ret    the failing status code
+ * @return ret, unchanged. Nothing is printed when ret is OPAL_ERR_SILENT.
+ */
+int opal_init_error(const char *error, int ret)
+{
+    if (OPAL_ERR_SILENT != ret) {
+        opal_show_help("help-opal-runtime.txt", "opal_init:startup:internal-failure", true, error,
+                       ret);
+    }
+    return ret;
+}
+
+/* If there is a preprocessor macro that redefined the call to
+ * gethostname, we undefine that here */
+#ifdef gethostname
+#    undef gethostname
+#endif
+
+#define NUM_TRIES_FOR_NULL_HOSTNAME 8
+
+extern int opal_init_error(const char *error, int ret);
+
+/*
+ * This gethostname wrapper does not return the full-length hostname in
+ * those rare cases where it is too long for the buffer. It does, however,
+ * guarantee a null-terminated hostname is returned, even if it's
+ * truncated. It also tries again in the case where gethostname returns an
+ * error because the buffer is initially too short.
+ *
+ * On success the heap buffer is stored in opal_process_info.nodename (it is
+ * released by opal_finalize_util()) and OPAL_SUCCESS is returned. Otherwise:
+ *   OPAL_ERR_OUT_OF_RESOURCE  an allocation failed
+ *   OPAL_ERR_IN_ERRNO         gethostname failed for a reason other than a
+ *                             too-small buffer
+ *   OPAL_ERR_NOT_FOUND        still no usable name after
+ *                             NUM_TRIES_FOR_NULL_HOSTNAME attempts
+ * No buffer is leaked on any failure path.
+ */
+int opal_init_gethostname(void)
+{
+    size_t count, length = OPAL_LOCAL_MAXHOSTNAMELEN;
+    int ret_val, num_tries = 0;
+
+    char *buf = calloc(1, length);
+    if (NULL == buf) {
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    while (num_tries < NUM_TRIES_FOR_NULL_HOSTNAME) {
+        ++num_tries;
+
+        /*
+         * Offer all but the last byte of the buffer to gethostname.
+         */
+        ret_val = gethostname(buf, length - 1);
+        /*
+         * Terminate the buffer in the last position.
+         */
+        buf[length - 1] = '\0';
+        if (0 == ret_val) {
+            count = strlen(buf);
+            /* The result was not truncated */
+            if (count > 0 && count < length - 1) {
+                /*
+                 * If we got a good result, save it. This value may
+                 * be longer than what callers to opal_gethostname()
+                 * are expecting, so that should be checked by the
+                 * caller.
+                 */
+                opal_process_info.nodename = buf;
+                return OPAL_SUCCESS;
+            }
+            /*
+             * "Good" cases:
+             *
+             * 0 == count: The buffer is empty. In some gethostname
+             * implementations, this can be because the
+             * buffer was too small.
+             * (length-1) == count: The result *may* be truncated.
+             *
+             * If it's one of these cases, we'll fall through to
+             * increase the length of the buffer and try again.
+             *
+             * If it's not one of these good cases, it's an error:
+             * return.
+             */
+            else if (!(0 == count || count == length - 1)) {
+                free(buf);
+                return OPAL_ERR_IN_ERRNO;
+            }
+        }
+        /*
+         * "Good" cases:
+         *
+         * errno == EINVAL or ENAMETOOLONG: hostname was truncated and
+         * there was an error. Perhaps there is something
+         * in the buffer and perhaps not.
+         *
+         * If it's one of these cases, we'll fall through to
+         * increase the length of the buffer and try again.
+         *
+         * If it's not one of these good cases, it's an error: return.
+         */
+        else if (!(EINVAL == errno || ENAMETOOLONG == errno)) {
+            free(buf);
+            return OPAL_ERR_IN_ERRNO;
+        }
+
+        /*
+         * If we get here, it means we want to double the length of
+         * the buffer and try again.
+         */
+        length *= 2;
+        /* Keep the old pointer until realloc succeeds: on failure realloc
+         * leaves the original block allocated, so it must be freed here
+         * rather than overwritten with NULL. */
+        char *bigger = realloc(buf, length);
+        if (NULL == bigger) {
+            free(buf);
+            return OPAL_ERR_OUT_OF_RESOURCE;
+        }
+        buf = bigger;
+    } /* end while */
+
+    /* If we got here, it means that we tried too many times and are
+     * giving up. */
+    free(buf);
+    return OPAL_ERR_NOT_FOUND;
+}
+
+/* frameworks opened by opal_init_util(); NULL-terminated */
+static mca_base_framework_t *opal_init_util_frameworks[] = {
+    &opal_installdirs_base_framework,
+    &opal_if_base_framework,
+    NULL,
+};
+
+/**
+ * Initialize the OPAL utility layer. The call is reference counted: when
+ * the layer is already up, only opal_util_initialized is incremented.
+ * A negative counter yields OPAL_ERROR.
+ */
+int opal_init_util(int *pargc, char ***pargv)
+{
+    int ret;
+    char *error = NULL;
+    OPAL_TIMING_ENV_INIT(otmng);
+
+    if (opal_util_initialized != 0) {
+        if (opal_util_initialized < 0) {
+            return OPAL_ERROR;
+        }
+        ++opal_util_initialized;
+        return OPAL_SUCCESS;
+    }
+
+    OBJ_CONSTRUCT(&opal_init_util_domain, opal_finalize_domain_t);
+    (void) opal_finalize_domain_init(&opal_init_util_domain, "opal_init_util");
+    opal_finalize_set_domain(&opal_init_util_domain);
+
+    opal_thread_set_main();
+
+    /* set the nodename right away so anyone who needs it has it.
Note
+     * that we don't bother with fqdn and prefix issues here - we let
+     * the RTE later replace this with a modified name if the user
+     * requests it */
+    ret = opal_init_gethostname();
+    if (OPAL_SUCCESS != ret) {
+        /* reported with fprintf: the help system is only initialized
+         * further down */
+        fprintf(stderr,
+                "opal_init_gethostname() failed -- process will likely abort (%s:%d, returned %d "
+                "instead of OPAL_SUCCESS)\n",
+                __FILE__, __LINE__, ret);
+        return ret;
+    }
+
+    /* initialize the memory allocator */
+    opal_malloc_init();
+
+    OPAL_TIMING_ENV_NEXT(otmng, "opal_malloc_init");
+
+    /* initialize the output system */
+    opal_output_init();
+
+    /* initialize install dirs code */
+    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_installdirs_base_framework, 0))) {
+        fprintf(stderr,
+                "opal_installdirs_base_open() failed -- process will likely abort (%s:%d, returned "
+                "%d instead of OPAL_SUCCESS)\n",
+                __FILE__, __LINE__, ret);
+        return ret;
+    }
+
+    /* initialize the help system */
+    opal_show_help_init();
+
+    OPAL_TIMING_ENV_NEXT(otmng, "opal_show_help_init");
+
+    /* register handler for errnum -> string conversion */
+    if (OPAL_SUCCESS
+        != (ret = opal_error_register("OPAL", OPAL_ERR_BASE, OPAL_ERR_MAX, opal_err2str))) {
+        return opal_init_error("opal_error_register", ret);
+    }
+
+    /* keyval lex-based parser */
+    if (OPAL_SUCCESS != (ret = opal_util_keyval_parse_init())) {
+        return opal_init_error("opal_util_keyval_parse_init", ret);
+    }
+
+    /* Setup the parameter system */
+    if (OPAL_SUCCESS != (ret = mca_base_var_init())) {
+        return opal_init_error("mca_base_var_init", ret);
+    }
+    OPAL_TIMING_ENV_NEXT(otmng, "opal_var_init");
+
+    /* read any param files that were provided */
+    if (OPAL_SUCCESS != (ret = mca_base_var_cache_files(false))) {
+        return opal_init_error("failed to cache files", ret);
+    }
+
+    OPAL_TIMING_ENV_NEXT(otmng, "opal_var_cache");
+
+    /* register params for opal */
+    if (OPAL_SUCCESS != (ret = opal_register_params())) {
+        return opal_init_error("opal_register_params", ret);
+    }
+
+    /* initialize the network utilities */
+    if (OPAL_SUCCESS != (ret = opal_net_init())) {
+        return opal_init_error("opal_net_init", ret);
+    }
+
+    OPAL_TIMING_ENV_NEXT(otmng, "opal_net_init");
+
+    /* pretty-print stack handlers */
+    if (OPAL_SUCCESS != (ret = opal_util_register_stackhandlers())) {
+        return opal_init_error("opal_util_register_stackhandlers", ret);
+    }
+
+    /* set system resource limits - internally protected against
+     * doing so twice in cases where the launch agent did it for us
+     */
+    if (OPAL_SUCCESS != (ret = opal_util_init_sys_limits(&error))) {
+        /* the dedicated help message is shown here, so return
+         * OPAL_ERR_SILENT instead of ret */
+        opal_show_help("help-opal-runtime.txt", "opal_init:syslimit", false, error);
+        return OPAL_ERR_SILENT;
+    }
+
+    /* initialize the arch string */
+    if (OPAL_SUCCESS != (ret = opal_arch_init())) {
+        return opal_init_error("opal_arch_init", ret);
+    }
+
+    OPAL_TIMING_ENV_NEXT(otmng, "opal_arch_init");
+
+    /* initialize the datatype engine */
+    if (OPAL_SUCCESS != (ret = opal_datatype_init())) {
+        return opal_init_error("opal_datatype_init", ret);
+    }
+
+    OPAL_TIMING_ENV_NEXT(otmng, "opal_datatype_init");
+
+    /* initialize the mca */
+    if (OPAL_SUCCESS != (ret = mca_base_open())) {
+        return opal_init_error("mca_base_open", ret);
+    }
+
+    OPAL_TIMING_ENV_NEXT(otmng, "mca_base_open");
+
+    /* initialize if framework */
+    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_if_base_framework, 0))) {
+        fprintf(stderr,
+                "opal_if_base_open() failed -- process will likely abort (%s:%d, returned %d "
+                "instead of OPAL_SUCCESS)\n",
+                __FILE__, __LINE__, ret);
+        return ret;
+    }
+
+    /* register mca_base_framework_close_list as a cleanup callback, with
+     * the list of frameworks opened above as its argument */
+    opal_finalize_register_cleanup_arg(mca_base_framework_close_list, opal_init_util_frameworks);
+
+    OPAL_TIMING_ENV_NEXT(otmng, "opal_if_init");
+
+    /* first successful initialization: take the initial reference */
+    ++opal_util_initialized;
+
+    return OPAL_SUCCESS;
+}
+
+
diff --git a/opal/tools/wrappers/Makefile.am b/opal/tools/wrappers/Makefile.am
index a03ff2e6ecd..9a8f0d7de13 100644
--- a/opal/tools/wrappers/Makefile.am
+++ b/opal/tools/wrappers/Makefile.am
@@ -2,7 +2,7 @@
 # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 #
University Research and Technology # Corporation. All rights reserved. -# Copyright (c) 2004-2006 The University of Tennessee and The University +# Copyright (c) 2004-2022 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, @@ -59,7 +59,7 @@ endif # OPAL_INSTALL_BINARIES endif # OPAL_WANT_SCRIPT_WRAPPER_COMPILERS opal_wrapper_SOURCES = opal_wrapper.c -opal_wrapper_LDADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la +opal_wrapper_LDADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@_util.la # Ensure that the man pages are rebuilt if the opal_config.h file # changes; a "good enough" way to know if configure was run again (and diff --git a/opal/util/output.c b/opal/util/output.c index b9ae8ae6bca..e970c4e8fd1 100644 --- a/opal/util/output.c +++ b/opal/util/output.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2021 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -782,9 +782,13 @@ static int open_file(int i) free(filename); /* release the filename in all cases */ return OPAL_ERR_IN_ERRNO; } - +#if 0 + /* + * GB: I do question the need to declare our output files with pmix ?!?! 
+     */
     /* register it to be ignored */
     opal_pmix_register_cleanup(filename, false, true, false);
+#endif
     free(filename); /* release the filename in all cases */
 }
 
diff --git a/opal/util/proc.c b/opal/util/proc.c
index c309e1a7f98..628a90f399f 100644
--- a/opal/util/proc.c
+++ b/opal/util/proc.c
@@ -1,6 +1,6 @@
 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
- * Copyright (c) 2013 The University of Tennessee and The University
+ * Copyright (c) 2013-2022 The University of Tennessee and The University
  *                         of Tennessee Research Foundation. All rights
  *                         reserved.
  * Copyright (c) 2013 Inria. All rights reserved.
@@ -185,6 +185,28 @@ static struct opal_proc_t *opal_proc_for_name_should_never_be_called(opal_proces
     return NULL;
 }
 
+/**
+ * Return a strdup'ed hostname for the target proc. This default only knows
+ * the local proc (opal_process_info.nodename); anything else, including a
+ * NULL proc or an unset nodename, yields "unknown". Once an RTE is initialized
+ * it must replace opal_get_proc_hostname with a distributed-aware function.
+ */
+static char*
+opal_get_proc_hostname_local_only(const opal_proc_t *proc)
+{
+    /* if the proc is NULL, then we can't know */
+    if (NULL == proc) {
+        return strdup("unknown");
+    }
+
+    /* only our own hostname is known here; nodename is reset to NULL by
+     * opal_finalize_util(), and strdup(NULL) is undefined */
+    if (proc == opal_proc_my_name && NULL != opal_process_info.nodename) {
+        return strdup(opal_process_info.nodename);
+    }
+    return strdup("unknown");
+}
+
 char *(*opal_process_name_print)(const opal_process_name_t)
     = opal_process_name_print_should_never_be_called;
 char *(*opal_vpid_print)(const opal_vpid_t) = opal_vpid_print_should_never_be_called;
@@ -199,30 +221,6 @@ int (*opal_convert_string_to_jobid)(opal_jobid_t *jobid, const char *jobid_strin
     = opal_convert_string_to_jobid_should_never_be_called;
 struct opal_proc_t *(*opal_proc_for_name)(const opal_process_name_t name)
     = opal_proc_for_name_should_never_be_called;
+char *(*opal_get_proc_hostname)(const opal_proc_t *proc)
+    = opal_get_proc_hostname_local_only;
 
-char
*opal_get_proc_hostname(const opal_proc_t *proc) -{ - int ret; - char *hostname; - - /* if the proc is NULL, then we can't know */ - if (NULL == proc) { - return strdup("unknown"); - } - - /* if it is my own hostname we are after, then just hand back - * the value in opal_process_info */ - if (proc == opal_proc_my_name) { - return strdup(opal_process_info.nodename); - } - - /* if we don't already have it, then try to get it */ - OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &proc->proc_name, (char **) &hostname, - PMIX_STRING); - if (OPAL_SUCCESS != ret) { - return strdup("unknown"); // return something so the caller doesn't segfault - } - - /* user is not allowed to release the data */ - return hostname; -} diff --git a/opal/util/proc.h b/opal/util/proc.h index 433735e50f3..0db2938e2e0 100644 --- a/opal/util/proc.h +++ b/opal/util/proc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The University of Tennessee and The University + * Copyright (c) 2013-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2013 Inria. All rights reserved. @@ -171,6 +171,6 @@ OPAL_DECLSPEC extern struct opal_proc_t *(*opal_proc_for_name)(const opal_proces * our own. This is to be used by all BTLs so we don't retrieve hostnames * unless needed. The returned value MUST NOT be free'd as it is * owned by the proc_t */ -OPAL_DECLSPEC char *opal_get_proc_hostname(const opal_proc_t *proc); +OPAL_DECLSPEC extern char *(*opal_get_proc_hostname)(const opal_proc_t *proc); #endif /* OPAL_PROC_H */