From a9c3226b6549f9a5f2093e47646f10a6506c47f3 Mon Sep 17 00:00:00 2001 From: Nathan Weeks Date: Fri, 4 Jan 2019 09:30:21 -0600 Subject: [PATCH 1/3] Replace MPI_Errhandler_set() with MPI_Comm_set_errhandler() Open-MPI doesn't support the deprecated MPI_Errhandler_set() unless configured with --enable-mpi1-compatibility --- src/mpi/mpi_caf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mpi/mpi_caf.c b/src/mpi/mpi_caf.c index 680b9cde..c1d8cce6 100644 --- a/src/mpi/mpi_caf.c +++ b/src/mpi/mpi_caf.c @@ -1009,8 +1009,8 @@ finalize_internal(int status_code) if (status_code == 0) { /* In finalization do not report stopped or failed images any more. */ - ierr = MPI_Errhandler_set(CAF_COMM_WORLD, MPI_ERRORS_RETURN); chk_err(ierr); - ierr = MPI_Errhandler_set(alive_comm, MPI_ERRORS_RETURN); chk_err(ierr); + ierr = MPI_Comm_set_errhandler(CAF_COMM_WORLD, MPI_ERRORS_RETURN); chk_err(ierr); + ierr = MPI_Comm_set_errhandler(alive_comm, MPI_ERRORS_RETURN); chk_err(ierr); /* Only add a conventional barrier to prevent images rom quitting too * early, when this images is not failing. 
*/ dprint("Before MPI_Barrier(CAF_COMM_WORLD)\n"); From b883a9cfa89d068a2ae19f1c6a0f86be7b14cd7c Mon Sep 17 00:00:00 2001 From: Nathan Weeks Date: Fri, 4 Jan 2019 13:29:55 -0600 Subject: [PATCH 2/3] Support ULFM2 mpiexec in cafrun Change (MPICH-specific) --reenable-auto-cleanup cafrun option to --disable-failed-images, and select appropriate mpiexec option based on MPI implementation (applicable only when CAF_ENABLE_FAILED_IMAGES=TRUE) --- src/extensions/cafrun.in | 35 +++++++++++++++++++++++------------ src/mpi/CMakeLists.txt | 7 ++++++- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/src/extensions/cafrun.in b/src/extensions/cafrun.in index d1f34a2b..3e15dec6 100755 --- a/src/extensions/cafrun.in +++ b/src/extensions/cafrun.in @@ -156,13 +156,15 @@ usage() { echo " --wraps, -w, Report info about the wrapped MPI launcher" echo " -np , Number of images, N, to execute, N must be a positive integer" echo " -n , Same as -np" - echo " --reenable-auto-cleanup Turn off failed images support (if library support is present)" - echo " This option re-enables MPI auto cleanup, which is disabled by" - echo " by default if GFortran/OpenCoarrays/MPI all support failed" - echo " images through MPI ULFM. When MPI auto cleanup is disabled and" - echo " failed image support is present, OpenCoarrays triggers cleanup" - echo " explicitly when a failed/stopped image is encountered in an" - echo " image control statement without a \`stat=\` clause." + if [[ ${have_failed_img} != 'false' ]]; then + echo " --disable-failed-images Turn off failed images support (if library support is present)" + echo " This option re-enables MPI auto cleanup, which is disabled by" + echo " by default if GFortran/OpenCoarrays/MPI all support failed" + echo " images through MPI ULFM. 
When MPI auto cleanup is disabled and" + echo " failed image support is present, OpenCoarrays triggers cleanup" + echo " explicitly when a failed/stopped image is encountered in an" + echo " image control statement without a \`stat=\` clause." + fi echo " --show, -s, Show the command that the wrapper will execute. You can pass" echo " this as the first argument and then the additional arguments" echo " that you're planning to pass to perform a dry run." @@ -175,7 +177,9 @@ usage() { echo " ${cmd} --help" echo " ${cmd} --show" echo " ${cmd} -s -np 4 my_exe" - echo " ${cmd} -np 4 --reenable-auto-cleanup ./my_exe arg1 arg2" + if [[ ${have_failed_img} != 'false' ]]; then + echo " ${cmd} -np 4 --disable-failed-images ./my_exe arg1 arg2" + fi echo "" echo " Notes:" echo " [options] must be a CAF executable file, one of the above arguments," @@ -187,26 +191,33 @@ i=0 disable_failed_images=false for arg in "${@}"; do ((i+=1)) - if [[ "${arg}" == "--reenable-auto-cleanup" ]]; then + if [[ "${arg}" == "--disable-failed-images" ]]; then # Strip "--reenable-auto-cleanup" from args set -- "${@:1:$((i - 1))}" "${@:$((i+1)):$((${#} - i))}" - if ! ${have_failed_img}; then - echo "Library was not built with failed image support, so passing \`--reenable-auto-cleanup\` is a noop" >&2 + if [[ ${have_failed_img} == 'false' ]]; then + echo "Library was not built with failed image support, so passing \`--disable-failed-images\` is a noop" >&2 fi disable_failed_images=true fi done if ! 
${disable_failed_images}; then - if ${have_failed_img}; then + if [[ ${have_failed_img} == 'mpich' ]]; then if [[ -n "${preflags:-}" ]]; then preflags+=(--disable-auto-cleanup) else preflags=(--disable-auto-cleanup) fi fi +elif [[ ${have_failed_img} == 'ulfm2' ]]; then + if [[ -n "${preflags:-}" ]]; then + preflags+=(--disable-recovery) + else + preflags=(--disable-recovery) + fi fi + # Print useage information if caf is invoked without arguments if ((${#} == 0)); then usage diff --git a/src/mpi/CMakeLists.txt b/src/mpi/CMakeLists.txt index ddb150e9..a0aae564 100644 --- a/src/mpi/CMakeLists.txt +++ b/src/mpi/CMakeLists.txt @@ -120,6 +120,7 @@ execute_process(COMMAND ${MPIEXEC} --version if (mpi_version_out MATCHES "[Oo]pen[ -][Mm][Pp][Ii]") message( STATUS "OpenMPI detected") set ( openmpi true PARENT_SCOPE) + set ( openmpi true ) # Write out a host file because OMPI's mpiexec is dumb file(WRITE ${CMAKE_BINARY_DIR}/hostfile "${HOST_NAME} slots=${N_CPU}\n") message( STATUS "hostfile written to: ${CMAKE_BINARY_DIR}/hostfile") @@ -321,7 +322,11 @@ configure_file("${CMAKE_SOURCE_DIR}/src/extensions/caf.in" "${CMAKE_BINARY_DIR}/ # if(CAF_ENABLE_FAILED_IMAGES) - set(HAVE_FAILED_IMG "true") + if (openmpi) # in cafrun, distinguish mpiexec arguments to enable/disable failed images + set(HAVE_FAILED_IMG "ulfm2") + else() + set(HAVE_FAILED_IMG "mpich") + endif() else() set(HAVE_FAILED_IMG "false") endif() From 3e8f663f1a80cca88a55daf581f9015d10a935f3 Mon Sep 17 00:00:00 2001 From: Nathan Weeks Date: Sat, 23 Oct 2021 10:01:02 -0500 Subject: [PATCH 3/3] Rebrand ulfm2 to openmpi Pass --enable-recovery to Open MPI's mpiexec to enable recovery from image failure --- src/extensions/cafrun.in | 8 +++++++- src/mpi/CMakeLists.txt | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/extensions/cafrun.in b/src/extensions/cafrun.in index 3e15dec6..3bc60794 100755 --- a/src/extensions/cafrun.in +++ b/src/extensions/cafrun.in @@ -208,8 +208,14 @@ if ! 
${disable_failed_images}; then else preflags=(--disable-auto-cleanup) fi + elif [[ ${have_failed_img} == 'openmpi' ]]; then + if [[ -n "${preflags:-}" ]]; then + preflags+=(--enable-recovery) + else + preflags=(--enable-recovery) + fi fi -elif [[ ${have_failed_img} == 'ulfm2' ]]; then +elif [[ ${have_failed_img} == 'openmpi' ]]; then if [[ -n "${preflags:-}" ]]; then preflags+=(--disable-recovery) else diff --git a/src/mpi/CMakeLists.txt b/src/mpi/CMakeLists.txt index a0aae564..5e2e9af4 100644 --- a/src/mpi/CMakeLists.txt +++ b/src/mpi/CMakeLists.txt @@ -235,7 +235,7 @@ set(CMAKE_REQUIRED_INCLUDES ${old_cmake_required_includes}) set(CMAKE_REQUIRED_FLAGS ${old_cmake_required_flags}) set(CMAKE_REQUIRED_LIBRARIES ${old_cmake_required_libraries}) -if(MPI_HAS_FAULT_TOL_EXT) # AND (NOT openmpi)) +if(MPI_HAS_FAULT_TOL_EXT) option(CAF_ENABLE_FAILED_IMAGES "Enable failed images support" FALSE) message(STATUS "The MPI implementation appears to have the experimental features for ULFM that will allow you to build OpenCoarrays with failed images support. However, @@ -323,7 +323,7 @@ configure_file("${CMAKE_SOURCE_DIR}/src/extensions/caf.in" "${CMAKE_BINARY_DIR}/ if(CAF_ENABLE_FAILED_IMAGES) if (openmpi) # in cafrun, distinguish mpiexec arguments to enable/disable failed images - set(HAVE_FAILED_IMG "ulfm2") + set(HAVE_FAILED_IMG "openmpi") else() set(HAVE_FAILED_IMG "mpich") endif()