Skip to content

[WIP] Support ULFM2 #624

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 29 additions & 12 deletions src/extensions/cafrun.in
Original file line number Diff line number Diff line change
Expand Up @@ -156,13 +156,15 @@ usage() {
echo " --wraps, -w, Report info about the wrapped MPI launcher"
echo " -np <N>, Number of images, N, to execute, N must be a positive integer"
echo " -n <N>, Same as -np"
echo " --reenable-auto-cleanup Turn off failed images support (if library support is present)"
echo " This option re-enables MPI auto cleanup, which is disabled by"
echo " default if GFortran/OpenCoarrays/MPI all support failed"
echo " images through MPI ULFM. When MPI auto cleanup is disabled and"
echo " failed image support is present, OpenCoarrays triggers cleanup"
echo " explicitly when a failed/stopped image is encountered in an"
echo " image control statement without a \`stat=\` clause."
if [[ ${have_failed_img} != 'false' ]]; then
echo " --disable-failed-images Turn off failed images support (if library support is present)"
echo " This option re-enables MPI auto cleanup, which is disabled by"
echo " default if GFortran/OpenCoarrays/MPI all support failed"
echo " images through MPI ULFM. When MPI auto cleanup is disabled and"
echo " failed image support is present, OpenCoarrays triggers cleanup"
echo " explicitly when a failed/stopped image is encountered in an"
echo " image control statement without a \`stat=\` clause."
fi
echo " --show, -s, Show the command that the wrapper will execute. You can pass"
echo " this as the first argument and then the additional arguments"
echo " that you're planning to pass to perform a dry run."
Expand All @@ -175,7 +177,9 @@ usage() {
echo " ${cmd} --help"
echo " ${cmd} --show"
echo " ${cmd} -s -np 4 my_exe"
echo " ${cmd} -np 4 --reenable-auto-cleanup ./my_exe arg1 arg2"
if [[ ${have_failed_img} != 'false' ]]; then
echo " ${cmd} -np 4 --disable-failed-images ./my_exe arg1 arg2"
fi
echo ""
echo " Notes:"
echo " [options] must be a CAF executable file, one of the above arguments,"
Expand All @@ -187,26 +191,39 @@ i=0
disable_failed_images=false
for arg in "${@}"; do
((i+=1))
if [[ "${arg}" == "--reenable-auto-cleanup" ]]; then
if [[ "${arg}" == "--disable-failed-images" ]]; then
# Strip "--disable-failed-images" from args
set -- "${@:1:$((i - 1))}" "${@:$((i+1)):$((${#} - i))}"
if ! ${have_failed_img}; then
echo "Library was not built with failed image support, so passing \`--reenable-auto-cleanup\` is a noop" >&2
if [[ ${have_failed_img} == 'false' ]]; then
echo "Library was not built with failed image support, so passing \`--disable-failed-images\` is a noop" >&2
fi
disable_failed_images=true
fi
done

if ! ${disable_failed_images}; then
if ${have_failed_img}; then
if [[ ${have_failed_img} == 'mpich' ]]; then
if [[ -n "${preflags:-}" ]]; then
preflags+=(--disable-auto-cleanup)
else
preflags=(--disable-auto-cleanup)
fi
elif [[ ${have_failed_img} == 'openmpi' ]]; then
if [[ -n "${preflags:-}" ]]; then
preflags+=(--enable-recovery)
else
preflags=(--enable-recovery)
fi
fi
elif [[ ${have_failed_img} == 'openmpi' ]]; then
if [[ -n "${preflags:-}" ]]; then
preflags+=(--disable-recovery)
else
preflags=(--disable-recovery)
fi
fi


# Print usage information if caf is invoked without arguments
if ((${#} == 0)); then
usage
Expand Down
9 changes: 7 additions & 2 deletions src/mpi/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ execute_process(COMMAND ${MPIEXEC} --version
if (mpi_version_out MATCHES "[Oo]pen[ -][Mm][Pp][Ii]")
message( STATUS "OpenMPI detected")
set ( openmpi true PARENT_SCOPE)
set ( openmpi true )
# Write out a host file because OMPI's mpiexec is dumb
file(WRITE ${CMAKE_BINARY_DIR}/hostfile "${HOST_NAME} slots=${N_CPU}\n")
message( STATUS "hostfile written to: ${CMAKE_BINARY_DIR}/hostfile")
Expand Down Expand Up @@ -234,7 +235,7 @@ set(CMAKE_REQUIRED_INCLUDES ${old_cmake_required_includes})
set(CMAKE_REQUIRED_FLAGS ${old_cmake_required_flags})
set(CMAKE_REQUIRED_LIBRARIES ${old_cmake_required_libraries})

if(MPI_HAS_FAULT_TOL_EXT) # AND (NOT openmpi))
if(MPI_HAS_FAULT_TOL_EXT)
option(CAF_ENABLE_FAILED_IMAGES "Enable failed images support" FALSE)
message(STATUS "The MPI implementation appears to have the experimental features for ULFM
that will allow you to build OpenCoarrays with failed images support. However,
Expand Down Expand Up @@ -321,7 +322,11 @@ configure_file("${CMAKE_SOURCE_DIR}/src/extensions/caf.in" "${CMAKE_BINARY_DIR}/
#

if(CAF_ENABLE_FAILED_IMAGES)
set(HAVE_FAILED_IMG "true")
if (openmpi) # in cafrun, distinguish mpiexec arguments to enable/disable failed images
set(HAVE_FAILED_IMG "openmpi")
else()
set(HAVE_FAILED_IMG "mpich")
endif()
else()
set(HAVE_FAILED_IMG "false")
endif()
Expand Down
4 changes: 2 additions & 2 deletions src/mpi/mpi_caf.c
Original file line number Diff line number Diff line change
Expand Up @@ -1009,8 +1009,8 @@ finalize_internal(int status_code)
if (status_code == 0)
{
/* In finalization do not report stopped or failed images any more. */
ierr = MPI_Errhandler_set(CAF_COMM_WORLD, MPI_ERRORS_RETURN); chk_err(ierr);
ierr = MPI_Errhandler_set(alive_comm, MPI_ERRORS_RETURN); chk_err(ierr);
ierr = MPI_Comm_set_errhandler(CAF_COMM_WORLD, MPI_ERRORS_RETURN); chk_err(ierr);
ierr = MPI_Comm_set_errhandler(alive_comm, MPI_ERRORS_RETURN); chk_err(ierr);
/* Only add a conventional barrier to prevent images from quitting too
* early, when this image is not failing. */
dprint("Before MPI_Barrier(CAF_COMM_WORLD)\n");
Expand Down