Skip to content

Pull in failed images support. #370

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 67 commits into from
May 25, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
26d788f
Multiple communicators
May 9, 2016
2ba109e
fail image and communicator error handling
May 9, 2016
b6688c1
MPI Win error handling
May 9, 2016
fb24a1b
MPI Win error handling
May 9, 2016
9226f3a
Failed image number detected
May 9, 2016
6dc12a4
Image_status
May 9, 2016
b38abc2
Disabling general error detection in image_status
May 9, 2016
a99a2cc
Supporting stat_stopped_image in image_status
May 9, 2016
c01a05a
Resuming multiple communicators
May 9, 2016
8cdc90e
Failed images draft
May 11, 2016
d6640d9
Shrinking communicator
May 16, 2016
17699e5
Partial fault tolerant support for locks
May 16, 2016
d6a57b8
Partial patch for locks
May 18, 2016
b72742d
Partial patch for locks
May 18, 2016
23d2f47
Minor changes
May 18, 2016
b9d0761
Debug message as last instruction
May 18, 2016
b3d2f9e
Working version of failed_images
May 19, 2016
aa4a61a
Sorting failed images
May 25, 2016
c4841e8
Managing stat for sync all
May 25, 2016
01b75ce
Managing stat for sync all
May 25, 2016
5448ec5
Fixed bug in sync_all
May 25, 2016
be8d98f
Fixed bug in sync_all
May 25, 2016
c12de97
Minor change
May 25, 2016
f7de5fd
Adding stat constraint to sync images, lock, unlock
May 26, 2016
32e4cb4
Adding stat constraint to event post and event wait
May 26, 2016
c1190e8
Fixed few bugs on
May 26, 2016
881cc26
Adding stat constraint to allocate
May 26, 2016
02ffc81
Fixed few bugs on
May 26, 2016
f607483
Fixed bug in register
May 27, 2016
4fb6da1
Using MPI_Comm_revoke
May 27, 2016
d66b185
Draft stopped images
Jun 1, 2016
c2373cf
Partial version for stopped images
Jun 3, 2016
f0a24c8
Fixed bug in failed_images
Jun 6, 2016
ae6ed63
Get and put stat variable
Jun 10, 2016
5ae617c
Adapting prototypes to the new stat= attribute
Jun 10, 2016
f244aa1
Fixed bug on number of failed images
Jun 10, 2016
be7a1a6
Cleanup post bug-fix
Jun 10, 2016
cba6ef5
stat attribute in last position for get and put for compatibility
Jun 17, 2016
59ce1e5
General improvement after tutorial
Oct 4, 2016
dc64a97
Fix small bug about number of failed images
Oct 5, 2016
f4d3ce2
Fixed bug in locking
Oct 5, 2016
245dc1f
Fixed num_images
Oct 19, 2016
6a1fbab
Revert last change
Oct 20, 2016
f4411e6
Failed images fixed
Oct 24, 2016
8421c8c
Cleanup
Oct 24, 2016
c2887c7
New stopped_images function
Oct 25, 2016
0264344
Stopped images
Oct 26, 2016
168a253
Update from opencoarrays_ft_rep
Mar 2, 2017
5737bca
Minor change
Mar 2, 2017
8765f6b
First Aurelien patch
Mar 2, 2017
798f073
Fixed register for GCC-7 compatibility
Mar 2, 2017
83c3745
Merge branch 'master' into vehre/failed-images
vehre Mar 10, 2017
7e1054c
Added first testcase.
vehre Mar 10, 2017
f1ed29e
Added missing cmake-file for image_states-tests.
vehre Mar 11, 2017
dca0738
Implement FAILED IMAGES support for gcc >= 7 and mpich >= 3.2.
vehre Apr 30, 2017
877602b
Merge branch 'master' into vehre/failed-images
vehre Apr 30, 2017
e720481
Merge branch 'master' into vehre/failed-images
zbeekman May 16, 2017
d45aa2c
Add comment to syncimages_status test
May 23, 2017
0eb8345
Update build system for failed images
zbeekman May 23, 2017
dd8442e
Rename CMake OpenCoarrays developer tests option
zbeekman May 23, 2017
151eb80
Handle `<alloca.h>` better
zbeekman May 23, 2017
244b01a
Merge branch 'master' of https://github.com/sourceryinstitute/OpenCoa…
zbeekman May 23, 2017
59b0165
Ensure that libcaf.h is early in includes
zbeekman May 23, 2017
9c6eb82
Flag image_fail_and_sync_test_2 as developer
zbeekman May 24, 2017
9674d71
Merge branch 'master' into vehre/failed-images
vehre May 25, 2017
120210a
Do not shrink communicator when number of failed images is out of bou…
vehre May 25, 2017
b8e9f56
Merge branch 'vehre/failed-images' of github.com:sourceryinstitute/op…
vehre May 25, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 44 additions & 8 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ set_property ( CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CMAKE_CONFIGURATION_TYP

# Add option and check environment to determine if developer tests should be run
if($ENV{OPENCOARRAYS_DEVELOPER})
option(RUN_DEVELOPER_TESTS "Run tests intended only for developers" ON)
option(CAF_RUN_DEVELOPER_TESTS "Run tests intended only for developers" ON)
else()
option(RUN_DEVELOPER_TESTS "Run tests intended only for developers" OFF)
option(CAF_RUN_DEVELOPER_TESTS "Run tests intended only for developers" OFF)
endif()
mark_as_advanced(RUN_DEVELOPER_TESTS)
mark_as_advanced(CAF_RUN_DEVELOPER_TESTS)

if( NOT DEFINED ENV{OPENCOARRAYS_DEVELOPER})
set ( ENV{OPENCOARRAYS_DEVELOPER} FALSE )
Expand Down Expand Up @@ -387,7 +387,7 @@ include(GNUInstallDirs)
#-------------------------------
# Recurse into the src directory
#-------------------------------
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)

add_subdirectory(src)

Expand Down Expand Up @@ -465,6 +465,24 @@ function(add_mpi_test name num_mpi_proc path)
set_property(TEST ${name} PROPERTY PASS_REGULAR_EXPRESSION "Test passed.")
endfunction(add_mpi_test)

# add_fault_tolerant_mpi_test(<name> <num_mpi_proc> <path>)
#
# Register a CTest test called <name> that launches the executable at <path>
# through ${MPIEXEC} with <num_mpi_proc> ranks and the extra launcher option
# -disable-auto-cleanup.  The test passes when its output matches
# "Test passed.".
#
# Reads from the calling scope: N (the processor count reported in the
# oversubscription message), openmpi, MPIEXEC and MPIEXEC_NUMPROC_FLAG.
#
# NOTE(review): -disable-auto-cleanup is presumably what keeps the launcher
# from tearing down the surviving ranks when one image fails -- confirm
# against the documentation of the MPI launcher in use.
function(add_fault_tolerant_mpi_test name num_mpi_proc path)
  # Always start from an empty launcher argument list.  Functions inherit the
  # caller's variables, so without this a test_parameters variable set in the
  # calling scope would silently end up on the command line.
  set(test_parameters "")
  if((N LESS num_mpi_proc) OR (N EQUAL 0))
    message(STATUS "Test ${name} is oversubscribed: ${num_mpi_proc} ranks requested with ${N} system processor available.")
    if(openmpi)
      if(N LESS 2)
        # Run with two ranks and explicitly allow oversubscription.
        set(num_mpi_proc 2)
        list(APPEND test_parameters --oversubscribe)
      else()
        # Cap the rank count at the number of available processors.
        set(num_mpi_proc ${N})
      endif()
      message( STATUS "Open-MPI detected, over-riding oversubscribed test, ${name}, with ${num_mpi_proc} ranks." )
    endif()
  endif()
  list(APPEND test_parameters ${MPIEXEC_NUMPROC_FLAG} ${num_mpi_proc} -disable-auto-cleanup)
  add_test(NAME ${name} COMMAND ${MPIEXEC} ${test_parameters} "${path}")
  set_property(TEST ${name} PROPERTY PASS_REGULAR_EXPRESSION "Test passed.")
endfunction()

set(tests_root ${CMAKE_CURRENT_BINARY_DIR}/src/tests)


Expand All @@ -480,7 +498,7 @@ if(opencoarrays_aware_compiler)
add_mpi_test(register_alloc_comp_1 2 ${tests_root}/unit/init_register/register_alloc_comp_1)
add_mpi_test(register_alloc_comp_2 2 ${tests_root}/unit/init_register/register_alloc_comp_2)
add_mpi_test(register_alloc_comp_3 2 ${tests_root}/unit/init_register/register_alloc_comp_3)
if (RUN_DEVELOPER_TESTS OR $ENV{OPENCOARRAYS_DEVELOPER})
if (CAF_RUN_DEVELOPER_TESTS OR $ENV{OPENCOARRAYS_DEVELOPER})
message ( STATUS "Running Developer tests is enabled." )
add_mpi_test(async_comp_alloc 6 ${tests_root}/unit/init_register/async_comp_alloc)
# Timeout async_comp_alloc test after 3 seconds to progress past the known failure
Expand Down Expand Up @@ -523,16 +541,34 @@ if(opencoarrays_aware_compiler)
# GFortran PR 78505 only fixed on trunk/gcc 7
add_mpi_test(source-alloc-no-sync 8 ${tests_root}/regression/reported/source-alloc-sync)
endif()
if (RUN_DEVELOPER_TESTS OR $ENV{OPENCOARRAYS_DEVELOPER})
if (CAF_RUN_DEVELOPER_TESTS OR $ENV{OPENCOARRAYS_DEVELOPER})
add_mpi_test(convert-before-put 3 ${tests_root}/regression/reported/convert-before-put)
endif()
add_mpi_test(event-post 3 ${tests_root}/regression/reported/event-post)
add_mpi_test(co_reduce-factorial 4 ${tests_root}/regression/reported/co_reduce-factorial)
add_mpi_test(co_reduce-factorial-int8 4 ${tests_root}/regression/reported/co_reduce-factorial-int8)
add_mpi_test(co_reduce-factorial-int64 4 ${tests_root}/regression/reported/co_reduce-factorial-int64)
add_mpi_test(co_reduce_string 4 ${tests_root}/unit/collectives/co_reduce_string)
# remove this before merging into master
# set_property(TEST co_reduce-factorial PROPERTY WILL_FAIL TRUE)

# IMAGE FAIL tests
# Only registered when the Fortran compiler version is 7 or newer.
if(NOT CMAKE_Fortran_COMPILER_VERSION VERSION_LESS 7)
  add_mpi_test(image_status_test_1 4 ${tests_root}/unit/fail_images/image_status_test_1)
  # The remaining tests are registered only when failed-image support is enabled.
  if(CAF_ENABLE_FAILED_IMAGES)
    # No other way to check that image_fail_test_1 passes.
    add_fault_tolerant_mpi_test(image_fail_test_1 4 ${tests_root}/unit/fail_images/image_fail_test_1)
    set_tests_properties(image_fail_test_1 PROPERTIES
      FAIL_REGULAR_EXPRESSION "Test failed"
      PASS_REGULAR_EXPRESSION "Test passed")

    add_fault_tolerant_mpi_test(image_fail_and_sync_test_1 4 ${tests_root}/unit/fail_images/image_fail_and_sync_test_1)
    # image_fail_and_sync_test_2 is registered for developer runs only.
    if(CAF_RUN_DEVELOPER_TESTS OR $ENV{OPENCOARRAYS_DEVELOPER})
      add_fault_tolerant_mpi_test(image_fail_and_sync_test_2 4 ${tests_root}/unit/fail_images/image_fail_and_sync_test_2)
    endif()

    # The remaining fault-tolerant tests all use 4 ranks and live in the same
    # directory; register them in their original order.
    foreach(ft_test
        image_fail_and_sync_test_3
        image_fail_and_status_test_1
        image_fail_and_failed_images_test_1
        image_fail_and_stopped_images_test_1
        image_fail_and_get_test_1)
      add_fault_tolerant_mpi_test(${ft_test} 4 ${tests_root}/unit/fail_images/${ft_test})
    endforeach()
  endif()
endif()
else()
add_test(co_sum_extension ${tests_root}/unit/extensions/test-co_sum-extension.sh)
set_property(TEST co_sum_extension PROPERTY PASS_REGULAR_EXPRESSION "Test passed.")
Expand Down
22 changes: 18 additions & 4 deletions src/libcaf.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
#define STAT_LOCKED_OTHER_IMAGE 2
#define STAT_DUP_SYNC_IMAGES 3
#define STAT_STOPPED_IMAGE 6000
#define STAT_FAILED_IMAGE 6001

/* Describes what type of array we are registering. Keep in sync with
gcc/fortran/trans.h. */
Expand All @@ -88,11 +89,15 @@ typedef enum caf_deregister_t {
caf_deregister_t;

typedef void* caf_token_t;

#ifdef GCC_GE_7
/** Add a dummy type representing teams in coarrays. */
typedef void * caf_team_t;
#endif

/* Linked list of static coarrays registered.  Each node carries two
   caf_token_t handles and a pointer to another node of the same type.  */
typedef struct caf_static_t {
/* Token of the registered static coarray. */
caf_token_t token;
/* NOTE(review): presumably a token used for stopped-image bookkeeping;
   its exact role is not visible here -- confirm against the users of
   this struct. */
caf_token_t stopped_token;
/* Link to the neighbouring list node (the previous one, judging by the
   name). */
struct caf_static_t *prev;
}
caf_static_t;
Expand Down Expand Up @@ -228,13 +233,15 @@ void PREFIX (deregister) (caf_token_t *, int *, char *, int);
#endif

void PREFIX (caf_get) (caf_token_t, size_t, int, gfc_descriptor_t *,
caf_vector_t *, gfc_descriptor_t *, int, int, int);
caf_vector_t *, gfc_descriptor_t *, int, int, bool, int *);
void PREFIX (caf_send) (caf_token_t, size_t, int, gfc_descriptor_t *,
caf_vector_t *, gfc_descriptor_t *, int, int);
caf_vector_t *, gfc_descriptor_t *, int, int, bool,
int *);

void PREFIX (caf_sendget) (caf_token_t, size_t, int, gfc_descriptor_t *,
caf_vector_t *, caf_token_t, size_t, int,
gfc_descriptor_t *, caf_vector_t *, int, int);
gfc_descriptor_t *, caf_vector_t *, int, int, bool,
int *);

#ifdef GCC_GE_7
void PREFIX(get_by_ref) (caf_token_t, int,
Expand Down Expand Up @@ -263,9 +270,16 @@ void PREFIX (sync_all) (int *, char *, int);
void PREFIX (sync_images) (int, int[], int *, char *, int);
void PREFIX (sync_memory) (int *, char *, int);

void PREFIX (stop_str) (const char *, int32_t) __attribute__ ((noreturn));
void PREFIX (stop) (int32_t) __attribute__ ((noreturn));
void PREFIX (error_stop_str) (const char *, int32_t)
__attribute__ ((noreturn));
void PREFIX (error_stop) (int32_t) __attribute__ ((noreturn));
void PREFIX (fail_image) (void) __attribute__ ((noreturn));

int PREFIX (image_status) (int);
void PREFIX (failed_images) (gfc_descriptor_t *, int, int *);
void PREFIX (stopped_images) (gfc_descriptor_t *, int, int *);

void PREFIX (atomic_define) (caf_token_t, size_t, int, void *, int *, int, int);
void PREFIX (atomic_ref) (caf_token_t, size_t, int, void *, int *, int, int);
Expand Down
61 changes: 61 additions & 0 deletions src/mpi/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,67 @@ if(CAF_EXPOSE_INIT_FINALIZE)
add_definitions(-DEXPOSE_INIT_FINALIZE)
endif()

# Probe for <alloca.h>.  When the header is missing, warn and define
# ALLOCA_MISSING for the compile lines in this directory.
include(CheckIncludeFile)
check_include_file("alloca.h" HAVE_ALLOCA)
if(NOT HAVE_ALLOCA)
  message(WARNING "Could not find <alloca.h>. Assuming functionality is provided elsewhere.")
  add_definitions(-DALLOCA_MISSING)
endif()

#----------------------------------------------------------------------
# Test if MPI implementation provides features needed for failed images
#----------------------------------------------------------------------
# Every symbol listed here must be found by check_symbol_exists(); as soon as
# one is missing, MPI_HAS_FAULT_TOL_EXT is set to NO and probing stops.
set(NEEDED_SYMBOLS MPIX_ERR_PROC_FAILED;MPIX_ERR_REVOKED;MPIX_Comm_failure_ack;MPIX_Comm_failure_get_acked;MPIX_Comm_shrink;MPIX_Comm_agree)
set(MPI_HAS_FAULT_TOL_EXT YES)

# Save the current CMAKE_REQUIRED_* settings so they can be restored once the
# probes are done, then extend them with the MPI C settings.
set(old_cmake_required_includes "${CMAKE_REQUIRED_INCLUDES}")
list(APPEND CMAKE_REQUIRED_INCLUDES ${MPI_C_INCLUDE_PATH})

set(old_cmake_required_flags "${CMAKE_REQUIRED_FLAGS}")
# CMAKE_REQUIRED_FLAGS must be a single space-delimited string; a ;-list puts
# a literal ';' on the check's compile line.  Join the pieces with spaces and
# convert any list separators that came in with them.
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${MPI_C_COMPILE_FLAGS} ${MPI_C_LINK_FLAGS}")
string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
string(STRIP "${CMAKE_REQUIRED_FLAGS}" CMAKE_REQUIRED_FLAGS)

set(old_cmake_required_libraries "${CMAKE_REQUIRED_LIBRARIES}")
list(APPEND CMAKE_REQUIRED_LIBRARIES ${MPI_C_LIBRARIES})

# mpi.h is always searched; mpi-ext.h is added to the header list (and
# HAVE_MPI_EXT_H defined) only when it is present.
set(MPI_HEADERS mpi.h)
check_include_file("mpi-ext.h" HAVE_MPI_EXT)
if(HAVE_MPI_EXT)
  add_definitions(-DHAVE_MPI_EXT_H)
  list(APPEND MPI_HEADERS mpi-ext.h)
endif()

include(CheckSymbolExists)
foreach(symbol ${NEEDED_SYMBOLS})
  check_symbol_exists(${symbol} "${MPI_HEADERS}" HAVE_${symbol})
  if(NOT HAVE_${symbol})
    message( STATUS "\${HAVE_${symbol}} = ${HAVE_${symbol}}")
    message( WARNING "Disabling Failed Image support due to lack of support in the current MPI implementation.")
    set(MPI_HAS_FAULT_TOL_EXT NO)
    break() # no need to keep looking
  endif()
endforeach()

# Put the CMAKE_REQUIRED_* settings back the way they were found.
set(CMAKE_REQUIRED_INCLUDES ${old_cmake_required_includes})
set(CMAKE_REQUIRED_FLAGS ${old_cmake_required_flags})
set(CMAKE_REQUIRED_LIBRARIES ${old_cmake_required_libraries})

# Failed-image support is a user-visible option (default TRUE) only when
# MPI_HAS_FAULT_TOL_EXT is true; otherwise the cache entry is forced to FALSE.
if(NOT MPI_HAS_FAULT_TOL_EXT)
  set(CAF_ENABLE_FAILED_IMAGES FALSE CACHE BOOL "Enable failed images support" FORCE)
else()
  option(CAF_ENABLE_FAILED_IMAGES "Enable failed images support" TRUE)
endif()

# Define USE_FAILED_IMAGES on the compile line when the feature is enabled.
if(CAF_ENABLE_FAILED_IMAGES)
  add_definitions(-DUSE_FAILED_IMAGES)
endif()

# Determine whether and how to include OpenCoarrays module based on if the Fortran MPI compiler:
# - works
# - is compatible with the fortran compiler used to build the MPI implementation
Expand Down
Loading