Skip to content

Commit 69ab6b8

Browse files
committed
Importing ULFM ompi layer: snapshot of WIP
Missing BTL and COLL imports. Almost compiles w/o --with-ft
1 parent 8e0ca63 commit 69ab6b8

File tree

182 files changed

+3644
-108
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

182 files changed

+3644
-108
lines changed

config/opal_setup_ft.m4

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
dnl
2+
dnl Copyright (c) 2004-2016 The University of Tennessee and The University
3+
dnl of Tennessee Research Foundation. All rights
4+
dnl reserved.
5+
dnl Copyright (c) 2009-2012 Oak Ridge National Labs. All rights reserved.
26
dnl Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
37
dnl Copyright (c) 2015 Research Organization for Information Science
48
dnl and Technology (RIST). All rights reserved.
@@ -12,11 +16,15 @@ dnl
1216
#
1317
# --with-ft=TYPE
1418
# TYPE:
19+
# - mpi (synonym for 'ulfm')
1520
# - LAM (synonym for 'cr' currently)
1621
# - cr
1722
# /* General FT sections */
1823
# #if OPAL_ENABLE_FT == 0 /* FT Disabled globally */
1924
# #if OPAL_ENABLE_FT == 1 /* FT Enabled globally */
25+
# /* ULFM Specific sections */
26+
# #if OPAL_ENABLE_FT_MPI == 0 /* FT ULFM Disabled */
27+
# #if OPAL_ENABLE_FT_MPI == 1 /* FT ULFM Enabled */
2028
# /* CR Specific sections */
2129
# #if OPAL_ENABLE_FT_CR == 0 /* FT Ckpt/Restart Disabled */
2230
# #if OPAL_ENABLE_FT_CR == 1 /* FT Ckpt/Restart Enabled */
@@ -33,7 +41,7 @@ AC_DEFUN([OPAL_SETUP_FT_OPTIONS],[
3341
opal_setup_ft_options="yes"
3442
AC_ARG_WITH(ft,
3543
[AC_HELP_STRING([--with-ft=TYPE],
36-
[Specify the type of fault tolerance to enable. Options: LAM (LAM/MPI-like), cr (Checkpoint/Restart), (default: disabled)])],
44+
[Specify the type of fault tolerance to enable. Options: LAM (LAM/MPI-like), cr (Checkpoint/Restart), mpi (ULFM) (default: disabled)])],
3745
[opal_want_ft=1],
3846
[opal_want_ft=0])
3947
@@ -66,6 +74,7 @@ AC_DEFUN([OPAL_SETUP_FT],[
6674
if test "x$with_ft" != "x" || test "$opal_want_ft" = "1"; then
6775
opal_want_ft=1
6876
opal_want_ft_cr=0
77+
opal_want_ft_mpi=0
6978
opal_want_ft_type=none
7079
7180
as_save_IFS=$IFS
@@ -75,7 +84,15 @@ AC_DEFUN([OPAL_SETUP_FT],[
7584
7685
# Default value
7786
if test "$opt" = "" || test "$opt" = "yes"; then
78-
opal_want_ft_cr=1
87+
opal_want_ft_mpi=1
88+
elif test "$opt" = "ULFM"; then
89+
opal_want_ft_mpi=1
90+
elif test "$opt" = "ulfm"; then
91+
opal_want_ft_mpi=1
92+
elif test "$opt" = "MPI"; then
93+
opal_want_ft_mpi=1
94+
elif test "$opt" = "mpi"; then
95+
opal_want_ft_mpi=1
7996
elif test "$opt" = "LAM"; then
8097
opal_want_ft_cr=1
8198
elif test "$opt" = "lam"; then
@@ -89,7 +106,9 @@ AC_DEFUN([OPAL_SETUP_FT],[
89106
AC_MSG_ERROR([Cannot continue])
90107
fi
91108
done
92-
if test "$opal_want_ft_cr" = 1; then
109+
if test "$opal_want_ft_mpi" = 1; then
110+
opal_want_ft_type="mpi"
111+
elif test "$opal_want_ft_cr" = 1; then
93112
opal_want_ft_type="cr"
94113
fi
95114
@@ -101,16 +120,20 @@ AC_DEFUN([OPAL_SETUP_FT],[
101120
AC_MSG_WARN([**************************************************])
102121
else
103122
opal_want_ft=0
123+
opal_want_ft_mpi=0
104124
opal_want_ft_cr=0
105125
if test "$opal_setup_ft_options" = "yes"; then
106126
AC_MSG_RESULT([Disabled fault tolerance])
107127
fi
108128
fi
109129
AC_DEFINE_UNQUOTED([OPAL_ENABLE_FT], [$opal_want_ft],
110130
[Enable fault tolerance general components and logic])
131+
AC_DEFINE_UNQUOTED([OPAL_ENABLE_FT_MPI], [$opal_want_ft_mpi],
132+
[Enable fault tolerance MPI ULFM components and logic])
111133
AC_DEFINE_UNQUOTED([OPAL_ENABLE_FT_CR], [$opal_want_ft_cr],
112134
[Enable fault tolerance checkpoint/restart components and logic])
113135
AM_CONDITIONAL(WANT_FT, test "$opal_want_ft" = "1")
136+
AM_CONDITIONAL(WANT_FT_MPI, test "$opal_want_ft_mpi" = "1")
114137
AM_CONDITIONAL(WANT_FT_CR, test "$opal_want_ft_cr" = "1")
115138
116139
if test "$opal_setup_ft_options" = "yes"; then

contrib/amca-param-sets/ft-enable-mpi

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#
2+
# Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved.
3+
# Copyright (c) 2012-2016 The University of Tennessee and The University
4+
# of Tennessee Research Foundation. All rights
5+
# reserved.
6+
#
7+
# $COPYRIGHT$
8+
#
9+
# Additional copyrights may follow
10+
#
11+
# $HEADER$
12+
#
13+
# An Aggregate MCA Parameter Set to enable MPI Layer fault tolerance
14+
# capabilities.
15+
#
16+
# Usage:
17+
# shell$ mpirun -am ft-enable-mpi ./app
18+
#
19+
20+
#
21+
# OPAL Parameters
22+
#
23+
24+
# ORTE Parameters
25+
# - Use the modified 'cm' routed component - 'rts'. It is the only one that is currently able to
26+
# handle process and daemon loss.
27+
#
28+
# JJH: routed=binomial
29+
routed=rts
30+
#plm=rsh
31+
#rmaps=resilient
32+
33+
# Disable tree spawn for now
34+
plm_rsh_no_tree_spawn = 1
35+
36+
# Stabilization at the runtime layer
37+
errmgr_rts_hnp_priority=5000
38+
errmgr_rts_orted_priority=5000
39+
errmgr_rts_app_priority=5000
40+
41+
#
42+
# OMPI Parameters
43+
# - Only fully tested with the listed btls
44+
# - ftbasic provides agreement.
45+
#
46+
ompi_ftmpi_enable=1
47+
# UGNI, OpenIB, TCP, SM have been tested to work well and support most failures gracefully.
48+
# If you want to be perfectly safe, the following restriction can be set, but we believe it's unnecessary.
49+
# XPMEM is broken and its build is disabled in this fork.
50+
#btl=tcp,sm,self
51+
# Tuned has been tested and works well in many cases; however, it does have some
52+
# corner cases. The safer setup "basic,ftbasic" imparts a huge performance
53+
# penalty; we encourage you to try "tuned,basic,ftbasic" and see if it works for you. In most cases, it will.
54+
coll=tuned,basic,ftbasic

ompi/attribute/attribute_predefined.c

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
33
* University Research and Technology
44
* Corporation. All rights reserved.
5-
* Copyright (c) 2004-2005 The University of Tennessee and The University
5+
* Copyright (c) 2004-2016 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
88
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -51,6 +51,9 @@
5151
*
5252
* MPI_WTIME_IS_GLOBAL is set to 0 (a conservative answer).
5353
*
54+
* MPI_FT is set to 0 or 1 (according to OPAL_ENABLE_FT_MPI and
55+
* ompi_ftmpi_enabled)
56+
*
5457
* MPI_APPNUM is set as the result of a GPR subscription.
5558
*
5659
* MPI_LASTUSEDCODE is set to an initial value and is reset every time
@@ -129,6 +132,9 @@ int ompi_attr_create_predefined(void)
129132
OMPI_SUCCESS != (ret = create_win(MPI_WIN_DISP_UNIT)) ||
130133
OMPI_SUCCESS != (ret = create_win(MPI_WIN_CREATE_FLAVOR)) ||
131134
OMPI_SUCCESS != (ret = create_win(MPI_WIN_MODEL)) ||
135+
#if OPAL_ENABLE_FT_MPI
136+
OMPI_SUCCESS != (ret = create_comm(MPI_FT, true)) ||
137+
#endif /* OPAL_ENABLE_FT_MPI */
132138
#if 0
133139
/* JMS For when we implement IMPI */
134140
OMPI_SUCCESS != (ret = create_comm(IMPI_CLIENT_SIZE, true)) ||
@@ -148,6 +154,12 @@ int ompi_attr_create_predefined(void)
148154
OMPI_SUCCESS != (ret = set_f(MPI_WTIME_IS_GLOBAL, 0)) ||
149155
OMPI_SUCCESS != (ret = set_f(MPI_LASTUSEDCODE,
150156
ompi_mpi_errcode_lastused)) ||
157+
#if OPAL_ENABLE_FT_MPI
158+
/* Although we always define the key to ease fortran integration,
159+
* let's not set a default value for the attribute if we do not
160+
* have fault tolerance built in. */
161+
OMPI_SUCCESS != (ret = set_f(MPI_FT, ompi_ftmpi_enabled)) ||
162+
#endif /* OPAL_ENABLE_FT_MPI */
151163
#if 0
152164
/* JMS For when we implement IMPI */
153165
OMPI_SUCCESS != (ret = set(IMPI_CLIENT_SIZE,
@@ -189,6 +201,9 @@ int ompi_attr_free_predefined(void)
189201
OMPI_SUCCESS != (ret = free_comm(MPI_HOST)) ||
190202
OMPI_SUCCESS != (ret = free_comm(MPI_IO)) ||
191203
OMPI_SUCCESS != (ret = free_comm(MPI_WTIME_IS_GLOBAL)) ||
204+
#if OPAL_ENABLE_FT_MPI
205+
OMPI_SUCCESS != (ret = free_comm(MPI_FT)) ||
206+
#endif /* OPAL_ENABLE_FT_MPI */
192207
OMPI_SUCCESS != (ret = free_comm(MPI_APPNUM)) ||
193208
OMPI_SUCCESS != (ret = free_comm(MPI_LASTUSEDCODE)) ||
194209
OMPI_SUCCESS != (ret = free_comm(MPI_UNIVERSE_SIZE)) ||

ompi/communicator/Makefile.am

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
# University Research and Technology
55
# Corporation. All rights reserved.
6-
# Copyright (c) 2004-2005 The University of Tennessee and The University
6+
# Copyright (c) 2004-2016 The University of Tennessee and The University
77
# of Tennessee Research Foundation. All rights
88
# reserved.
99
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
1010
# University of Stuttgart. All rights reserved.
1111
# Copyright (c) 2004-2005 The Regents of the University of California.
1212
# All rights reserved.
13+
# Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved.
1314
# Copyright (c) 2013-2017 Los Alamos National Security, LLC. All rights
1415
# reserved.
1516
# Copyright (c) 2014 Research Organization for Information Science
@@ -34,3 +35,8 @@ lib@OMPI_LIBMPI_NAME@_la_SOURCES += \
3435
communicator/comm_cid.c \
3536
communicator/comm_request.c
3637

38+
if WANT_FT_MPI
39+
lib@OMPI_LIBMPI_NAME@_la_SOURCES += \
40+
communicator/comm_ft.c communicator/comm_ft_reliable_bcast.c communicator/comm_ft_propagate.c communicator/comm_ft_revoke.c
41+
endif # WANT_FT_MPI
42+

0 commit comments

Comments
 (0)