Skip to content

Commit fe41070

Browse files
authored
Merge pull request #3828 from rhc54/cmr30/bele
Detect a mix of big-endian (BE) and little-endian (LE) nodes in the system, warn that OMPI does not currently support this environment, and error out
2 parents b92a139 + 2f4b3ab commit fe41070

File tree

5 files changed

+59
-25
lines changed

5 files changed

+59
-25
lines changed

config/opal_configure_options.m4

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -286,22 +286,7 @@ fi
286286
AC_DEFINE_UNQUOTED(OPAL_ENABLE_DLOPEN_SUPPORT, $OPAL_ENABLE_DLOPEN_SUPPORT,
287287
[Whether we want to enable dlopen support])
288288

289-
#
290-
# Heterogeneous support
291-
#
292-
293-
AC_MSG_CHECKING([if want heterogeneous support])
294-
AC_ARG_ENABLE([heterogeneous],
295-
[AC_HELP_STRING([--enable-heterogeneous],
296-
[Enable features required for heterogeneous
297-
platform support (default: disabled)])])
298-
if test "$enable_heterogeneous" = "yes" ; then
299-
AC_MSG_RESULT([yes])
300-
opal_want_heterogeneous=1
301-
else
302-
AC_MSG_RESULT([no])
303-
opal_want_heterogeneous=0
304-
fi
289+
opal_want_heterogeneous=0
305290
AC_DEFINE_UNQUOTED([OPAL_ENABLE_HETEROGENEOUS_SUPPORT],
306291
[$opal_want_heterogeneous],
307292
[Enable features required for heterogeneous support])

configure.ac

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -588,7 +588,7 @@ AC_CACHE_SAVE
588588
opal_show_title "Header file tests"
589589

590590
AC_CHECK_HEADERS([alloca.h aio.h arpa/inet.h dirent.h \
591-
dlfcn.h execinfo.h err.h fcntl.h grp.h libgen.h \
591+
dlfcn.h endian.h execinfo.h err.h fcntl.h grp.h libgen.h \
592592
libutil.h memory.h netdb.h netinet/in.h netinet/tcp.h \
593593
poll.h pthread.h pty.h pwd.h sched.h \
594594
strings.h stropts.h linux/ethtool.h linux/sockios.h \

opal/mca/hwloc/base/hwloc_base_util.c

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@
3232
#ifdef HAVE_UNISTD_H
3333
#include <unistd.h>
3434
#endif
35+
#ifdef HAVE_ENDIAN_H
36+
#include <endian.h>
37+
#endif
3538

3639
#include "opal/runtime/opal.h"
3740
#include "opal/constants.h"
@@ -2163,7 +2166,7 @@ int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char* device_name, op
21632166
char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo)
21642167
{
21652168
int nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt;
2166-
char *sig=NULL, *arch=NULL;
2169+
char *sig=NULL, *arch = NULL, *endian;
21672170
hwloc_obj_t obj;
21682171
unsigned i;
21692172

@@ -2183,14 +2186,22 @@ char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo)
21832186
break;
21842187
}
21852188
}
2186-
21872189
if (NULL == arch) {
2188-
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH",
2189-
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt);
2190-
} else {
2191-
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s",
2192-
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, arch);
2190+
arch = "unknown";
21932191
}
2192+
2193+
#ifdef __BYTE_ORDER
2194+
#if __BYTE_ORDER == __LITTLE_ENDIAN
2195+
endian = "le";
2196+
#else
2197+
endian = "be";
2198+
#endif
2199+
#else
2200+
endian = "unknown";
2201+
#endif
2202+
2203+
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s:%s",
2204+
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, arch, endian);
21942205
return sig;
21952206
}
21962207

orte/mca/plm/base/help-plm-base.txt

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# University of Stuttgart. All rights reserved.
1111
# Copyright (c) 2004-2005 The Regents of the University of California.
1212
# All rights reserved.
13-
# Copyright (c) 2015 Intel, Inc. All rights reserved.
13+
# Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
1414
# $COPYRIGHT$
1515
#
1616
# Additional copyrights may follow
@@ -162,3 +162,14 @@ A call was made to launch additional processes, but this process has
162162
no active out-of-band transports and therefore cannot execute this call.
163163
Please check to see if you have the "oob" MCA parameter set and ensure
164164
that it is either unset or at least includes the tcp transport.
165+
#
166+
[multi-endian]
167+
Open MPI does not currently support multi-endian operations. We have
168+
detected that the following node differs in endianness:
169+
170+
171+
Nodename: %s
172+
Endian: %s
173+
Local endian: %s
174+
175+
Please correct the situation and try again.

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1055,12 +1055,23 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
10551055
int i;
10561056
bool found;
10571057
orte_daemon_cmd_flag_t cmd;
1058+
char *myendian;
10581059

10591060
/* get the daemon job, if necessary */
10601061
if (NULL == jdatorted) {
10611062
jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
10621063
}
10631064

1065+
/* get my endianness */
1066+
t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
1067+
if (NULL == t) {
1068+
/* should never happen */
1069+
myendian = "unknown";
1070+
} else {
1071+
myendian = strrchr(t->sig, ':');
1072+
++myendian;
1073+
}
1074+
10641075
/* multiple daemons could be in this buffer, so unpack until we exhaust the data */
10651076
idx = 1;
10661077
while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &dname, &idx, ORTE_NAME))) {
@@ -1240,8 +1251,24 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
12401251
}
12411252
free(sig);
12421253
break;
1254+
} else {
1255+
/* check if the difference is due to the endianness */
1256+
ptr = strrchr(sig, ':');
1257+
++ptr;
1258+
if (0 != strcmp(ptr, myendian)) {
1259+
/* we don't currently handle multi-endian operations in the
1260+
* MPI support */
1261+
orte_show_help("help-plm-base", "multi-endian", true,
1262+
nodename, ptr, myendian);
1263+
orted_failed_launch = true;
1264+
if (NULL != topo) {
1265+
hwloc_topology_destroy(topo);
1266+
}
1267+
goto CLEANUP;
1268+
}
12431269
}
12441270
}
1271+
12451272
if (!found) {
12461273
/* nope - save the signature and request the complete topology from that node */
12471274
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,

0 commit comments

Comments
 (0)