Skip to content

Detect that we have a mix of BE/LE in the system, provide a warning that OMPI doesn't currently support this environment, and error out #3828

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 7, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 1 addition & 16 deletions config/opal_configure_options.m4
Original file line number Diff line number Diff line change
Expand Up @@ -286,22 +286,7 @@ fi
AC_DEFINE_UNQUOTED(OPAL_ENABLE_DLOPEN_SUPPORT, $OPAL_ENABLE_DLOPEN_SUPPORT,
[Whether we want to enable dlopen support])

#
# Heterogeneous support
#

AC_MSG_CHECKING([if want heterogeneous support])
AC_ARG_ENABLE([heterogeneous],
[AC_HELP_STRING([--enable-heterogeneous],
[Enable features required for heterogeneous
platform support (default: disabled)])])
if test "$enable_heterogeneous" = "yes" ; then
AC_MSG_RESULT([yes])
opal_want_heterogeneous=1
else
AC_MSG_RESULT([no])
opal_want_heterogeneous=0
fi
opal_want_heterogeneous=0
AC_DEFINE_UNQUOTED([OPAL_ENABLE_HETEROGENEOUS_SUPPORT],
[$opal_want_heterogeneous],
[Enable features required for heterogeneous support])
Expand Down
2 changes: 1 addition & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ AC_CACHE_SAVE
opal_show_title "Header file tests"

AC_CHECK_HEADERS([alloca.h aio.h arpa/inet.h dirent.h \
dlfcn.h execinfo.h err.h fcntl.h grp.h libgen.h \
dlfcn.h endian.h execinfo.h err.h fcntl.h grp.h libgen.h \
libutil.h memory.h netdb.h netinet/in.h netinet/tcp.h \
poll.h pthread.h pty.h pwd.h sched.h \
strings.h stropts.h linux/ethtool.h linux/sockios.h \
Expand Down
25 changes: 18 additions & 7 deletions opal/mca/hwloc/base/hwloc_base_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_ENDIAN_H
#include <endian.h>
#endif

#include "opal/runtime/opal.h"
#include "opal/constants.h"
Expand Down Expand Up @@ -2163,7 +2166,7 @@ int opal_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char* device_name, op
char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo)
{
int nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt;
char *sig=NULL, *arch=NULL;
char *sig=NULL, *arch = NULL, *endian;
hwloc_obj_t obj;
unsigned i;

Expand All @@ -2183,14 +2186,22 @@ char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo)
break;
}
}

if (NULL == arch) {
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH",
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt);
} else {
asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s",
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, arch);
arch = "unknown";
}

#ifdef __BYTE_ORDER
#if __BYTE_ORDER == __LITTLE_ENDIAN
endian = "le";
#else
endian = "be";
#endif
#else
endian = "unknown";
#endif

asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s:%s",
nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, arch, endian);
return sig;
}

Expand Down
13 changes: 12 additions & 1 deletion orte/mca/plm/base/help-plm-base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2015 Intel, Inc. All rights reserved.
# Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
Expand Down Expand Up @@ -162,3 +162,14 @@ A call was made to launch additional processes, but this process has
no active out-of-band transports and therefore cannot execute this call.
Please check to see if you have the "oob" MCA parameter set and ensure
that it is either unset or at least includes the tcp transport.
#
[multi-endian]
Open MPI does not currently support multi-endian operations. We have
detected that the following node differs in endianness:


Nodename: %s
Endian: %s
Local endian: %s

Please correct the situation and try again.
27 changes: 27 additions & 0 deletions orte/mca/plm/base/plm_base_launch_support.c
Original file line number Diff line number Diff line change
Expand Up @@ -1055,12 +1055,23 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
int i;
bool found;
orte_daemon_cmd_flag_t cmd;
char *myendian;

/* get the daemon job, if necessary */
if (NULL == jdatorted) {
jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
}

/* get my endianness */
t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
if (NULL == t) {
/* should never happen */
myendian = "unknown";
} else {
myendian = strrchr(t->sig, ':');
++myendian;
}

/* multiple daemons could be in this buffer, so unpack until we exhaust the data */
idx = 1;
while (OPAL_SUCCESS == (rc = opal_dss.unpack(buffer, &dname, &idx, ORTE_NAME))) {
Expand Down Expand Up @@ -1240,8 +1251,24 @@ void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
}
free(sig);
break;
} else {
/* check if the difference is due to the endianness */
ptr = strrchr(sig, ':');
++ptr;
if (0 != strcmp(ptr, myendian)) {
/* we don't currently handle multi-endian operations in the
* MPI support */
orte_show_help("help-plm-base", "multi-endian", true,
nodename, ptr, myendian);
orted_failed_launch = true;
if (NULL != topo) {
hwloc_topology_destroy(topo);
}
goto CLEANUP;
}
}
}

if (!found) {
/* nope - save the signature and request the complete topology from that node */
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
Expand Down