Skip to content

Commit f146bf9

Browse files
committed
Fix incorrect TCP connections.
If nodes have the same IP addresses (for containers or other purposes) and these addresses get published as part of the modex, a remote peer might try to use one of these addresses to connect. As both nodes have the same IP, there are several cases: - the "remote" port is not used by an OMPI process locally: the connection is refused or it times out. This is the "nicest" outcome, as a new IP will be used, resulting in a successful connection and the continuation of the application. - the "remote" port is used by another OMPI process on the local node: a connection will be established but the incorrect guid will be exchanged, leading to complaints, dropped connections and/or deadlocks. - the "remote" port is used by this process, basically resulting in a connection-to-self. Bad things happen, as we don't support TCP connections to self. Some output messages are generated, but the outcome is most likely a deadlock. Up to now, users were expected to exclude such interfaces from the accepted interfaces, but this patch removes this need. If we discover a local IP as part of the IP list of a remote peer, we drop it and never try to use it. This does not apply to local processes, so we can still use these interfaces for node-level communications (which will work as we will connect to the correct port according to the destination process). Signed-off-by: George Bosilca <[email protected]>
1 parent f0261cb commit f146bf9

File tree

4 files changed

+114
-6
lines changed

4 files changed

+114
-6
lines changed

opal/mca/btl/tcp/btl_tcp_component.c

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
* Copyright (c) 2009 Oak Ridge National Laboratory
1616
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
1717
* reserved.
18-
* Copyright (c) 2013-2015 NVIDIA Corporation. All rights reserved.
18+
* Copyright (c) 2013-2024 NVIDIA Corporation. All rights reserved.
1919
* Copyright (c) 2014-2019 Intel, Inc. All rights reserved.
2020
* Copyright (c) 2014-2017 Research Organization for Information Science
2121
* and Technology (RIST). All rights reserved.
@@ -1502,9 +1502,32 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void *user)
15021502
/* lookup the corresponding process */
15031503
btl_proc = mca_btl_tcp_proc_lookup(&guid);
15041504
if (NULL == btl_proc) {
1505-
opal_show_help("help-mpi-btl-tcp.txt", "server accept cannot find guid", true,
1506-
opal_process_info.nodename, getpid());
1505+
const char *peer = opal_fd_get_peer_name(sd);
1506+
if( 0 == opal_compare_proc(opal_process_info.my_name, guid) ) {
1507+
opal_show_help("help-mpi-btl-tcp.txt", "server cannot accept connection from self", true,
1508+
peer, OPAL_NAME_PRINT(guid),
1509+
opal_process_info.nodename, getpid());
1510+
/**
1511+
* Special case: we used an interface to send data to a remote peer
1512+
* but that interface is only local, and we received our own message.
1513+
* If we just close the socket this will confuse the connection, as
1514+
* it will not be able to know that the interface should not be
1515+
* used. Instead, we can identify ourselves by sending our guid
1516+
* back to ourselves, marking the interface as improper for future
1517+
* communications.
1518+
*/
1519+
if (sizeof(hs_msg) != mca_btl_tcp_send_blocking(sd, &hs_msg, sizeof(hs_msg))) {
1520+
opal_show_help("help-mpi-btl-tcp.txt", "client handshake fail", true,
1521+
opal_process_info.nodename, sizeof(hs_msg),
1522+
"connect ACK failed to send magic-id and guid");
1523+
}
1524+
} else {
1525+
opal_show_help("help-mpi-btl-tcp.txt", "server accept cannot find guid", true,
1526+
OPAL_NAME_PRINT(opal_process_info.my_name), opal_process_info.nodename,
1527+
getpid(), OPAL_NAME_PRINT(guid), peer);
1528+
}
15071529
CLOSE_THE_SOCKET(sd);
1530+
free((char*)peer);
15081531
return;
15091532
}
15101533

opal/mca/btl/tcp/btl_tcp_endpoint.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,14 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t *btl_endpo
791791
CLOSE_THE_SOCKET(btl_endpoint->endpoint_sd);
792792
return OPAL_ERROR;
793793
}
794+
char tmp[2][16];
795+
inet_ntop(AF_INET, &((struct sockaddr_in *)&btl_endpoint->endpoint_btl->tcp_ifaddr)->sin_addr, tmp[0], 16);
796+
inet_ntop(AF_INET, &((struct sockaddr_in *)&endpoint_addr)->sin_addr, tmp[1], 16);
797+
opal_output(0, "proc %s bind socket to %s:%d before connecting to peer %s at %s:%d\n",
798+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
799+
tmp[0], htons(((struct sockaddr_in *) &btl_endpoint->endpoint_btl->tcp_ifaddr)->sin_port),
800+
OPAL_NAME_PRINT(btl_endpoint->endpoint_proc->proc_opal->proc_name),
801+
tmp[1], ntohs(((struct sockaddr_in *) &endpoint_addr)->sin_port));
794802
}
795803
#if OPAL_ENABLE_IPV6
796804
if (endpoint_addr.ss_family == AF_INET6) {

opal/mca/btl/tcp/btl_tcp_proc.c

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,68 @@ mca_btl_tcp_proc_t *mca_btl_tcp_proc_create(opal_proc_t *proc)
409409
goto cleanup;
410410
}
411411

412-
btl_proc->proc_addr_count = size / sizeof(mca_btl_tcp_modex_addr_t);
412+
/**
413+
* If the peer is physically located on another node, remove all interfaces
414+
* that have an IP address identical to any local interface (this will
415+
* remove all the local and virtual interfaces).
416+
*/
417+
size_t count = size / sizeof(mca_btl_tcp_modex_addr_t);
418+
if (!OPAL_PROC_ON_LOCAL_NODE(proc->proc_flags)) {
419+
opal_list_t *local_ifs = &mca_btl_tcp_component.local_ifs;
420+
opal_if_t *local_iter;
421+
char tmp[2][16];
422+
for (uint32_t i = 0; i < count; /* no automatic progress */) {
423+
OPAL_LIST_FOREACH (local_iter, local_ifs, opal_if_t) {
424+
if (MCA_BTL_TCP_AF_INET == remote_addrs[i].addr_family &&
425+
AF_INET == local_iter->af_family) {
426+
if (!memcmp(&((struct sockaddr_in *)&local_iter->if_addr)->sin_addr,
427+
remote_addrs[i].addr,
428+
sizeof(struct in_addr))) {
429+
/* we found a match */
430+
inet_ntop(AF_INET, remote_addrs[i].addr, tmp[0], 16);
431+
inet_ntop(AF_INET, &((struct sockaddr_in *) &local_iter->if_addr)->sin_addr,
432+
tmp[1], 16);
433+
goto match_found;
434+
}
435+
}
436+
#if OPAL_ENABLE_IPV6
437+
else if (MCA_BTL_TCP_AF_INET6 == remote_addrs[i].addr_family &&
438+
AF_INET6 == local_iter->af_family) {
439+
if (!memcmp(&((struct sockaddr_in6 *) &local_iter->if_addr)->sin6_addr,
440+
remote_addrs[i].addr,
441+
sizeof(struct in6_addr))) {
442+
/* we found a match */
443+
inet_ntop(AF_INET6, remote_addrs[i].addr, tmp[0], 16);
444+
inet_ntop(AF_INET6, &((struct sockaddr_in6 *) &local_iter->if_addr)->sin6_addr,
445+
tmp[1], 16);
446+
goto match_found;
447+
}
448+
}
449+
#endif /* OPAL_ENABLE_IPV6 */
450+
}
451+
if (MCA_BTL_TCP_AF_INET == remote_addrs[i].addr_family) {
452+
inet_ntop(AF_INET, remote_addrs[i].addr, tmp[0], 16);
453+
} else {
454+
inet_ntop(AF_INET6, remote_addrs[i].addr, tmp[0], 16);
455+
}
456+
opal_output_verbose(20, opal_btl_base_framework.framework_output,
457+
"btl: tcp: Accept IP %s from %s\n", tmp[0],
458+
OPAL_NAME_PRINT(proc->proc_name));
459+
i++; /* go to the next remote interface */
460+
continue;
461+
match_found:
462+
opal_output_verbose(20, opal_btl_base_framework.framework_output,
463+
"btl: tcp: Drop IP %s from %s because it matches "
464+
"the local IP %s (%s))!\n",
465+
tmp[0], OPAL_NAME_PRINT(proc->proc_name), tmp[1],
466+
local_iter->if_name);
467+
count--;
468+
memmove(&remote_addrs[i], &remote_addrs[i + 1],
469+
(count - i) * sizeof(mca_btl_tcp_modex_addr_t));
470+
break;
471+
}
472+
}
473+
btl_proc->proc_addr_count = count;
413474
btl_proc->proc_addrs = malloc(btl_proc->proc_addr_count * sizeof(mca_btl_tcp_addr_t));
414475
if (NULL == btl_proc->proc_addrs) {
415476
rc = OPAL_ERR_OUT_OF_RESOURCE;

opal/mca/btl/tcp/help-mpi-btl-tcp.txt

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,19 @@ aborting your job.
113113
Flag: %s
114114
Error: %s (%d)
115115
#
116+
[server cannot accept connection from self]
117+
WARNING: Open MPI accepted a TCP connection from what appears to be
118+
the same Open MPI process. This is not supported, please exclude
119+
(using btl_tcp_if_exclude) the interface corresponding to the
120+
following IP address %s
121+
122+
This attempted connection will be ignored; your MPI job may or may not
123+
continue properly.
124+
125+
Process guid: %s
126+
on host: %s
127+
PID: %d
128+
#
116129
[server accept cannot find guid]
117130
WARNING: Open MPI accepted a TCP connection from what appears to be
118131
another Open MPI process but cannot find a corresponding process
@@ -121,8 +134,11 @@ entry for that peer.
121134
This attempted connection will be ignored; your MPI job may or may not
122135
continue properly.
123136

124-
Local host: %s
125-
PID: %d
137+
Process guid: %s
138+
on host: %s
139+
PID: %d
140+
guid: %s
141+
Source IP of socket: %s
126142
#
127143
[server getpeername failed]
128144
WARNING: Open MPI failed to look up the peer IP address information of

0 commit comments

Comments
 (0)