From 32277db6aba4e0a794a0c9814a23f6d2930f4170 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 30 Aug 2013 13:57:11 +0200 Subject: [PATCH 1/3] Add support for async progress in the BTL TCP. All BTL-only operations (basically all data movements with the exception of the matching operation) can now be handled for the TCP BTL by a progress thread. --- .../btl_tcp2.c | 84 ++++- .../btl_tcp2_endpoint.c | 330 +++++++++++------- .../btl_tcp2_frag.h | 62 +++- .../btl_tcp2_proc.c | 47 +-- opal/mca/btl/tcp/btl_tcp.h | 100 +++++- opal/mca/btl/tcp/btl_tcp_component.c | 283 +++++++++++++-- opal/mca/btl/tcp/btl_tcp_endpoint.h | 5 +- opal/mca/btl/tcp/btl_tcp_frag.c | 39 ++- opal/mca/btl/tcp/btl_tcp_hdr.h | 2 +- opal/mca/btl/tcp/btl_tcp_proc.h | 2 +- 10 files changed, 737 insertions(+), 217 deletions(-) diff --git a/contrib/build-mca-comps-outside-of-tree/btl_tcp2.c b/contrib/build-mca-comps-outside-of-tree/btl_tcp2.c index bb25a08dee9..59fdc7fe75c 100644 --- a/contrib/build-mca-comps-outside-of-tree/btl_tcp2.c +++ b/contrib/build-mca-comps-outside-of-tree/btl_tcp2.c @@ -32,7 +32,10 @@ #include "opal/datatype/opal_convertor.h" #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/mpool/mpool.h" -#include "ompi/proc/proc.h" +#include "btl_tcp.h" +#include "btl_tcp_frag.h" +#include "btl_tcp_proc.h" +#include "btl_tcp_endpoint.h" mca_btl_tcp2_module_t mca_btl_tcp2_module = { { @@ -57,9 +60,9 @@ mca_btl_tcp2_module_t mca_btl_tcp2_module = { mca_btl_tcp2_prepare_dst, mca_btl_tcp2_send, NULL, /* send immediate */ - mca_btl_tcp2_put, - NULL, /* get */ - mca_btl_base_dump, + mca_btl_tcp_put, + NULL, /* get */ + mca_btl_tcp_dump, NULL, /* mpool */ NULL, /* register error */ mca_btl_tcp2_ft_event @@ -134,7 +137,9 @@ int mca_btl_tcp2_add_procs( struct mca_btl_base_module_t* btl, /* we increase the count of MPI users of the event library once per peer, so that we are used until we aren't connected to a peer */ +#if !MCA_BTL_TCP_USES_PROGRESS_THREAD opal_progress_event_users_increment(); +#endif /* !MCA_BTL_TCP_USES_PROGRESS_THREAD */ } return OMPI_SUCCESS; @@ -153,7 +158,9 @@ int mca_btl_tcp2_del_procs(struct mca_btl_base_module_t* btl, opal_list_remove_item(&tcp_btl->tcp_endpoints, (opal_list_item_t*)tcp_endpoint); OBJ_RELEASE(tcp_endpoint); } +#if !MCA_BTL_TCP_USES_PROGRESS_THREAD opal_progress_event_users_decrement(); +#endif /* !MCA_BTL_TCP_USES_PROGRESS_THREAD */ } return OMPI_SUCCESS; } @@ -183,7 +190,11 @@ mca_btl_base_descriptor_t* mca_btl_tcp2_alloc( if( OPAL_UNLIKELY(NULL == frag) ) { return NULL; } - + +#define GB_DEFINED 0 +#if GB_DEFINED + opal_output(0, "alloc_frag( size = %lu )\n", size); +#endif /* GB_DEFINED */ frag->segments[0].seg_len = size; frag->segments[0].seg_addr.pval = frag+1; @@ -193,7 +204,8 @@ mca_btl_base_descriptor_t* mca_btl_tcp2_alloc( frag->base.des_dst_cnt = 0; frag->base.des_flags = flags; frag->base.order = MCA_BTL_NO_ORDER; - frag->btl = (mca_btl_tcp2_module_t*)btl; + frag->btl = (mca_btl_tcp_module_t*)btl; + frag->endpoint = endpoint; return (mca_btl_base_descriptor_t*)frag; } @@ -296,6 +308,10 @@ mca_btl_base_descriptor_t* mca_btl_tcp2_prepare_src( frag->base.des_flags = flags; frag->base.order = MCA_BTL_NO_ORDER; *size = max_data; +#if GB_DEFINED + opal_output(0, "prepare_src( bConverted = %lu, size = %lu\n", + convertor->bConverted, *size); +#endif /* GB_DEFINED */ return &frag->base; } @@ -343,6 +359,10 @@ mca_btl_base_descriptor_t* mca_btl_tcp2_prepare_dst( frag->base.des_dst_cnt = 1; frag->base.des_flags = flags; frag->base.order = MCA_BTL_NO_ORDER; +#if GB_DEFINED + opal_output(0, " prepare_dst( bConverted = %lu, size = %lu\n", + convertor->bConverted, *size); +#endif /* GB_DEFINED */ return &frag->base; } @@ -384,7 +404,10 @@ int mca_btl_tcp2_send( struct mca_btl_base_module_t* btl, frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_SEND; frag->hdr.count = 0; if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr); - return mca_btl_tcp2_endpoint_send(endpoint,frag); +#if GB_DEFINED + opal_output(0, "frag_send( size = %u )\n", frag->hdr.size ); +#endif /* GB_DEFINED */ + return mca_btl_tcp_endpoint_send(endpoint,frag); } @@ -425,7 +448,10 @@ int mca_btl_tcp2_put( mca_btl_base_module_t* btl, frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_PUT; frag->hdr.count = frag->base.des_dst_cnt; if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr); - return ((i = mca_btl_tcp2_endpoint_send(endpoint,frag)) >= 0 ? OMPI_SUCCESS : i); +#if GB_DEFINED + opal_output(0, "frag_put( size = %u )\n", frag->hdr.size ); +#endif /* GB_DEFINED */ + return ((i = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OMPI_SUCCESS : i); } @@ -462,12 +488,16 @@ int mca_btl_tcp2_get( frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_GET; frag->hdr.count = frag->base.des_src_cnt; if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr); - return ((rc = mca_btl_tcp2_endpoint_send(endpoint,frag)) >= 0 ? OMPI_SUCCESS : rc); +#if GB_DEFINED + opal_output(0, "frag_get( size = %u )\n", frag->hdr.size ); +#endif /* GB_DEFINED */ + return ((rc = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OMPI_SUCCESS : rc); } /* - * Cleanup/release module resources. + * Cleanup/release module resources. This function should only be called once, + * there is no need to protect it. */ int mca_btl_tcp2_finalize(struct mca_btl_base_module_t* btl) @@ -479,8 +509,42 @@ int mca_btl_tcp2_finalize(struct mca_btl_base_module_t* btl) item = opal_list_remove_first(&tcp_btl->tcp_endpoints)) { mca_btl_tcp2_endpoint_t *endpoint = (mca_btl_tcp2_endpoint_t*)item; OBJ_RELEASE(endpoint); +#if !MCA_BTL_TCP_USES_PROGRESS_THREAD opal_progress_event_users_decrement(); +#endif /* !MCA_BTL_TCP_USES_PROGRESS_THREAD */ } free(tcp_btl); return OMPI_SUCCESS; } + +/** + * + */ +void mca_btl_tcp_dump(struct mca_btl_base_module_t* base_btl, + struct mca_btl_base_endpoint_t* endpoint, + int verbose) +{ + mca_btl_tcp_module_t* btl = (mca_btl_tcp_module_t*)base_btl; + mca_btl_base_err("%s TCP %p kernel_id %d\n" +#if MCA_BTL_TCP_STATISTICS + " | statistics: sent %lu recv %lu\n" +#endif /* MCA_BTL_TCP_STATISTICS */ + " | latency %u bandwidth %u\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void*)btl, btl->tcp_ifkindex, +#if MCA_BTL_TCP_STATISTICS + btl->tcp_bytes_sent, btl->btl_bytes_recv, +#endif /* MCA_BTL_TCP_STATISTICS */ + btl->super.btl_latency, btl->super.btl_bandwidth); + if( NULL != endpoint ) { + mca_btl_tcp_endpoint_dump( endpoint, "TCP" ); + } else if( verbose ) { + opal_list_item_t *item; + + for(item = opal_list_get_first(&btl->tcp_endpoints); + item != opal_list_get_end(&btl->tcp_endpoints); + item = opal_list_get_next(item)) { + mca_btl_tcp_endpoint_dump( (mca_btl_base_endpoint_t*)item, "TCP" ); + } + } +} + diff --git a/contrib/build-mca-comps-outside-of-tree/btl_tcp2_endpoint.c b/contrib/build-mca-comps-outside-of-tree/btl_tcp2_endpoint.c index f198654c793..778769d16ca 100644 --- a/contrib/build-mca-comps-outside-of-tree/btl_tcp2_endpoint.c +++ b/contrib/build-mca-comps-outside-of-tree/btl_tcp2_endpoint.c @@ -49,18 +49,15 @@ #include #endif /* HAVE_TIME_H */ -#include "opal/mca/event/event.h" - -#include "ompi/types.h" -#include "ompi/mca/btl/base/btl_base_error.h" #include "opal/util/net.h" +#include "opal/util/fd.h" +#include "opal/util/show_help.h" +#include "ompi/mca/btl/base/btl_base_error.h" +#include "ompi/mca/rte/rte.h" -#include "btl_tcp2.h" -#include "btl_tcp2_endpoint.h" -#include "btl_tcp2_proc.h" -#include "btl_tcp2_frag.h" -#include "btl_tcp2_addr.h" - +#include "btl_tcp_endpoint.h" +#include "btl_tcp_proc.h" +#include "btl_tcp_frag.h" /* * Initialize state of the endpoint instance. @@ -123,12 +120,10 @@ static void mca_btl_tcp2_endpoint_send_handler(int sd, short flags, void* user); * diagnostics */ -#if WANT_PEER_DUMP -static void mca_btl_tcp2_endpoint_dump(mca_btl_base_endpoint_t* btl_endpoint, const char* msg) +void mca_btl_tcp_endpoint_dump(mca_btl_base_endpoint_t* btl_endpoint, const char* msg) { - char src[64]; - char dst[64]; - int sndbuf,rcvbuf,nodelay,flags; + char src[64], dst[64], *status; + int sndbuf, rcvbuf, nodelay, flags = -1; #if OPAL_ENABLE_IPV6 struct sockaddr_storage inaddr; #else @@ -136,69 +131,102 @@ static void mca_btl_tcp2_endpoint_dump(mca_btl_base_endpoint_t* btl_endpoint, co #endif opal_socklen_t obtlen; opal_socklen_t addrlen = sizeof(inaddr); + opal_list_item_t *item; - getsockname(btl_endpoint->endpoint_sd, (struct sockaddr*)&inaddr, &addrlen); + if( -1 != btl_endpoint->endpoint_sd ) { + getsockname(btl_endpoint->endpoint_sd, (struct sockaddr*)&inaddr, &addrlen); #if OPAL_ENABLE_IPV6 - { - char *address; - address = (char *) opal_net_get_hostname((struct sockaddr*) &inaddr); - if (NULL != address) { - sprintf(src, "%s", address); + { + char *address; + address = (char *) opal_net_get_hostname((struct sockaddr*) &inaddr); + if (NULL != address) { + sprintf(src, "%s", address); + } } - } #else - sprintf(src, "%s", inet_ntoa(inaddr.sin_addr)); + sprintf(src, "%s", inet_ntoa(inaddr.sin_addr)); #endif - getpeername(btl_endpoint->endpoint_sd, (struct sockaddr*)&inaddr, &addrlen); + getpeername(btl_endpoint->endpoint_sd, (struct sockaddr*)&inaddr, &addrlen); #if OPAL_ENABLE_IPV6 - { - char *address; - address = (char *) opal_net_get_hostname ((struct sockaddr*) &inaddr); - if (NULL != address) { - sprintf(dst, "%s", address); + { + char *address; + address = (char *) opal_net_get_hostname ((struct sockaddr*) &inaddr); + if (NULL != address) { + sprintf(dst, "%s", address); + } } - } #else - sprintf(dst, "%s", inet_ntoa(inaddr.sin_addr)); + sprintf(dst, "%s", inet_ntoa(inaddr.sin_addr)); #endif - if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) { - BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)", - strerror(opal_socket_errno), opal_socket_errno)); - } + if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) { + BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)", + strerror(opal_socket_errno), opal_socket_errno)); + } #if defined(SO_SNDBUF) - obtlen = sizeof(sndbuf); - if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_SNDBUF, (char *)&sndbuf, &obtlen) < 0) { - BTL_ERROR(("SO_SNDBUF option: %s (%d)", - strerror(opal_socket_errno), opal_socket_errno)); - } + obtlen = sizeof(sndbuf); + if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_SNDBUF, (char *)&sndbuf, &obtlen) < 0) { + BTL_ERROR(("SO_SNDBUF option: %s (%d)", + strerror(opal_socket_errno), opal_socket_errno)); + } #else - sndbuf = -1; + sndbuf = -1; #endif #if defined(SO_RCVBUF) - obtlen = sizeof(rcvbuf); - if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_RCVBUF, (char *)&rcvbuf, &obtlen) < 0) { - BTL_ERROR(("SO_RCVBUF option: %s (%d)", - strerror(opal_socket_errno), opal_socket_errno)); - } + obtlen = sizeof(rcvbuf); + if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_RCVBUF, (char *)&rcvbuf, &obtlen) < 0) { + BTL_ERROR(("SO_RCVBUF option: %s (%d)", + strerror(opal_socket_errno), opal_socket_errno)); + } #else - rcvbuf = -1; + rcvbuf = -1; #endif #if defined(TCP_NODELAY) - obtlen = sizeof(nodelay); - if(getsockopt(btl_endpoint->endpoint_sd, IPPROTO_TCP, TCP_NODELAY, (char *)&nodelay, &obtlen) < 0) { - BTL_ERROR(("TCP_NODELAY option: %s (%d)", - strerror(opal_socket_errno), opal_socket_errno)); - } + obtlen = sizeof(nodelay); + if(getsockopt(btl_endpoint->endpoint_sd, IPPROTO_TCP, TCP_NODELAY, (char *)&nodelay, &obtlen) < 0) { + BTL_ERROR(("TCP_NODELAY option: %s (%d)", + strerror(opal_socket_errno), opal_socket_errno)); + } #else - nodelay = 0; + nodelay = 0; #endif + } + + mca_btl_base_err("%s %s: endpoint %p src %s - dst %s nodelay %d sndbuf %d rcvbuf %d flags %08x\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg, (void*)btl_endpoint, src, dst, nodelay, sndbuf, rcvbuf, flags); - BTL_VERBOSE(("%s: %s - %s nodelay %d sndbuf %d rcvbuf %d flags %08x", - msg, src, dst, nodelay, sndbuf, rcvbuf, flags)); + switch(btl_endpoint->endpoint_state) { + case MCA_BTL_TCP_CONNECTING: + status = "connecting"; break; + case MCA_BTL_TCP_CONNECT_ACK: + status = "connect ack"; break; + case MCA_BTL_TCP_CLOSED: + status = "closed"; break; + case MCA_BTL_TCP_FAILED: + status = "failed"; break; + case MCA_BTL_TCP_CONNECTED: + status = "connected"; break; + default: + status = "undefined"; break; + } + mca_btl_base_err("%s | [socket %d] [state %s] (nbo %s) (retries %u)\n" +#if MCA_BTL_TCP_ENDPOINT_CACHE + "\tcache %p length %lu pos %ld\n" +#endif /* MCA_BTL_TCP_ENDPOINT_CACHE */ + "\tpending: send %p recv %p\n", + msg, btl_endpoint->endpoint_sd, status, + (btl_endpoint->endpoint_nbo ? "true" : "false"), btl_endpoint->endpoint_retries, +#if MCA_BTL_TCP_ENDPOINT_CACHE + btl_endpoint->endpoint_cache, btl_endpoint->endpoint_cache_length, btl_endpoint->endpoint_cache_pos - btl_endpoint->endpoint_cache, +#endif /* MCA_BTL_TCP_ENDPOINT_CACHE */ + (void*)btl_endpoint->endpoint_send_frag, (void*)btl_endpoint->endpoint_recv_frag ); + for(item = opal_list_get_first(&btl_endpoint->endpoint_frags); + item != opal_list_get_end(&btl_endpoint->endpoint_frags); + item = opal_list_get_next(item)) { + mca_btl_tcp_dump_frag( (mca_btl_tcp_frag_t*)item, " | send" ); + } } -#endif /* * Initialize events to be used by the endpoint instance for TCP select/poll callbacks. @@ -211,22 +239,22 @@ static inline void mca_btl_tcp2_endpoint_event_init(mca_btl_base_endpoint_t* btl btl_endpoint->endpoint_cache_pos = btl_endpoint->endpoint_cache; #endif /* MCA_BTL_TCP_ENDPOINT_CACHE */ - opal_event_set(opal_event_base, &btl_endpoint->endpoint_recv_event, - btl_endpoint->endpoint_sd, - OPAL_EV_READ|OPAL_EV_PERSIST, - mca_btl_tcp2_endpoint_recv_handler, - btl_endpoint ); + opal_event_set(mca_btl_tcp_event_base, &btl_endpoint->endpoint_recv_event, + btl_endpoint->endpoint_sd, + OPAL_EV_READ|OPAL_EV_PERSIST, + mca_btl_tcp_endpoint_recv_handler, + btl_endpoint ); /** * The send event should be non persistent until the endpoint is * completely connected. This means, when the event is created it * will be fired only once, and when the endpoint is marked as * CONNECTED the event should be recreated with the correct flags. */ - opal_event_set(opal_event_base, &btl_endpoint->endpoint_send_event, - btl_endpoint->endpoint_sd, - OPAL_EV_WRITE, - mca_btl_tcp2_endpoint_send_handler, - btl_endpoint); + opal_event_set(mca_btl_tcp_event_base, &btl_endpoint->endpoint_send_event, + btl_endpoint->endpoint_sd, + OPAL_EV_WRITE, + mca_btl_tcp_endpoint_send_handler, + btl_endpoint); } @@ -239,7 +267,7 @@ int mca_btl_tcp2_endpoint_send(mca_btl_base_endpoint_t* btl_endpoint, mca_btl_tc { int rc = OMPI_SUCCESS; - OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock); + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&btl_endpoint->endpoint_send_lock); switch(btl_endpoint->endpoint_state) { case MCA_BTL_TCP_CONNECTING: case MCA_BTL_TCP_CONNECT_ACK: @@ -257,19 +285,18 @@ int mca_btl_tcp2_endpoint_send(mca_btl_base_endpoint_t* btl_endpoint, mca_btl_tc if(frag->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY && mca_btl_tcp2_frag_send(frag, btl_endpoint->endpoint_sd)) { int btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - - OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); - if( frag->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK ) { - frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, &frag->base, frag->rc); - } - if( btl_ownership ) { - MCA_BTL_TCP_FRAG_RETURN(frag); - } + opal_mutex_atomic_unlock(&btl_endpoint->endpoint_send_lock); + MCA_BTL_TCP_COMPLETE_FRAG_SEND(frag); return 1; } else { btl_endpoint->endpoint_send_frag = frag; - opal_event_add(&btl_endpoint->endpoint_send_event, 0); frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; +#define GB_DEFINED 0 +#if GB_DEFINED + opal_output(0, "%s:%d add the send event on socket %d\n", + __FILE__, __LINE__, btl_endpoint->endpoint_sd); /* GB */ +#endif /* GB_DEFINED */ + MCA_BTL_TCP_ACTIVATE_EVENT(&btl_endpoint->endpoint_send_event, 0); } } else { frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; @@ -277,7 +304,7 @@ int mca_btl_tcp2_endpoint_send(mca_btl_base_endpoint_t* btl_endpoint, mca_btl_tc } break; } - OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&btl_endpoint->endpoint_send_lock); return rc; } @@ -338,22 +365,20 @@ static int mca_btl_tcp2_endpoint_send_connect_ack(mca_btl_base_endpoint_t* btl_e bool mca_btl_tcp2_endpoint_accept(mca_btl_base_endpoint_t* btl_endpoint, struct sockaddr* addr, int sd) { - mca_btl_tcp2_proc_t* this_proc = mca_btl_tcp2_proc_local(); - mca_btl_tcp2_proc_t *endpoint_proc = btl_endpoint->endpoint_proc; + mca_btl_tcp_proc_t *endpoint_proc = btl_endpoint->endpoint_proc; + const orte_process_name_t *this_proc = &(ompi_proc_local()->proc_name); int cmpval; - OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock); - OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock); - if(NULL == btl_endpoint->endpoint_addr) { - OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); - OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); return false; } - cmpval = orte_util_compare_name_fields(ORTE_NS_CMP_ALL, + OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock); + OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock); + + cmpval = ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, &endpoint_proc->proc_ompi->proc_name, - &this_proc->proc_ompi->proc_name); + this_proc); if((btl_endpoint->endpoint_sd < 0) || (btl_endpoint->endpoint_state != MCA_BTL_TCP_CONNECTED && cmpval < 0)) { @@ -365,9 +390,16 @@ bool mca_btl_tcp2_endpoint_accept(mca_btl_base_endpoint_t* btl_endpoint, OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); return false; } - mca_btl_tcp2_endpoint_event_init(btl_endpoint); - opal_event_add(&btl_endpoint->endpoint_recv_event, 0); - mca_btl_tcp2_endpoint_connected(btl_endpoint); + mca_btl_tcp_endpoint_event_init(btl_endpoint); + /* NOT NEEDED if we remove the PERSISTENT flag when we create the + * first recv_event. + */ +#if GB_DEFINED + opal_output(0, "%s:%d add the recv event on socket %d\n", + __FILE__, __LINE__, btl_endpoint->endpoint_sd); /* GB */ +#endif /* GB_DEFINED */ + opal_event_add(&btl_endpoint->endpoint_recv_event, 0); /* TODO */ + mca_btl_tcp_endpoint_connected(btl_endpoint); #if OPAL_ENABLE_DEBUG && WANT_PEER_DUMP mca_btl_tcp2_endpoint_dump(btl_endpoint, "accepted"); #endif @@ -388,16 +420,19 @@ bool mca_btl_tcp2_endpoint_accept(mca_btl_base_endpoint_t* btl_endpoint, */ void mca_btl_tcp2_endpoint_close(mca_btl_base_endpoint_t* btl_endpoint) { - if(btl_endpoint->endpoint_sd < 0) - return; - btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED; + int sd = btl_endpoint->endpoint_sd; + + do { + if( sd < 0 ) return; + } while ( opal_atomic_cmpset( &(btl_endpoint->endpoint_sd), sd, -1 ) ); + + CLOSE_THE_SOCKET(sd); btl_endpoint->endpoint_retries++; opal_event_del(&btl_endpoint->endpoint_recv_event); opal_event_del(&btl_endpoint->endpoint_send_event); - CLOSE_THE_SOCKET(btl_endpoint->endpoint_sd); - btl_endpoint->endpoint_sd = -1; #if MCA_BTL_TCP_ENDPOINT_CACHE - free( btl_endpoint->endpoint_cache ); + if( NULL != btl_endpoint->endpoint_cache ) + free( btl_endpoint->endpoint_cache ); btl_endpoint->endpoint_cache = NULL; btl_endpoint->endpoint_cache_pos = NULL; btl_endpoint->endpoint_cache_length = 0; @@ -417,16 +452,21 @@ static void mca_btl_tcp2_endpoint_connected(mca_btl_base_endpoint_t* btl_endpoin btl_endpoint->endpoint_retries = 0; /* Create the send event in a persistent manner. */ - opal_event_set(opal_event_base, &btl_endpoint->endpoint_send_event, - btl_endpoint->endpoint_sd, - OPAL_EV_WRITE | OPAL_EV_PERSIST, - mca_btl_tcp2_endpoint_send_handler, - btl_endpoint ); + opal_event_set(mca_btl_tcp_event_base, &btl_endpoint->endpoint_send_event, + btl_endpoint->endpoint_sd, + OPAL_EV_WRITE | OPAL_EV_PERSIST, + mca_btl_tcp_endpoint_send_handler, + btl_endpoint ); if(opal_list_get_size(&btl_endpoint->endpoint_frags) > 0) { - if(NULL == btl_endpoint->endpoint_send_frag) - btl_endpoint->endpoint_send_frag = (mca_btl_tcp2_frag_t*) + if(NULL == btl_endpoint->endpoint_send_frag) { + btl_endpoint->endpoint_send_frag = (mca_btl_tcp_frag_t*) opal_list_remove_first(&btl_endpoint->endpoint_frags); + } +#if GB_DEFINED + opal_output(0, "%s:%d add the send event on socket %d\n", + __FILE__, __LINE__, btl_endpoint->endpoint_sd); /* GB */ +#endif /* GB_DEFINED */ opal_event_add(&btl_endpoint->endpoint_send_event, 0); } } @@ -578,7 +618,11 @@ static int mca_btl_tcp2_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endp /* non-blocking so wait for completion */ if(opal_socket_errno == EINPROGRESS || opal_socket_errno == EWOULDBLOCK) { btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECTING; - opal_event_add(&btl_endpoint->endpoint_send_event, 0); +#if GB_DEFINED + opal_output(0, "%s:%d add the send event on socket %d\n", + __FILE__, __LINE__, btl_endpoint->endpoint_sd); /* GB */ +#endif /* GB_DEFINED */ + MCA_BTL_TCP_ACTIVATE_EVENT(&btl_endpoint->endpoint_send_event, 0); return OMPI_SUCCESS; } { @@ -597,7 +641,11 @@ static int mca_btl_tcp2_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endp /* send our globally unique process identifier to the endpoint */ if((rc = mca_btl_tcp2_endpoint_send_connect_ack(btl_endpoint)) == OMPI_SUCCESS) { btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK; - opal_event_add(&btl_endpoint->endpoint_recv_event, 0); +#if GB_DEFINED + opal_output(0, "%s:%d add the recv event on socket %d\n", + __FILE__, __LINE__, btl_endpoint->endpoint_sd); /* GB */ +#endif /* GB_DEFINED */ + MCA_BTL_TCP_ACTIVATE_EVENT(&btl_endpoint->endpoint_recv_event, 0); } else { mca_btl_tcp2_endpoint_close(btl_endpoint); } @@ -619,6 +667,10 @@ static void mca_btl_tcp2_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_ mca_btl_tcp2_proc_tosocks(btl_endpoint->endpoint_addr, &endpoint_addr); /* unregister from receiving event notifications */ +#if GB_DEFINED + opal_output(0, "%s:%d remove the send event on socket %d\n", + __FILE__, __LINE__, btl_endpoint->endpoint_sd); /* GB */ +#endif /* GB_DEFINED */ opal_event_del(&btl_endpoint->endpoint_send_event); /* check connect completion status */ @@ -630,6 +682,10 @@ static void mca_btl_tcp2_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_ return; } if(so_error == EINPROGRESS || so_error == EWOULDBLOCK) { +#if GB_DEFINED + opal_output(0, "%s:%d add the send event on socket %d\n", + __FILE__, __LINE__, btl_endpoint->endpoint_sd); /* GB */ +#endif /* GB_DEFINED */ opal_event_add(&btl_endpoint->endpoint_send_event, 0); return; } @@ -643,6 +699,10 @@ static void mca_btl_tcp2_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_ if(mca_btl_tcp2_endpoint_send_connect_ack(btl_endpoint) == OMPI_SUCCESS) { btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK; +#if GB_DEFINED + opal_output(0, "%s:%d add the recv event on socket %d\n", + __FILE__, __LINE__, btl_endpoint->endpoint_sd); /* GB */ +#endif /* GB_DEFINED */ opal_event_add(&btl_endpoint->endpoint_recv_event, 0); } else { mca_btl_tcp2_endpoint_close(btl_endpoint); @@ -688,9 +748,12 @@ static void mca_btl_tcp2_endpoint_recv_handler(int sd, short flags, void* user) mca_btl_tcp2_frag_t* frag; frag = btl_endpoint->endpoint_recv_frag; + + data_still_pending_on_endpoint: if(NULL == frag) { - if(mca_btl_tcp2_module.super.btl_max_send_size > - mca_btl_tcp2_module.super.btl_eager_limit) { + + if(mca_btl_tcp_module.super.btl_max_send_size > + mca_btl_tcp_module.super.btl_eager_limit) { MCA_BTL_TCP_FRAG_ALLOC_MAX(frag); } else { MCA_BTL_TCP_FRAG_ALLOC_EAGER(frag); @@ -703,30 +766,32 @@ static void mca_btl_tcp2_endpoint_recv_handler(int sd, short flags, void* user) MCA_BTL_TCP_FRAG_INIT_DST(frag, btl_endpoint); } -#if MCA_BTL_TCP_ENDPOINT_CACHE - assert( 0 == btl_endpoint->endpoint_cache_length ); - data_still_pending_on_endpoint: -#endif /* MCA_BTL_TCP_ENDPOINT_CACHE */ /* check for completion of non-blocking recv on the current fragment */ - if(mca_btl_tcp2_frag_recv(frag, btl_endpoint->endpoint_sd) == false) { + if( mca_btl_tcp_frag_recv(frag, btl_endpoint->endpoint_sd) == false ) { btl_endpoint->endpoint_recv_frag = frag; } else { btl_endpoint->endpoint_recv_frag = NULL; - if( MCA_BTL_TCP_HDR_TYPE_SEND == frag->hdr.type ) { - mca_btl_active_message_callback_t* reg; - reg = mca_btl_base_active_message_trigger + frag->hdr.base.tag; - reg->cbfunc(&frag->btl->super, frag->hdr.base.tag, &frag->base, reg->cbdata); - } + + TODO_MCA_BTL_TCP_RECV_TRIGGER_CB(frag); + #if MCA_BTL_TCP_ENDPOINT_CACHE if( 0 != btl_endpoint->endpoint_cache_length ) { +#if MCA_BTL_TCP_USES_PROGRESS_THREAD + /* Get a new fragment and try again */ + frag = NULL; +#else /* If the cache still contain some data we can reuse the same fragment * until we flush it completly. */ MCA_BTL_TCP_FRAG_INIT_DST(frag, btl_endpoint); +#endif /* MCA_BTL_TCP_USES_PROGRESS_THREAD */ goto data_still_pending_on_endpoint; } #endif /* MCA_BTL_TCP_ENDPOINT_CACHE */ + +#if !MCA_BTL_TCP_USES_PROGRESS_THREAD MCA_BTL_TCP_FRAG_RETURN(frag); +#endif /* MCA_BTL_TCP_USES_PROGRESS_THREAD */ } #if MCA_BTL_TCP_ENDPOINT_CACHE assert( 0 == btl_endpoint->endpoint_cache_length ); @@ -741,12 +806,13 @@ static void mca_btl_tcp2_endpoint_recv_handler(int sd, short flags, void* user) * of the MPI_Finalize. The first one will close the connections, * and all others will complain. */ - OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); break; default: - OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); + OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock); BTL_ERROR(("invalid socket state(%d)", btl_endpoint->endpoint_state)); - mca_btl_tcp2_endpoint_close(btl_endpoint); + btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED; + mca_btl_tcp_endpoint_close(btl_endpoint); + OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock); break; } } @@ -759,8 +825,8 @@ static void mca_btl_tcp2_endpoint_recv_handler(int sd, short flags, void* user) static void mca_btl_tcp2_endpoint_send_handler(int sd, short flags, void* user) { - mca_btl_tcp2_endpoint_t* btl_endpoint = (mca_btl_tcp2_endpoint_t *)user; - OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock); + mca_btl_tcp_endpoint_t* btl_endpoint = (mca_btl_tcp_endpoint_t *)user; + opal_mutex_atomic_lock(&btl_endpoint->endpoint_send_lock); switch(btl_endpoint->endpoint_state) { case MCA_BTL_TCP_CONNECTING: mca_btl_tcp2_endpoint_complete_connect(btl_endpoint); @@ -779,27 +845,31 @@ static void mca_btl_tcp2_endpoint_send_handler(int sd, short flags, void* user) opal_list_remove_first(&btl_endpoint->endpoint_frags); /* if required - update request status and release fragment */ - OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); + opal_mutex_atomic_unlock(&btl_endpoint->endpoint_send_lock); assert( frag->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK ); - frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, &frag->base, frag->rc); - if( btl_ownership ) { - MCA_BTL_TCP_FRAG_RETURN(frag); - } - OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock); - + TODO_MCA_BTL_TCP_COMPLETE_FRAG_SEND(frag); + opal_mutex_atomic_lock(&btl_endpoint->endpoint_send_lock); } - /* if nothing else to do unregister for send event notifications */ + /* if no more data to send unregister the send notifications */ if(NULL == btl_endpoint->endpoint_send_frag) { +#if GB_DEFINED + opal_output(0, "%s:%d remove the send event on socket %d\n", + __FILE__, __LINE__, sd); /* GB */ +#endif /* GB_DEFINED */ opal_event_del(&btl_endpoint->endpoint_send_event); } break; default: BTL_ERROR(("invalid connection state (%d)", btl_endpoint->endpoint_state)); +#if GB_DEFINED + opal_output(0, "%s:%d remove the send event on socket %d\n", + __FILE__, __LINE__, sd); /* GB */ +#endif /* GB_DEFINED */ opal_event_del(&btl_endpoint->endpoint_send_event); break; } - OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); + opal_mutex_atomic_unlock(&btl_endpoint->endpoint_send_lock); } diff --git a/contrib/build-mca-comps-outside-of-tree/btl_tcp2_frag.h b/contrib/build-mca-comps-outside-of-tree/btl_tcp2_frag.h index 2f8d78f9dd6..661e271683b 100644 --- a/contrib/build-mca-comps-outside-of-tree/btl_tcp2_frag.h +++ b/contrib/build-mca-comps-outside-of-tree/btl_tcp2_frag.h @@ -41,6 +41,14 @@ BEGIN_C_DECLS #define MCA_BTL_TCP_FRAG_IOVEC_NUMBER 4 +/** + * Commands for the threaded version when the fragments must be completed + * by one of the MPI bounded threads. + */ +#define MCA_BTL_TCP_FRAG_STEP_UNDEFINED ((uint16_t)0x0000) +#define MCA_BTL_TCP_FRAG_STEP_SEND_COMPLETE ((uint16_t)0x0001) +#define MCA_BTL_TCP_FRAG_STEP_RECV_COMPLETE ((uint16_t)0x0002) + /** * TCP fragment derived type. */ @@ -82,49 +90,77 @@ OBJ_CLASS_DECLARATION(mca_btl_tcp2_frag_user_t); #define MCA_BTL_TCP_FRAG_ALLOC_EAGER(frag) \ { \ ompi_free_list_item_t *item; \ - OMPI_FREE_LIST_GET(&mca_btl_tcp2_component.tcp_frag_eager, item); \ - frag = (mca_btl_tcp2_frag_t*) item; \ + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_component.tcp_frag_eager_mutex); \ + OMPI_FREE_LIST_GET_MT(&mca_btl_tcp_component.tcp_frag_eager, item); \ + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_component.tcp_frag_eager_mutex); \ + frag = (mca_btl_tcp_frag_t*) item; \ } #define MCA_BTL_TCP_FRAG_ALLOC_MAX(frag) \ { \ ompi_free_list_item_t *item; \ - OMPI_FREE_LIST_GET(&mca_btl_tcp2_component.tcp_frag_max, item); \ - frag = (mca_btl_tcp2_frag_t*) item; \ + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_component.tcp_frag_max_mutex); \ + OMPI_FREE_LIST_GET_MT(&mca_btl_tcp_component.tcp_frag_max, item); \ + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_component.tcp_frag_max_mutex); \ + frag = (mca_btl_tcp_frag_t*) item; \ } #define MCA_BTL_TCP_FRAG_ALLOC_USER(frag) \ { \ ompi_free_list_item_t *item; \ - OMPI_FREE_LIST_GET(&mca_btl_tcp2_component.tcp_frag_user, item); \ - frag = (mca_btl_tcp2_frag_t*) item; \ + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_component.tcp_frag_user_mutex); \ + OMPI_FREE_LIST_GET_MT(&mca_btl_tcp_component.tcp_frag_user, item); \ + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_component.tcp_frag_user_mutex); \ + frag = (mca_btl_tcp_frag_t*) item; \ } +#if MCA_BTL_TCP_USES_PROGRESS_THREAD #define MCA_BTL_TCP_FRAG_RETURN(frag) \ { \ - OMPI_FREE_LIST_RETURN(frag->my_list, (ompi_free_list_item_t*)(frag)); \ + (frag)->next_step = MCA_BTL_TCP_FRAG_STEP_UNDEFINED; \ + if( frag->my_list == &mca_btl_tcp_component.tcp_frag_eager ) { \ + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_component.tcp_frag_eager_mutex); \ + OMPI_FREE_LIST_RETURN_MT(frag->my_list, (ompi_free_list_item_t*)(frag)); \ + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_component.tcp_frag_eager_mutex); \ + } else if( frag->my_list == &mca_btl_tcp_component.tcp_frag_max ) { \ + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_component.tcp_frag_max_mutex); \ + OMPI_FREE_LIST_RETURN_MT(frag->my_list, (ompi_free_list_item_t*)(frag)); \ + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_component.tcp_frag_max_mutex); \ + } else { \ + assert( frag->my_list == &mca_btl_tcp_component.tcp_frag_user ); \ + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_component.tcp_frag_user_mutex); \ + OMPI_FREE_LIST_RETURN_MT(frag->my_list, (ompi_free_list_item_t*)(frag)); \ + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_component.tcp_frag_user_mutex); \ + } \ } +#else +#define MCA_BTL_TCP_FRAG_RETURN(frag) \ +{ \ + (frag)->next_step = MCA_BTL_TCP_FRAG_STEP_UNDEFINED; \ + OMPI_FREE_LIST_RETURN_MT(frag->my_list, (ompi_free_list_item_t*)(frag)); \ +} +#endif /* MCA_BTL_TCP_USES_PROGRESS_THREAD */ #define MCA_BTL_TCP_FRAG_INIT_DST(frag,ep) \ do { \ - frag->rc = 0; \ - frag->btl = ep->endpoint_btl; \ + frag->base.des_src = NULL; \ + frag->base.des_src_cnt = 0; \ + frag->base.des_dst = frag->segments; \ + frag->base.des_dst_cnt = 1; \ frag->endpoint = ep; \ frag->iov[0].iov_len = sizeof(frag->hdr); \ frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr; \ frag->iov_cnt = 1; \ frag->iov_idx = 0; \ frag->iov_ptr = frag->iov; \ - frag->base.des_src = NULL; \ - frag->base.des_dst_cnt = 0; \ - frag->base.des_dst = frag->segments; \ - frag->base.des_dst_cnt = 1; \ + frag->rc = 0; \ } while(0) bool mca_btl_tcp2_frag_send(mca_btl_tcp2_frag_t*, int sd); bool mca_btl_tcp2_frag_recv(mca_btl_tcp2_frag_t*, int sd); +void mca_btl_tcp_dump_frag( mca_btl_tcp_frag_t* frag, char* msg ); END_C_DECLS #endif diff --git a/contrib/build-mca-comps-outside-of-tree/btl_tcp2_proc.c b/contrib/build-mca-comps-outside-of-tree/btl_tcp2_proc.c index 15ae3a2c49c..2834faa45a3 100644 --- a/contrib/build-mca-comps-outside-of-tree/btl_tcp2_proc.c +++ b/contrib/build-mca-comps-outside-of-tree/btl_tcp2_proc.c @@ -3,6 +3,7 @@ * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2010 The University of Tennessee and The University + * Copyright (c) 2004-2012 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -77,10 +78,10 @@ void mca_btl_tcp2_proc_construct(mca_btl_tcp2_proc_t* tcp_proc) void mca_btl_tcp2_proc_destruct(mca_btl_tcp2_proc_t* tcp_proc) { /* remove from list of all proc instances */ - OPAL_THREAD_LOCK(&mca_btl_tcp2_component.tcp_lock); - opal_proc_table_remove_value(&mca_btl_tcp2_component.tcp_procs, - tcp_proc->proc_ompi->proc_name); - OPAL_THREAD_UNLOCK(&mca_btl_tcp2_component.tcp_lock); + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_component.tcp_lock); + opal_hash_table_remove_value_uint64(&mca_btl_tcp_component.tcp_procs, + ompi_rte_hash_name(&tcp_proc->proc_ompi->proc_name)); + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_component.tcp_lock); /* release resources */ if(NULL != tcp_proc->proc_endpoints) { @@ -103,11 +104,11 @@ mca_btl_tcp2_proc_t* mca_btl_tcp2_proc_create(ompi_proc_t* ompi_proc) size_t size; mca_btl_tcp2_proc_t* btl_proc; - OPAL_THREAD_LOCK(&mca_btl_tcp2_component.tcp_lock); - rc = opal_proc_table_get_value(&mca_btl_tcp2_component.tcp_procs, - ompi_proc->proc_name, (void**)&btl_proc); + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_component.tcp_lock); + rc = opal_hash_table_get_value_uint64(&mca_btl_tcp_component.tcp_procs, + hash, (void**)&btl_proc); if(OMPI_SUCCESS == rc) { - OPAL_THREAD_UNLOCK(&mca_btl_tcp2_component.tcp_lock); + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_component.tcp_lock); return btl_proc; } @@ -117,9 +118,9 @@ mca_btl_tcp2_proc_t* mca_btl_tcp2_proc_create(ompi_proc_t* ompi_proc) btl_proc->proc_ompi = ompi_proc; /* add to hash table of all proc instance */ - opal_proc_table_set_value(&mca_btl_tcp2_component.tcp_procs, - ompi_proc->proc_name, btl_proc); - OPAL_THREAD_UNLOCK(&mca_btl_tcp2_component.tcp_lock); + opal_hash_table_set_value_uint64(&mca_btl_tcp_component.tcp_procs, + hash, btl_proc); + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_component.tcp_lock); /* lookup tcp parameters exported by this proc */ rc = ompi_modex_recv( &mca_btl_tcp2_component.super.btl_version, @@ -681,8 +682,8 @@ int mca_btl_tcp2_proc_insert( mca_btl_tcp2_proc_t* btl_proc, int mca_btl_tcp2_proc_remove(mca_btl_tcp2_proc_t* btl_proc, mca_btl_base_endpoint_t* btl_endpoint) { size_t i; - OPAL_THREAD_LOCK(&btl_proc->proc_lock); - for(i=0; iproc_endpoint_count; i++) { + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&btl_proc->proc_lock); + for( i = 0; i < btl_proc->proc_endpoint_count; i++ ) { if(btl_proc->proc_endpoints[i] == btl_endpoint) { memmove(btl_proc->proc_endpoints+i, btl_proc->proc_endpoints+i+1, (btl_proc->proc_endpoint_count-i-1)*sizeof(mca_btl_base_endpoint_t*)); @@ -700,7 +701,7 @@ int mca_btl_tcp2_proc_remove(mca_btl_tcp2_proc_t* btl_proc, mca_btl_base_endpoin break; } } - OPAL_THREAD_UNLOCK(&btl_proc->proc_lock); + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&btl_proc->proc_lock); return OMPI_SUCCESS; } @@ -710,11 +711,11 @@ int mca_btl_tcp2_proc_remove(mca_btl_tcp2_proc_t* btl_proc, mca_btl_base_endpoin */ mca_btl_tcp2_proc_t* mca_btl_tcp2_proc_lookup(const orte_process_name_t *name) { - mca_btl_tcp2_proc_t* proc = NULL; - OPAL_THREAD_LOCK(&mca_btl_tcp2_component.tcp_lock); - opal_proc_table_get_value(&mca_btl_tcp2_component.tcp_procs, - name->proc_name, (void**)&proc); - OPAL_THREAD_UNLOCK(&mca_btl_tcp2_component.tcp_lock); + mca_btl_tcp_proc_t* proc = NULL; + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_component.tcp_lock); + opal_hash_table_get_value_uint64(&mca_btl_tcp_component.tcp_procs, + ompi_rte_hash_name(name), (void**)&proc); + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_component.tcp_lock); return proc; } @@ -725,7 +726,7 @@ mca_btl_tcp2_proc_t* mca_btl_tcp2_proc_lookup(const orte_process_name_t *name) bool mca_btl_tcp2_proc_accept(mca_btl_tcp2_proc_t* btl_proc, struct sockaddr* addr, int sd) { size_t i; - OPAL_THREAD_LOCK(&btl_proc->proc_lock); + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&btl_proc->proc_lock); for( i = 0; i < btl_proc->proc_endpoint_count; i++ ) { mca_btl_base_endpoint_t* btl_endpoint = btl_proc->proc_endpoints[i]; /* Check all conditions before going to try to accept the connection. */ @@ -754,12 +755,12 @@ bool mca_btl_tcp2_proc_accept(mca_btl_tcp2_proc_t* btl_proc, struct sockaddr* ad ; } - if(mca_btl_tcp2_endpoint_accept(btl_endpoint, addr, sd)) { - OPAL_THREAD_UNLOCK(&btl_proc->proc_lock); + if(mca_btl_tcp_endpoint_accept(btl_endpoint, addr, sd)) { + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&btl_proc->proc_lock); return true; } } - OPAL_THREAD_UNLOCK(&btl_proc->proc_lock); + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&btl_proc->proc_lock); return false; } diff --git a/opal/mca/btl/tcp/btl_tcp.h b/opal/mca/btl/tcp/btl_tcp.h index 6b6b0e62e38..aea2f1bb137 100644 --- a/opal/mca/btl/tcp/btl_tcp.h +++ b/opal/mca/btl/tcp/btl_tcp.h @@ -43,14 +43,88 @@ /* Open MPI includes */ #include "opal/mca/event/event.h" -#include "opal/mca/btl/btl.h" -#include "opal/mca/btl/base/base.h" -#include "opal/mca/mpool/mpool.h" +#include "ompi/class/ompi_free_list.h" +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/btl/base/base.h" +#include "ompi/mca/mpool/mpool.h" #include "opal/class/opal_hash_table.h" #define MCA_BTL_TCP_STATISTICS 0 BEGIN_C_DECLS +#if (HAVE_PTHREAD_H == 1) +#define MCA_BTL_TCP_USES_PROGRESS_THREAD 1 +#else +#define MCA_BTL_TCP_USES_PROGRESS_THREAD 0 +#endif /* (HAVE_PTHREAD_H == 1) */ + +extern opal_event_base_t* mca_btl_tcp_event_base; + +#define MCA_BTL_TCP_COMPLETE_FRAG_SEND(frag) \ + do { \ + int btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); \ + if( frag->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK ) { \ + frag->base.des_cbfunc(&frag->endpoint->endpoint_btl->super, frag->endpoint, \ + &frag->base, frag->rc); \ + } \ + if( btl_ownership ) { \ + MCA_BTL_TCP_FRAG_RETURN(frag); \ + } \ + } while (0) +#define MCA_BTL_TCP_RECV_TRIGGER_CB(frag) \ + do { \ + if( MCA_BTL_TCP_HDR_TYPE_SEND == frag->hdr.type ) { \ + mca_btl_active_message_callback_t* reg; \ + reg = mca_btl_base_active_message_trigger + frag->hdr.base.tag; \ + reg->cbfunc(&frag->endpoint->endpoint_btl->super, frag->hdr.base.tag, &frag->base, reg->cbdata); \ + } \ + } while (0) + +#if MCA_BTL_TCP_USES_PROGRESS_THREAD +extern opal_list_t mca_btl_tcp_ready_frag_pending_queue; +extern opal_mutex_t mca_btl_tcp_ready_frag_mutex; +extern int mca_btl_tcp_pipe_to_progress[2]; + +#define MCA_BTL_TCP_CRITICAL_SECTION_ENTER(name) \ + opal_mutex_atomic_lock((name)) +#define MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(name) \ + opal_mutex_atomic_unlock((name)) + +#define TODO_MCA_BTL_TCP_COMPLETE_FRAG_SEND(frag) \ + do { \ + (frag)->next_step = MCA_BTL_TCP_FRAG_STEP_SEND_COMPLETE; \ + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_ready_frag_mutex); \ + opal_list_append(&mca_btl_tcp_ready_frag_pending_queue, \ + (opal_list_item_t*)frag); \ + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_ready_frag_mutex); \ + } while (0) + +#define TODO_MCA_BTL_TCP_RECV_TRIGGER_CB(frag) \ + do { \ + (frag)->next_step = MCA_BTL_TCP_FRAG_STEP_RECV_COMPLETE; \ + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_ready_frag_mutex); \ + opal_list_append(&mca_btl_tcp_ready_frag_pending_queue, \ + (opal_list_item_t*)frag); \ + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_ready_frag_mutex); \ + } while (0) +#define MCA_BTL_TCP_ACTIVATE_EVENT(event, value) \ + do { \ + opal_event_t* _event = (opal_event_t*)(event); \ + opal_fd_write( mca_btl_tcp_pipe_to_progress[1], sizeof(opal_event_t*), \ + &_event); \ + } while (0) +#else +#define MCA_BTL_TCP_CRITICAL_SECTION_ENTER(name) +#define MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(name) +#define TODO_MCA_BTL_TCP_COMPLETE_FRAG_SEND(frag) \ + MCA_BTL_TCP_COMPLETE_FRAG_SEND(frag) +#define TODO_MCA_BTL_TCP_RECV_TRIGGER_CB(frag) \ + MCA_BTL_TCP_RECV_TRIGGER_CB(frag) +#define MCA_BTL_TCP_ACTIVATE_EVENT(event, value) \ + do { \ + opal_event_add(event, (value)); \ + } while (0) +#endif /* MCA_BTL_TCP_USES_PROGRESS_THREAD */ /** * TCP BTL component. @@ -95,6 +169,12 @@ struct mca_btl_tcp_component_t { opal_free_list_t tcp_frag_max; opal_free_list_t tcp_frag_user; +#if MCA_BTL_TCP_USES_PROGRESS_THREAD + opal_event_t tcp_recv_thread_async_event; + opal_mutex_t tcp_frag_eager_mutex; + opal_mutex_t tcp_frag_max_mutex; + opal_mutex_t tcp_frag_user_mutex; +#endif /* Do we want to use TCP_NODELAY? */ int tcp_not_use_nodelay; @@ -292,6 +372,20 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src( uint32_t flags ); +extern mca_btl_base_descriptor_t* mca_btl_tcp_prepare_dst( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* peer, + struct mca_mpool_base_registration_t*, + struct opal_convertor_t* convertor, + uint8_t order, + size_t reserve, + size_t* size, + uint32_t flags); + +extern void +mca_btl_tcp_dump(struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + int verbose); /** * Fault Tolerance Event Notification Function diff --git a/opal/mca/btl/tcp/btl_tcp_component.c b/opal/mca/btl/tcp/btl_tcp_component.c index a43d6453d0f..71d0c3d84d4 100644 --- a/opal/mca/btl/tcp/btl_tcp_component.c +++ b/opal/mca/btl/tcp/btl_tcp_component.c @@ -61,6 +61,8 @@ #include "opal/util/argv.h" #include "opal/util/net.h" #include "opal/util/proc.h" +#include "opal/util/net.h" +#include "opal/util/fd.h" #include "opal/util/show_help.h" #include "opal/constants.h" #include "opal/mca/btl/btl.h" @@ -69,6 +71,11 @@ #include "opal/mca/btl/base/btl_base_error.h" #include "opal/mca/pmix/pmix.h" +#include "ompi/constants.h" +#include "ompi/mca/btl/btl.h" +#include "ompi/mca/btl/base/base.h" +#include "ompi/runtime/ompi_module_exchange.h" +#include "ompi/mca/btl/base/btl_base_error.h" #include "btl_tcp.h" #include "btl_tcp_addr.h" #include "btl_tcp_proc.h" @@ -85,6 +92,16 @@ static int mca_btl_tcp_component_register(void); static int mca_btl_tcp_component_open(void); static int mca_btl_tcp_component_close(void); +opal_event_base_t* mca_btl_tcp_event_base = NULL; +#if MCA_BTL_TCP_USES_PROGRESS_THREAD +static int mca_btl_tcp_progress_thread_trigger = -1; +int mca_btl_tcp_pipe_to_progress[2] = { -1, -1 }; +static opal_thread_t mca_btl_tcp_progress_thread; +opal_list_t mca_btl_tcp_ready_frag_pending_queue; +opal_mutex_t mca_btl_tcp_ready_frag_mutex; +#endif /* MCA_BTL_TCP_USES_PROGRESS_THREAD */ +static char *mca_btl_tcp_if_seq_string; + mca_btl_tcp_component_t mca_btl_tcp_component = { .super = { /* First, the mca_base_component_t struct containing meta information @@ -102,6 +119,11 @@ mca_btl_tcp_component_t mca_btl_tcp_component = { }, .btl_init = mca_btl_tcp_component_init, +#if MCA_BTL_TCP_USES_PROGRESS_THREAD + mca_btl_tcp_component_progress, +#else + NULL, +#endif /* MCA_BTL_TCP_USES_PROGRESS_THREAD */ } }; @@ -165,8 +187,25 @@ struct mca_btl_tcp_event_t { }; typedef struct mca_btl_tcp_event_t mca_btl_tcp_event_t; -OBJ_CLASS_INSTANCE( mca_btl_tcp_event_t, opal_list_item_t, - NULL, NULL); +static void mca_btl_tcp_event_construct(mca_btl_tcp_event_t* event) +{ + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_component.tcp_lock); + opal_list_append(&mca_btl_tcp_component.tcp_events, &event->item); + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_component.tcp_lock); +} + +static void mca_btl_tcp_event_destruct(mca_btl_tcp_event_t* event) +{ + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_component.tcp_lock); + opal_list_remove_item(&mca_btl_tcp_component.tcp_events, &event->item); + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_component.tcp_lock); +} + +OBJ_CLASS_INSTANCE( + mca_btl_tcp_event_t, + opal_list_item_t, + mca_btl_tcp_event_construct, + mca_btl_tcp_event_destruct); /* @@ -306,6 +345,14 @@ static int mca_btl_tcp_component_open(void) OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_user, opal_free_list_t); opal_proc_table_init(&mca_btl_tcp_component.tcp_procs, 16, 256); +#if MCA_BTL_TCP_USES_PROGRESS_THREAD + OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_eager_mutex, opal_mutex_t); + OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_max_mutex, opal_mutex_t); + OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_user_mutex, opal_mutex_t); + OBJ_CONSTRUCT(&mca_btl_tcp_ready_frag_mutex, opal_mutex_t); + OBJ_CONSTRUCT(&mca_btl_tcp_ready_frag_pending_queue, opal_list_t); +#endif /* MCA_BTL_TCP_USES_PROGRESS_THREAD */ + /* if_include and if_exclude need to be mutually exclusive */ if (OPAL_SUCCESS != mca_base_var_check_exclusive("ompi", @@ -330,9 +377,16 @@ static int mca_btl_tcp_component_open(void) static int mca_btl_tcp_component_close(void) { - if (NULL != mca_btl_tcp_component.tcp_btls) - free(mca_btl_tcp_component.tcp_btls); + opal_list_item_t* item; + opal_list_item_t* next; + if (NULL != mca_btl_tcp_component.tcp_if_seq) { + free(mca_btl_tcp_component.tcp_if_seq); + } + if (NULL != mca_btl_tcp_component.tcp_btls) { + free(mca_btl_tcp_component.tcp_btls); + } + if (mca_btl_tcp_component.tcp_listen_sd >= 0) { opal_event_del(&mca_btl_tcp_component.tcp_recv_event); CLOSE_THE_SOCKET(mca_btl_tcp_component.tcp_listen_sd); @@ -346,6 +400,18 @@ static int mca_btl_tcp_component_close(void) } #endif + /* cleanup any pending events */ + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_component.tcp_lock); + for(item = opal_list_get_first(&mca_btl_tcp_component.tcp_events); + item != opal_list_get_end(&mca_btl_tcp_component.tcp_events); + item = next) { + mca_btl_tcp_event_t* event = (mca_btl_tcp_event_t*)item; + next = opal_list_get_next(item); + opal_event_del(&event->event); + OBJ_RELEASE(event); + } + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_component.tcp_lock); + /* release resources */ OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_procs); OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_eager); @@ -357,6 +423,40 @@ static int mca_btl_tcp_component_close(void) mca_common_cuda_fini(); #endif /* OPAL_CUDA_SUPPORT */ +#if MCA_BTL_TCP_USES_PROGRESS_THREAD + OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_eager_mutex); + OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_max_mutex); + + if( (NULL != mca_btl_tcp_event_base) && + (mca_btl_tcp_event_base != opal_sync_event_base) ) { + /* Turn of the progress thread before moving forward */ + if( -1 != mca_btl_tcp_progress_thread_trigger ) { + mca_btl_tcp_progress_thread_trigger = 0; + /* Let the progress thread know that we're going away */ + if( -1 != mca_btl_tcp_pipe_to_progress[1] ) { + close(mca_btl_tcp_pipe_to_progress[1]); + mca_btl_tcp_pipe_to_progress[1] = -1; + } + while( -1 != mca_btl_tcp_progress_thread_trigger ) { + /*event_base_loopbreak(mca_btl_tcp_event_base);*/ + sched_yield(); + usleep(100); /* give app a chance to re-enter library */ + } + } + opal_event_del(&mca_btl_tcp_component.tcp_recv_thread_async_event); + opal_event_base_free(mca_btl_tcp_event_base); + mca_btl_tcp_event_base = NULL; + + /* Close the remaining pipes */ + if( -1 != mca_btl_tcp_pipe_to_progress[0] ) { + close(mca_btl_tcp_pipe_to_progress[0]); + mca_btl_tcp_pipe_to_progress[0] = -1; + } + } + OBJ_DESTRUCT(&mca_btl_tcp_ready_frag_mutex); + OBJ_DESTRUCT(&mca_btl_tcp_ready_frag_pending_queue); +#endif + return OPAL_SUCCESS; } @@ -652,14 +752,70 @@ static int mca_btl_tcp_component_create_instances(void) return ret; } +#if MCA_BTL_TCP_USES_PROGRESS_THREAD +static void* mca_btl_tcp_progress_thread_engine(opal_object_t *obj) +{ + opal_thread_t* current_thread = (opal_thread_t*)obj; + + while( 1 == (*((int*)current_thread->t_arg)) ) { + opal_event_loop(mca_btl_tcp_event_base, OPAL_EVLOOP_ONCE); + } + (*((int*)current_thread->t_arg)) = -1; + return NULL; +} + +static void mca_btl_tcp_component_event_async_handler(int fd, short unused, void *context) +{ + opal_event_t* event; + int rc; + + rc = read(fd, (void*)&event, sizeof(opal_event_t*)); + assert( fd == mca_btl_tcp_pipe_to_progress[0] ); + if( 0 == rc ) { + /* The main thread closed the pipe to trigger the shutdown procedure */ + opal_thread_t* current_thread = (opal_thread_t*)context; + (*((int*)current_thread->t_arg)) = 0; + } else { + opal_event_add(event, 0); + } +} + +int mca_btl_tcp_component_progress( void ) +{ + mca_btl_tcp_frag_t* frag; + int count = 0; + + for( ;; ) { + MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_ready_frag_mutex); + frag = (mca_btl_tcp_frag_t*)opal_list_remove_first(&mca_btl_tcp_ready_frag_pending_queue); + MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_ready_frag_mutex); + if( NULL == frag ) break; + + switch(frag->next_step) { + case MCA_BTL_TCP_FRAG_STEP_SEND_COMPLETE: + MCA_BTL_TCP_COMPLETE_FRAG_SEND(frag); + count++; + break; + case MCA_BTL_TCP_FRAG_STEP_RECV_COMPLETE: + MCA_BTL_TCP_RECV_TRIGGER_CB(frag); + MCA_BTL_TCP_FRAG_RETURN(frag); + count++; + break; + default: + break; + } + } + return count; +} +#endif + /* * Create a listen socket and bind to all interfaces */ static int mca_btl_tcp_component_create_listen(uint16_t af_family) { - int flags; - int sd; + int flags, sd, rc; struct sockaddr_storage inaddr; opal_socklen_t addrlen; @@ -678,17 +834,16 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family) #if OPAL_ENABLE_IPV6 { struct addrinfo hints, *res = NULL; - int error; memset (&hints, 0, sizeof(hints)); hints.ai_family = af_family; hints.ai_socktype = SOCK_STREAM; hints.ai_flags = AI_PASSIVE; - if ((error = getaddrinfo(NULL, "0", &hints, &res))) { + if ((rc = getaddrinfo(NULL, "0", &hints, &res))) { opal_output (0, - "mca_btl_tcp_create_listen: unable to resolve. %s\n", - gai_strerror (error)); + "mca_btl_tcp_create_listen: unable to resolve. %s\n", + gai_strerror (rc)); CLOSE_THE_SOCKET(sd); return OPAL_ERROR; } @@ -816,25 +971,86 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family) } } - /* register listen port */ +#if MCA_BTL_TCP_USES_PROGRESS_THREAD + /* Declare our intent to use threads. */ + opal_event_use_threads(); + if( NULL == mca_btl_tcp_event_base ) { + /* fall back to only one event base (the one shared by the entire Open MPI framework) */ + mca_btl_tcp_event_base = opal_sync_event_base; + + if( NULL == (mca_btl_tcp_event_base = opal_sync_event_base_create()) ) { + BTL_ERROR(("BTL TCP failed to create progress event base")); + goto move_forward_with_no_thread; + } + opal_event_base_priority_init(mca_btl_tcp_event_base, OPAL_EVENT_NUM_PRI); + + /* construct the thread object */ + OBJ_CONSTRUCT(&mca_btl_tcp_progress_thread, opal_thread_t); + + /** + * Create a pipe to communicate between the main thread and the progress thread. + */ + if (0 != pipe(mca_btl_tcp_pipe_to_progress)) { + opal_event_base_free(mca_btl_tcp_event_base); + /* fall back to only one event base (the one shared by the entire Open MPI framework */ + mca_btl_tcp_event_base = opal_sync_event_base; + mca_btl_tcp_progress_thread_trigger = -1; /* thread not started */ + goto move_forward_with_no_thread; + } + /* setup the receiving end of the pipe as non-blocking */ + if((flags = fcntl(mca_btl_tcp_pipe_to_progress[0], F_GETFL, 0)) < 0) { + BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)", + strerror(opal_socket_errno), opal_socket_errno)); + } else { + flags |= O_NONBLOCK; + if(fcntl(mca_btl_tcp_pipe_to_progress[0], F_SETFL, flags) < 0) + BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)", + strerror(opal_socket_errno), opal_socket_errno)); + } + /* Progress thread event */ + opal_event_set(mca_btl_tcp_event_base, &mca_btl_tcp_component.tcp_recv_thread_async_event, + mca_btl_tcp_pipe_to_progress[0], + OPAL_EV_READ|OPAL_EV_PERSIST, + mca_btl_tcp_component_event_async_handler, + &mca_btl_tcp_progress_thread ); + opal_event_add(&mca_btl_tcp_component.tcp_recv_thread_async_event, 0); + + /* fork off a thread to progress it */ + mca_btl_tcp_progress_thread.t_run = mca_btl_tcp_progress_thread_engine; + mca_btl_tcp_progress_thread.t_arg = &mca_btl_tcp_progress_thread_trigger; + mca_btl_tcp_progress_thread_trigger = 1; /* thread up and running */ + if( OPAL_SUCCESS != (rc = opal_thread_start(&mca_btl_tcp_progress_thread)) ) { + BTL_ERROR(("BTL TCP progress thread initialization failed (%d)", rc)); + opal_event_base_free(mca_btl_tcp_event_base); + /* fall back to only one event base (the one shared by the entire Open MPI framework */ + mca_btl_tcp_event_base = opal_sync_event_base; + mca_btl_tcp_progress_thread_trigger = -1; /* thread not started */ + goto move_forward_with_no_thread; + } + } + move_forward_with_no_thread: +#else + mca_btl_tcp_event_base = opal_sync_event_base; +#endif + + if (AF_INET == af_family) { + opal_event_set(mca_btl_tcp_event_base, &mca_btl_tcp_component.tcp_recv_event, + mca_btl_tcp_component.tcp_listen_sd, + OPAL_EV_READ|OPAL_EV_PERSIST, + mca_btl_tcp_component_accept_handler, + 0 ); + MCA_BTL_TCP_ACTIVATE_EVENT(&mca_btl_tcp_component.tcp_recv_event, 0); + } #if OPAL_ENABLE_IPV6 if (AF_INET6 == af_family) { - opal_event_set(opal_sync_event_base, &mca_btl_tcp_component.tcp6_recv_event, - mca_btl_tcp_component.tcp6_listen_sd, - OPAL_EV_READ|OPAL_EV_PERSIST, - mca_btl_tcp_component_accept_handler, - 0 ); - opal_event_add(&mca_btl_tcp_component.tcp6_recv_event, 0); - } else -#endif - { - opal_event_set(opal_sync_event_base, &mca_btl_tcp_component.tcp_recv_event, - mca_btl_tcp_component.tcp_listen_sd, - OPAL_EV_READ|OPAL_EV_PERSIST, - mca_btl_tcp_component_accept_handler, - 0 ); - opal_event_add(&mca_btl_tcp_component.tcp_recv_event, 0); + opal_event_set(mca_btl_tcp_event_base, &mca_btl_tcp_component.tcp6_recv_event, + mca_btl_tcp_component.tcp6_listen_sd, + OPAL_EV_READ|OPAL_EV_PERSIST, + mca_btl_tcp_component_accept_handler, + 0 ); + MCA_BTL_TCP_ACTIVATE_EVENT(&mca_btl_tcp_component.tcp6_recv_event, 0); } +#endif return OPAL_SUCCESS; } @@ -1048,9 +1264,11 @@ static void mca_btl_tcp_component_accept_handler( int incoming_sd, } mca_btl_tcp_set_socket_options(sd); + assert( NULL != mca_btl_tcp_event_base ); /* wait for receipt of peers process identifier to complete this connection */ event = OBJ_NEW(mca_btl_tcp_event_t); - opal_event_set(opal_sync_event_base, &event->event, sd, OPAL_EV_READ, mca_btl_tcp_component_recv_handler, event); + opal_event_set(mca_btl_tcp_event_base, &(event->event), sd, + OPAL_EV_READ, mca_btl_tcp_component_recv_handler, event); opal_event_add(&event->event, 0); } } @@ -1112,3 +1330,14 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user) (void)mca_btl_tcp_proc_accept(btl_proc, (struct sockaddr*)&addr, sd); } +/** + * Debugging infrastructure, absolutely not thread safe. Call with care. + */ +static void mca_btl_tcp_component_dump(void) +{ + uint32_t i; + + for( i = 0; i < mca_btl_tcp_component.tcp_num_btls; i++ ) { + mca_btl_tcp_dump( (mca_btl_base_module_t*)mca_btl_tcp_component.tcp_btls[i], NULL, 1 ); + } +} diff --git a/opal/mca/btl/tcp/btl_tcp_endpoint.h b/opal/mca/btl/tcp/btl_tcp_endpoint.h index f50c6a43af9..0798f3b0814 100644 --- a/opal/mca/btl/tcp/btl_tcp_endpoint.h +++ b/opal/mca/btl/tcp/btl_tcp_endpoint.h @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2007 The University of Tennessee and The University + * Copyright (c) 2004-2012 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -61,7 +61,7 @@ struct mca_btl_base_endpoint_t { struct mca_btl_tcp_frag_t* endpoint_send_frag; /**< current send frag being processed */ struct mca_btl_tcp_frag_t* endpoint_recv_frag; /**< current recv frag being processed */ mca_btl_tcp_state_t endpoint_state; /**< current state of the connection */ - size_t endpoint_retries; /**< number of connection retries attempted */ + uint32_t endpoint_retries; /**< number of connection retries attempted */ opal_list_t endpoint_frags; /**< list of pending frags to send */ opal_mutex_t endpoint_send_lock; /**< lock for concurrent access to endpoint state */ opal_mutex_t endpoint_recv_lock; /**< lock for concurrent access to endpoint state */ @@ -80,6 +80,7 @@ void mca_btl_tcp_endpoint_close(mca_btl_base_endpoint_t*); int mca_btl_tcp_endpoint_send(mca_btl_base_endpoint_t*, struct mca_btl_tcp_frag_t*); void mca_btl_tcp_endpoint_accept(mca_btl_base_endpoint_t*, struct sockaddr*, int); void mca_btl_tcp_endpoint_shutdown(mca_btl_base_endpoint_t*); +void mca_btl_tcp_endpoint_dump(mca_btl_base_endpoint_t* btl_endpoint, const char* msg); END_C_DECLS #endif diff --git a/opal/mca/btl/tcp/btl_tcp_frag.c b/opal/mca/btl/tcp/btl_tcp_frag.c index d9bf9b76e67..e27e4cc03da 100644 --- a/opal/mca/btl/tcp/btl_tcp_frag.c +++ b/opal/mca/btl/tcp/btl_tcp_frag.c @@ -10,7 +10,6 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2008-2012 Oracle and/or all its affiliates. All rights reserved. * Copyright (c) 2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2015 Research Organization for Information Science @@ -48,9 +47,9 @@ #include "btl_tcp_frag.h" #include "btl_tcp_endpoint.h" -static void mca_btl_tcp_frag_eager_constructor(mca_btl_tcp_frag_t* frag) -{ - frag->size = mca_btl_tcp_module.super.btl_eager_limit; +static void mca_btl_tcp_frag_eager_constructor(mca_btl_tcp_frag_t* frag) +{ + frag->size = mca_btl_tcp_module.super.btl_eager_limit; frag->my_list = &mca_btl_tcp_component.tcp_frag_eager; } @@ -151,6 +150,11 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd) frag->iov_ptr->iov_base = (opal_iov_base_ptr_t) (((unsigned char*)frag->iov_ptr->iov_base) + cnt); frag->iov_ptr->iov_len -= cnt; +#define GB_DEFINED 0 +#if GB_DEFINED + opal_output(0, "%s:%d write %lu bytes from socket %d\n", + __FILE__, __LINE__, cnt, sd); /* GB */ +#endif /* GB_DEFINED */ break; } } @@ -159,12 +163,14 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd) bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd) { - int cnt, dont_copy_data = 0; - size_t i, num_vecs; mca_btl_base_endpoint_t* btl_endpoint = frag->endpoint; + int i, num_vecs, dont_copy_data = 0; + ssize_t cnt; + struct iovec *iov_ptr; repeat: num_vecs = frag->iov_cnt; + iov_ptr = &frag->iov[frag->iov_idx]; #if MCA_BTL_TCP_ENDPOINT_CACHE if( 0 != btl_endpoint->endpoint_cache_length ) { size_t length; @@ -265,11 +271,13 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd) frag->iov[1].iov_len = frag->hdr.size; frag->iov_cnt++; #ifndef __sparc - /* The following cannot be done for sparc code +#if !MCA_BTL_TCP_USES_PROGRESS_THREAD + /* The following cannot be done for sparc code * because it causes alignment errors when accessing * structures later on in the btl and pml code. */ dont_copy_data = 1; +#endif #endif goto repeat; } @@ -297,3 +305,20 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd) } return false; } + +void mca_btl_tcp_dump_frag( mca_btl_tcp_frag_t* frag, char* msg ) +{ + int i; + + mca_btl_base_err("%s %s frag %p (endpoint %p) size %lu iov_cnt %d iov_idx %d next_step %s rc %d\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg, (void*)frag, (void*)frag->endpoint, frag->size, frag->iov_cnt, frag->iov_idx, + (MCA_BTL_TCP_FRAG_STEP_UNDEFINED == frag->next_step ? "undefined" : + (MCA_BTL_TCP_FRAG_STEP_SEND_COMPLETE == frag->next_step ? "send_complete" : + (MCA_BTL_TCP_FRAG_STEP_RECV_COMPLETE == frag->next_step ? "recv_complete" : "unknown"))), + frag->rc); + for( i = 0; i < frag->iov_cnt; i++ ) { + mca_btl_base_err("%s %s | iov[%d] = {%p, %lu}\n", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg, i, frag->iov[i].iov_base, frag->iov[i].iov_len); + } + mca_btl_base_err("%s +\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); +} diff --git a/opal/mca/btl/tcp/btl_tcp_hdr.h b/opal/mca/btl/tcp/btl_tcp_hdr.h index b7977762012..70467e43713 100644 --- a/opal/mca/btl/tcp/btl_tcp_hdr.h +++ b/opal/mca/btl/tcp/btl_tcp_hdr.h @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University + * Copyright (c) 2004-2012 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/opal/mca/btl/tcp/btl_tcp_proc.h b/opal/mca/btl/tcp/btl_tcp_proc.h index 5e56cd7deab..064e81fa2da 100644 --- a/opal/mca/btl/tcp/btl_tcp_proc.h +++ b/opal/mca/btl/tcp/btl_tcp_proc.h @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University + * Copyright (c) 2004-2012 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, From 92062492b9ad4ce6b362e0243c3d481d0492c632 Mon Sep 17 00:00:00 2001 From: Thananon Patinyasakdikul Date: Fri, 22 Jan 2016 15:41:44 -0500 Subject: [PATCH 2/3] Enable Threading in the BTL TCP Added mca parameter to turn progress thread on/off Add a flag to check if we have btl progress thread. Added macro for ob1 matching lock. Update the AUTHORS file. --- AUTHORS | 1 + ompi/mca/pml/ob1/pml_ob1.h | 20 ++- ompi/mca/pml/ob1/pml_ob1_component.c | 6 + ompi/mca/pml/ob1/pml_ob1_recvfrag.c | 16 +-- ompi/mca/pml/ob1/pml_ob1_recvreq.c | 14 +-- opal/mca/btl/btl.h | 3 + opal/mca/btl/tcp/btl_tcp.h | 34 +++-- opal/mca/btl/tcp/btl_tcp_component.c | 172 ++++++++++++++------------ opal/mca/btl/tcp/btl_tcp_endpoint.c | 14 ++- opal/mca/btl/tcp/btl_tcp_frag.c | 28 ++--- opal/mca/btl/tcp/btl_tcp_frag.h | 6 +- opal/mca/btl/tcp/help-mpi-btl-tcp.txt | 10 ++ 12 files changed, 193 insertions(+), 131 deletions(-) diff --git a/AUTHORS b/AUTHORS index 309526e7d93..9acaad14973 100644 --- a/AUTHORS +++ b/AUTHORS @@ -108,6 +108,7 @@ sylvain.jeaugey@bull.net Sylvain Jeaugey Bull terry.dontje@oracle.com Terry Dontje Sun, Oracle thkorde@sandia.gov Todd Kordenbrock SNL tmattox@gmail.com Tim Mattox IU, Cisco +tpatinya@vols.utk.edu Thananon Patinyasakdikul UTK tprins@lanl.gov Tim Prins IU, LANL twoodall@lanl.gov Tim Woodall LANL vasily@mellanox.com Vasily Filipov Mellanox diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h index 2781d41c0be..aaa581e3282 100644 --- a/ompi/mca/pml/ob1/pml_ob1.h +++ b/ompi/mca/pml/ob1/pml_ob1.h @@ -89,7 +89,7 @@ typedef struct mca_pml_ob1_t mca_pml_ob1_t; extern mca_pml_ob1_t mca_pml_ob1; extern int mca_pml_ob1_output; - +extern bool mca_pml_ob1_matching_protection; /* * PML interface functions. */ @@ -261,7 +261,25 @@ do { \ OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock); \ } while(0) +#define OB1_MATCHING_LOCK(lock) \ + do { \ + if( mca_pml_ob1_matching_protection ) { \ + opal_mutex_lock(lock); \ + } \ + else { OPAL_THREAD_LOCK(lock); } \ + } while(0) + + +#define OB1_MATCHING_UNLOCK(lock) \ + do { \ + if( mca_pml_ob1_matching_protection ) { \ + opal_mutex_unlock(lock); \ + } \ + else { OPAL_THREAD_UNLOCK(lock); } \ + } while(0) + + int mca_pml_ob1_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, opal_ptr_t hdr_frag, uint64_t size, uint8_t order, int status); diff --git a/ompi/mca/pml/ob1/pml_ob1_component.c b/ompi/mca/pml/ob1/pml_ob1_component.c index 172a4dfe7e2..65152ea5139 100644 --- a/ompi/mca/pml/ob1/pml_ob1_component.c +++ b/ompi/mca/pml/ob1/pml_ob1_component.c @@ -54,6 +54,7 @@ mca_pml_ob1_component_init( int* priority, bool enable_progress_threads, static int mca_pml_ob1_component_fini(void); int mca_pml_ob1_output = 0; static int mca_pml_ob1_verbose = 0; +bool mca_pml_ob1_matching_protection = false; mca_pml_base_component_2_0_0_t mca_pml_ob1_component = { /* First, the mca_base_component_t struct containing meta @@ -277,10 +278,15 @@ mca_pml_ob1_component_init( int* priority, OPAL_LIST_FOREACH(selected_btl, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { mca_btl_base_module_t *btl = selected_btl->btl_module; + if (btl->btl_flags & MCA_BTL_FLAGS_BTL_PROGRESS_THREAD_ENABLED) { + mca_pml_ob1_matching_protection = true; + } + if (btl->btl_flags & MCA_BTL_FLAGS_SINGLE_ADD_PROCS) { mca_pml_ob1.super.pml_flags |= MCA_PML_BASE_FLAG_REQUIRE_WORLD; break; } + } /* Set this here (vs in component_open()) because diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index 797d276b61c..45758eb0d27 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -161,7 +161,7 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, * end points) from being processed, and potentially "loosing" * the fragment. */ - OPAL_THREAD_LOCK(&comm->matching_lock); + OB1_MATCHING_LOCK(&comm->matching_lock); /* get sequence number of next message that can be processed */ if(OPAL_UNLIKELY((((uint16_t) hdr->hdr_seq) != ((uint16_t) proc->expected_sequence)) || @@ -194,7 +194,7 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); /* release matching lock before processing fragment */ - OPAL_THREAD_UNLOCK(&comm->matching_lock); + OB1_MATCHING_UNLOCK(&comm->matching_lock); if(OPAL_LIKELY(match)) { bytes_received = segments->seg_len - OMPI_PML_OB1_MATCH_HDR_LEN; @@ -247,7 +247,7 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, return; slow_path: - OPAL_THREAD_UNLOCK(&comm->matching_lock); + OB1_MATCHING_UNLOCK(&comm->matching_lock); mca_pml_ob1_recv_frag_match(btl, hdr, segments, num_segments, MCA_PML_OB1_HDR_TYPE_MATCH); } @@ -668,7 +668,7 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl, * end points) from being processed, and potentially "loosing" * the fragment. */ - OPAL_THREAD_LOCK(&comm->matching_lock); + OB1_MATCHING_LOCK(&comm->matching_lock); /* get sequence number of next message that can be processed */ next_msg_seq_expected = (uint16_t)proc->expected_sequence; @@ -704,7 +704,7 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl, hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); /* release matching lock before processing fragment */ - OPAL_THREAD_UNLOCK(&comm->matching_lock); + OB1_MATCHING_UNLOCK(&comm->matching_lock); if(OPAL_LIKELY(match)) { switch(type) { @@ -729,7 +729,7 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl, * may now be used to form new matchs */ if(OPAL_UNLIKELY(opal_list_get_size(&proc->frags_cant_match) > 0)) { - OPAL_THREAD_LOCK(&comm->matching_lock); + OB1_MATCHING_LOCK(&comm->matching_lock); if((frag = check_cantmatch_for_match(proc))) { hdr = &frag->hdr.hdr_match; segments = frag->segments; @@ -738,7 +738,7 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl, type = hdr->hdr_common.hdr_type; goto out_of_order_match; } - OPAL_THREAD_UNLOCK(&comm->matching_lock); + OB1_MATCHING_UNLOCK(&comm->matching_lock); } return OMPI_SUCCESS; @@ -749,7 +749,7 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl, */ append_frag_to_list(&proc->frags_cant_match, btl, hdr, segments, num_segments, NULL); - OPAL_THREAD_UNLOCK(&comm->matching_lock); + OB1_MATCHING_UNLOCK(&comm->matching_lock); return OMPI_SUCCESS; } diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index f18b4e72cb3..cc665e9c607 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -104,7 +104,7 @@ static int mca_pml_ob1_recv_request_cancel(struct ompi_request_t* ompi_request, mca_pml_ob1_comm_t *ob1_comm = comm->c_pml_comm; /* The rest should be protected behind the match logic lock */ - OPAL_THREAD_LOCK(&ob1_comm->matching_lock); + OB1_MATCHING_LOCK(&ob1_comm->matching_lock); if( true == request->req_match_received ) { /* way to late to cancel this one */ OPAL_THREAD_UNLOCK(&ob1_comm->matching_lock); assert( OMPI_ANY_TAG != ompi_request->req_status.MPI_TAG ); /* not matched isn't it */ @@ -124,7 +124,7 @@ static int mca_pml_ob1_recv_request_cancel(struct ompi_request_t* ompi_request, * to true. Otherwise, the request will never be freed. */ request->req_recv.req_base.req_pml_complete = true; - OPAL_THREAD_UNLOCK(&ob1_comm->matching_lock); + OB1_MATCHING_UNLOCK(&ob1_comm->matching_lock); OPAL_THREAD_LOCK(&ompi_request_lock); ompi_request->req_status._cancelled = true; @@ -1177,7 +1177,7 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) MCA_PML_BASE_RECV_START(&req->req_recv.req_base); - OPAL_THREAD_LOCK(&ob1_comm->matching_lock); + OB1_MATCHING_LOCK(&ob1_comm->matching_lock); /** * The laps of time between the ACTIVATE event and the SEARCH_UNEX one include * the cost of the request lock. @@ -1219,7 +1219,7 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) it when the message comes in. */ append_recv_req_to_queue(queue, req); req->req_match_received = false; - OPAL_THREAD_UNLOCK(&ob1_comm->matching_lock); + OB1_MATCHING_UNLOCK(&ob1_comm->matching_lock); } else { if(OPAL_LIKELY(!IS_PROB_REQ(req))) { PERUSE_TRACE_COMM_EVENT(PERUSE_COMM_REQ_MATCH_UNEX, @@ -1237,7 +1237,7 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) opal_list_remove_item(&proc->unexpected_frags, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&ob1_comm->matching_lock); + OB1_MATCHING_UNLOCK(&ob1_comm->matching_lock); switch(hdr->hdr_common.hdr_type) { case MCA_PML_OB1_HDR_TYPE_MATCH: @@ -1267,14 +1267,14 @@ void mca_pml_ob1_recv_req_start(mca_pml_ob1_recv_request_t *req) restarted with this request during mrecv */ opal_list_remove_item(&proc->unexpected_frags, (opal_list_item_t*)frag); - OPAL_THREAD_UNLOCK(&ob1_comm->matching_lock); + OB1_MATCHING_UNLOCK(&ob1_comm->matching_lock); req->req_recv.req_base.req_addr = frag; mca_pml_ob1_recv_request_matched_probe(req, frag->btl, frag->segments, frag->num_segments); } else { - OPAL_THREAD_UNLOCK(&ob1_comm->matching_lock); + OB1_MATCHING_UNLOCK(&ob1_comm->matching_lock); mca_pml_ob1_recv_request_matched_probe(req, frag->btl, frag->segments, frag->num_segments); } diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 343a024f460..11ef04cc018 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -241,6 +241,9 @@ typedef uint8_t mca_btl_base_tag_t; * BTLs should not set this flag. */ #define MCA_BTL_FLAGS_SINGLE_ADD_PROCS 0x20000 +/* The BTL is using progress thread and need the protection on matching */ +#define MCA_BTL_FLAGS_BTL_PROGRESS_THREAD_ENABLED 0x40000 + /* Default exclusivity levels */ #define MCA_BTL_EXCLUSIVITY_HIGH (64*1024) /* internal loopback */ #define MCA_BTL_EXCLUSIVITY_DEFAULT 1024 /* GM/IB/etc. */ diff --git a/opal/mca/btl/tcp/btl_tcp.h b/opal/mca/btl/tcp/btl_tcp.h index aea2f1bb137..ed3214c6f79 100644 --- a/opal/mca/btl/tcp/btl_tcp.h +++ b/opal/mca/btl/tcp/btl_tcp.h @@ -43,19 +43,20 @@ /* Open MPI includes */ #include "opal/mca/event/event.h" -#include "ompi/class/ompi_free_list.h" -#include "ompi/mca/btl/btl.h" -#include "ompi/mca/btl/base/base.h" -#include "ompi/mca/mpool/mpool.h" +#include "opal/class/opal_free_list.h" +#include "opal/mca/btl/btl.h" +#include "opal/mca/btl/base/base.h" +#include "opal/mca/mpool/mpool.h" #include "opal/class/opal_hash_table.h" +#include "opal/util/fd.h" #define MCA_BTL_TCP_STATISTICS 0 BEGIN_C_DECLS #if (HAVE_PTHREAD_H == 1) -#define MCA_BTL_TCP_USES_PROGRESS_THREAD 1 +#define MCA_BTL_TCP_SUPPORT_PROGRESS_THREAD 1 #else -#define MCA_BTL_TCP_USES_PROGRESS_THREAD 0 +#define MCA_BTL_TCP_SUPPORT_PROGRESS_THREAD 0 #endif /* (HAVE_PTHREAD_H == 1) */ extern opal_event_base_t* mca_btl_tcp_event_base; @@ -80,10 +81,11 @@ extern opal_event_base_t* mca_btl_tcp_event_base; } \ } while (0) -#if MCA_BTL_TCP_USES_PROGRESS_THREAD +#if MCA_BTL_TCP_SUPPORT_PROGRESS_THREAD extern opal_list_t mca_btl_tcp_ready_frag_pending_queue; extern opal_mutex_t mca_btl_tcp_ready_frag_mutex; extern int mca_btl_tcp_pipe_to_progress[2]; +extern int mca_btl_tcp_progress_thread_trigger; #define MCA_BTL_TCP_CRITICAL_SECTION_ENTER(name) \ opal_mutex_atomic_lock((name)) @@ -109,9 +111,14 @@ extern int mca_btl_tcp_pipe_to_progress[2]; } while (0) #define MCA_BTL_TCP_ACTIVATE_EVENT(event, value) \ do { \ - opal_event_t* _event = (opal_event_t*)(event); \ - opal_fd_write( mca_btl_tcp_pipe_to_progress[1], sizeof(opal_event_t*), \ - &_event); \ + if(0 < mca_btl_tcp_progress_thread_trigger) { \ + opal_event_t* _event = (opal_event_t*)(event); \ + opal_fd_write( mca_btl_tcp_pipe_to_progress[1], sizeof(opal_event_t*), \ + &_event); \ + } \ + else { \ + opal_event_add(event, (value)); \ + } \ } while (0) #else #define MCA_BTL_TCP_CRITICAL_SECTION_ENTER(name) @@ -124,7 +131,7 @@ extern int mca_btl_tcp_pipe_to_progress[2]; do { \ opal_event_add(event, (value)); \ } while (0) -#endif /* MCA_BTL_TCP_USES_PROGRESS_THREAD */ +#endif /* MCA_BTL_TCP_SUPPORT_PROGRESS_THREAD */ /** * TCP BTL component. @@ -143,6 +150,7 @@ struct mca_btl_tcp_component_t { int tcp_endpoint_cache; /**< amount of cache on each endpoint */ opal_proc_table_t tcp_procs; /**< hash table of tcp proc structures */ opal_mutex_t tcp_lock; /**< lock for accessing module state */ + opal_list_t tcp_events; opal_event_t tcp_recv_event; /**< recv event for IPv4 listen socket */ int tcp_listen_sd; /**< IPv4 listen socket for incoming connection requests */ @@ -169,7 +177,9 @@ struct mca_btl_tcp_component_t { opal_free_list_t tcp_frag_max; opal_free_list_t tcp_frag_user; -#if MCA_BTL_TCP_USES_PROGRESS_THREAD + int tcp_enable_progress_thread; /** Support for tcp progress thread flag */ + +#if MCA_BTL_TCP_SUPPORT_PROGRESS_THREAD opal_event_t tcp_recv_thread_async_event; opal_mutex_t tcp_frag_eager_mutex; opal_mutex_t tcp_frag_max_mutex; diff --git a/opal/mca/btl/tcp/btl_tcp_component.c b/opal/mca/btl/tcp/btl_tcp_component.c index 71d0c3d84d4..19ae4f8a148 100644 --- a/opal/mca/btl/tcp/btl_tcp_component.c +++ b/opal/mca/btl/tcp/btl_tcp_component.c @@ -72,10 +72,9 @@ #include "opal/mca/pmix/pmix.h" #include "ompi/constants.h" -#include "ompi/mca/btl/btl.h" -#include "ompi/mca/btl/base/base.h" -#include "ompi/runtime/ompi_module_exchange.h" -#include "ompi/mca/btl/base/btl_base_error.h" +#include "opal/mca/btl/btl.h" +#include "opal/mca/btl/base/base.h" +#include "opal/mca/btl/base/btl_base_error.h" #include "btl_tcp.h" #include "btl_tcp_addr.h" #include "btl_tcp_proc.h" @@ -93,13 +92,13 @@ static int mca_btl_tcp_component_open(void); static int mca_btl_tcp_component_close(void); opal_event_base_t* mca_btl_tcp_event_base = NULL; -#if MCA_BTL_TCP_USES_PROGRESS_THREAD -static int mca_btl_tcp_progress_thread_trigger = -1; +#if MCA_BTL_TCP_SUPPORT_PROGRESS_THREAD +int mca_btl_tcp_progress_thread_trigger = -1; int mca_btl_tcp_pipe_to_progress[2] = { -1, -1 }; static opal_thread_t mca_btl_tcp_progress_thread; opal_list_t mca_btl_tcp_ready_frag_pending_queue; opal_mutex_t mca_btl_tcp_ready_frag_mutex; -#endif /* MCA_BTL_TCP_USES_PROGRESS_THREAD */ +#endif /* MCA_BTL_TCP_SUPPORT_PROGRESS_THREAD */ static char *mca_btl_tcp_if_seq_string; mca_btl_tcp_component_t mca_btl_tcp_component = { @@ -119,11 +118,11 @@ mca_btl_tcp_component_t mca_btl_tcp_component = { }, .btl_init = mca_btl_tcp_component_init, -#if MCA_BTL_TCP_USES_PROGRESS_THREAD - mca_btl_tcp_component_progress, -#else +//#if MCA_BTL_TCP_USES_PROGRESS_THREAD +// mca_btl_tcp_component_progress, +//#else NULL, -#endif /* MCA_BTL_TCP_USES_PROGRESS_THREAD */ +//#endif /* MCA_BTL_TCP_USES_PROGRESS_THREAD */ } }; @@ -289,6 +288,20 @@ static int mca_btl_tcp_component_register(void) free(message); #endif + /* Check if we should support async progress */ + mca_btl_tcp_param_register_int ("progress_thread", NULL, 0, OPAL_INFO_LVL_1, + &mca_btl_tcp_component.tcp_enable_progress_thread); +#if !defined(MCA_BTL_TCP_SUPPORT_PROGRESS_THREAD) + if( mca_btl_tcp_component.tcp_enable_progress_thread ) { + opal_show_help("help-mpi-btl-tcp.txt", + "unsuported progress thread", + true, "progress thread", + ompi_process_info.nodename, + mca_btl_tcp_component.tcp_if_seq, + "Progress thread support compiled out"); + } +#endif /* !defined(MCA_BTL_TCP_SUPPORT_PROGRESS_THREAD) */ + mca_btl_tcp_component.report_all_unfound_interfaces = false; (void) mca_base_component_var_register(&mca_btl_tcp_component.super.btl_version, "warn_all_unfound_interfaces", @@ -340,18 +353,19 @@ static int mca_btl_tcp_component_open(void) /* initialize objects */ OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_lock, opal_mutex_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_procs, opal_proc_table_t); + OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_events, opal_list_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_eager, opal_free_list_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_max, opal_free_list_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_user, opal_free_list_t); opal_proc_table_init(&mca_btl_tcp_component.tcp_procs, 16, 256); -#if MCA_BTL_TCP_USES_PROGRESS_THREAD +#if MCA_BTL_TCP_SUPPORT_PROGRESS_THREAD OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_eager_mutex, opal_mutex_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_max_mutex, opal_mutex_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_user_mutex, opal_mutex_t); OBJ_CONSTRUCT(&mca_btl_tcp_ready_frag_mutex, opal_mutex_t); OBJ_CONSTRUCT(&mca_btl_tcp_ready_frag_pending_queue, opal_list_t); -#endif /* MCA_BTL_TCP_USES_PROGRESS_THREAD */ +#endif /* MCA_BTL_TCP_SUPPORT_PROGRESS_THREAD */ /* if_include and if_exclude need to be mutually exclusive */ if (OPAL_SUCCESS != @@ -380,9 +394,6 @@ static int mca_btl_tcp_component_close(void) opal_list_item_t* item; opal_list_item_t* next; - if (NULL != mca_btl_tcp_component.tcp_if_seq) { - free(mca_btl_tcp_component.tcp_if_seq); - } if (NULL != mca_btl_tcp_component.tcp_btls) { free(mca_btl_tcp_component.tcp_btls); } @@ -423,7 +434,7 @@ static int mca_btl_tcp_component_close(void) mca_common_cuda_fini(); #endif /* OPAL_CUDA_SUPPORT */ -#if MCA_BTL_TCP_USES_PROGRESS_THREAD +#if MCA_BTL_TCP_SUPPORT_PROGRESS_THREAD OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_eager_mutex); OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_max_mutex); @@ -752,7 +763,7 @@ static int mca_btl_tcp_component_create_instances(void) return ret; } -#if MCA_BTL_TCP_USES_PROGRESS_THREAD +#if MCA_BTL_TCP_SUPPORT_PROGRESS_THREAD static void* mca_btl_tcp_progress_thread_engine(opal_object_t *obj) { opal_thread_t* current_thread = (opal_thread_t*)obj; @@ -779,7 +790,7 @@ static void mca_btl_tcp_component_event_async_handler(int fd, short unused, void opal_event_add(event, 0); } } - +/* int mca_btl_tcp_component_progress( void ) { mca_btl_tcp_frag_t* frag; @@ -807,6 +818,7 @@ int mca_btl_tcp_component_progress( void ) } return count; } +*/ #endif /* @@ -971,67 +983,67 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family) } } -#if MCA_BTL_TCP_USES_PROGRESS_THREAD - /* Declare our intent to use threads. */ - opal_event_use_threads(); - if( NULL == mca_btl_tcp_event_base ) { - /* fall back to only one event base (the one shared by the entire Open MPI framework) */ - mca_btl_tcp_event_base = opal_sync_event_base; + if(mca_btl_tcp_component.tcp_enable_progress_thread){ + /* Declare our intent to use threads. */ + opal_event_use_threads(); + if( NULL == mca_btl_tcp_event_base ) { + /* fall back to only one event base (the one shared by the entire Open MPI framework) */ - if( NULL == (mca_btl_tcp_event_base = opal_sync_event_base_create()) ) { - BTL_ERROR(("BTL TCP failed to create progress event base")); - goto move_forward_with_no_thread; - } - opal_event_base_priority_init(mca_btl_tcp_event_base, OPAL_EVENT_NUM_PRI); - - /* construct the thread object */ - OBJ_CONSTRUCT(&mca_btl_tcp_progress_thread, opal_thread_t); - - /** - * Create a pipe to communicate between the main thread and the progress thread. - */ - if (0 != pipe(mca_btl_tcp_pipe_to_progress)) { - opal_event_base_free(mca_btl_tcp_event_base); - /* fall back to only one event base (the one shared by the entire Open MPI framework */ - mca_btl_tcp_event_base = opal_sync_event_base; - mca_btl_tcp_progress_thread_trigger = -1; /* thread not started */ - goto move_forward_with_no_thread; - } - /* setup the receiving end of the pipe as non-blocking */ - if((flags = fcntl(mca_btl_tcp_pipe_to_progress[0], F_GETFL, 0)) < 0) { - BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)", - strerror(opal_socket_errno), opal_socket_errno)); - } else { - flags |= O_NONBLOCK; - if(fcntl(mca_btl_tcp_pipe_to_progress[0], F_SETFL, flags) < 0) - BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)", + if( NULL == (mca_btl_tcp_event_base = opal_event_base_create()) ) { + BTL_ERROR(("BTL TCP failed to create progress event base")); + goto move_forward_with_no_thread; + } + opal_event_base_priority_init(mca_btl_tcp_event_base, OPAL_EVENT_NUM_PRI); + + /* construct the thread object */ + OBJ_CONSTRUCT(&mca_btl_tcp_progress_thread, opal_thread_t); + + /** + * Create a pipe to communicate between the main thread and the progress thread. + */ + if (0 != pipe(mca_btl_tcp_pipe_to_progress)) { + opal_event_base_free(mca_btl_tcp_event_base); + /* fall back to only one event base (the one shared by the entire Open MPI framework */ + mca_btl_tcp_event_base = opal_sync_event_base; + mca_btl_tcp_progress_thread_trigger = -1; /* thread not started */ + goto move_forward_with_no_thread; + } + /* setup the receiving end of the pipe as non-blocking */ + if((flags = fcntl(mca_btl_tcp_pipe_to_progress[0], F_GETFL, 0)) < 0) { + BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)", strerror(opal_socket_errno), opal_socket_errno)); + } else { + flags |= O_NONBLOCK; + if(fcntl(mca_btl_tcp_pipe_to_progress[0], F_SETFL, flags) < 0) + BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)", + strerror(opal_socket_errno), opal_socket_errno)); + } + /* Progress thread event */ + opal_event_set(mca_btl_tcp_event_base, &mca_btl_tcp_component.tcp_recv_thread_async_event, + mca_btl_tcp_pipe_to_progress[0], + OPAL_EV_READ|OPAL_EV_PERSIST, + mca_btl_tcp_component_event_async_handler, + &mca_btl_tcp_progress_thread ); + opal_event_add(&mca_btl_tcp_component.tcp_recv_thread_async_event, 0); + + /* fork off a thread to progress it */ + mca_btl_tcp_progress_thread.t_run = mca_btl_tcp_progress_thread_engine; + mca_btl_tcp_progress_thread.t_arg = &mca_btl_tcp_progress_thread_trigger; + mca_btl_tcp_progress_thread_trigger = 1; /* thread up and running */ + if( OPAL_SUCCESS != (rc = opal_thread_start(&mca_btl_tcp_progress_thread)) ) { + BTL_ERROR(("BTL TCP progress thread initialization failed (%d)", rc)); + opal_event_base_free(mca_btl_tcp_event_base); + /* fall back to only one event base (the one shared by the entire Open MPI framework */ + mca_btl_tcp_event_base = opal_sync_event_base; + mca_btl_tcp_progress_thread_trigger = -1; /* thread not started */ + goto move_forward_with_no_thread; + } } - /* Progress thread event */ - opal_event_set(mca_btl_tcp_event_base, &mca_btl_tcp_component.tcp_recv_thread_async_event, - mca_btl_tcp_pipe_to_progress[0], - OPAL_EV_READ|OPAL_EV_PERSIST, - mca_btl_tcp_component_event_async_handler, - &mca_btl_tcp_progress_thread ); - opal_event_add(&mca_btl_tcp_component.tcp_recv_thread_async_event, 0); - - /* fork off a thread to progress it */ - mca_btl_tcp_progress_thread.t_run = mca_btl_tcp_progress_thread_engine; - mca_btl_tcp_progress_thread.t_arg = &mca_btl_tcp_progress_thread_trigger; - mca_btl_tcp_progress_thread_trigger = 1; /* thread up and running */ - if( OPAL_SUCCESS != (rc = opal_thread_start(&mca_btl_tcp_progress_thread)) ) { - BTL_ERROR(("BTL TCP progress thread initialization failed (%d)", rc)); - opal_event_base_free(mca_btl_tcp_event_base); - /* fall back to only one event base (the one shared by the entire Open MPI framework */ + } + else { + move_forward_with_no_thread: mca_btl_tcp_event_base = opal_sync_event_base; - mca_btl_tcp_progress_thread_trigger = -1; /* thread not started */ - goto move_forward_with_no_thread; - } } - move_forward_with_no_thread: -#else - mca_btl_tcp_event_base = opal_sync_event_base; -#endif if (AF_INET == af_family) { opal_event_set(mca_btl_tcp_event_base, &mca_btl_tcp_component.tcp_recv_event, @@ -1154,6 +1166,7 @@ mca_btl_base_module_t** mca_btl_tcp_component_init(int *num_btl_modules, bool enable_mpi_threads) { int ret = OPAL_SUCCESS; + unsigned int i; mca_btl_base_module_t **btls; *num_btl_modules = 0; @@ -1213,17 +1226,22 @@ mca_btl_base_module_t** mca_btl_tcp_component_init(int *num_btl_modules, if(OPAL_SUCCESS != (ret = mca_btl_tcp_component_exchange() )) { return 0; } - btls = (mca_btl_base_module_t **)malloc(mca_btl_tcp_component.tcp_num_btls * sizeof(mca_btl_base_module_t*)); if(NULL == btls) { return NULL; } + /* Register the btl to support the progress_thread */ + if (0 < mca_btl_tcp_progress_thread_trigger){ + for(i=0;isuper.btl_flags |= MCA_BTL_FLAGS_BTL_PROGRESS_THREAD_ENABLED; + } + } + #if OPAL_CUDA_SUPPORT mca_common_cuda_stage_one_init(); #endif /* OPAL_CUDA_SUPPORT */ - memcpy(btls, mca_btl_tcp_component.tcp_btls, mca_btl_tcp_component.tcp_num_btls*sizeof(mca_btl_tcp_module_t*)); *num_btl_modules = mca_btl_tcp_component.tcp_num_btls; return btls; diff --git a/opal/mca/btl/tcp/btl_tcp_endpoint.c b/opal/mca/btl/tcp/btl_tcp_endpoint.c index 2e291092405..6efc805f5ad 100644 --- a/opal/mca/btl/tcp/btl_tcp_endpoint.c +++ b/opal/mca/btl/tcp/btl_tcp_endpoint.c @@ -309,7 +309,7 @@ static inline void mca_btl_tcp_endpoint_event_init(mca_btl_base_endpoint_t* btl_ btl_endpoint->endpoint_cache_pos = btl_endpoint->endpoint_cache; #endif /* MCA_BTL_TCP_ENDPOINT_CACHE */ - opal_event_set(opal_sync_event_base, &btl_endpoint->endpoint_recv_event, + opal_event_set(mca_btl_tcp_event_base, &btl_endpoint->endpoint_recv_event, btl_endpoint->endpoint_sd, OPAL_EV_READ|OPAL_EV_PERSIST, mca_btl_tcp_endpoint_recv_handler, @@ -320,7 +320,7 @@ static inline void mca_btl_tcp_endpoint_event_init(mca_btl_base_endpoint_t* btl_ * will be fired only once, and when the endpoint is marked as * CONNECTED the event should be recreated with the correct flags. */ - opal_event_set(opal_sync_event_base, &btl_endpoint->endpoint_send_event, + opal_event_set(mca_btl_tcp_event_base, &btl_endpoint->endpoint_send_event, btl_endpoint->endpoint_sd, OPAL_EV_WRITE, mca_btl_tcp_endpoint_send_handler, @@ -368,8 +368,8 @@ int mca_btl_tcp_endpoint_send(mca_btl_base_endpoint_t* btl_endpoint, mca_btl_tcp } else { btl_endpoint->endpoint_send_frag = frag; MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, true, "event_add(send) [endpoint_send]"); - opal_event_add(&btl_endpoint->endpoint_send_event, 0); frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + MCA_BTL_TCP_ACTIVATE_EVENT(&btl_endpoint->endpoint_send_event, 0); } } else { MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, true, "send fragment enqueued [endpoint_send]"); @@ -509,7 +509,7 @@ void mca_btl_tcp_endpoint_accept(mca_btl_base_endpoint_t* btl_endpoint, assert(btl_endpoint->endpoint_sd_next == -1); btl_endpoint->endpoint_sd_next = sd; - opal_event_evtimer_set(opal_sync_event_base, &btl_endpoint->endpoint_accept_event, + opal_event_evtimer_set(mca_btl_tcp_event_base, &btl_endpoint->endpoint_accept_event, mca_btl_tcp_endpoint_complete_accept, btl_endpoint); opal_event_add(&btl_endpoint->endpoint_accept_event, &now); } @@ -570,7 +570,7 @@ static void mca_btl_tcp_endpoint_connected(mca_btl_base_endpoint_t* btl_endpoint MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, true, "READY [endpoint_connected]"); /* Create the send event in a persistent manner. */ - opal_event_set(opal_sync_event_base, &btl_endpoint->endpoint_send_event, + opal_event_set(mca_btl_tcp_event_base, &btl_endpoint->endpoint_send_event, btl_endpoint->endpoint_sd, OPAL_EV_WRITE | OPAL_EV_PERSIST, mca_btl_tcp_endpoint_send_handler, @@ -760,7 +760,7 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo if(opal_socket_errno == EINPROGRESS || opal_socket_errno == EWOULDBLOCK) { btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECTING; MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, true, "event_add(send) [start_connect]"); - opal_event_add(&btl_endpoint->endpoint_send_event, 0); + MCA_BTL_TCP_ACTIVATE_EVENT(&btl_endpoint->endpoint_send_event, 0); return OPAL_SUCCESS; } } @@ -790,6 +790,8 @@ static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_e opal_socklen_t so_length = sizeof(so_error); struct sockaddr_storage endpoint_addr; + opal_event_del(&btl_endpoint->endpoint_send_event); + mca_btl_tcp_proc_tosocks(btl_endpoint->endpoint_addr, &endpoint_addr); /* check connect completion status */ diff --git a/opal/mca/btl/tcp/btl_tcp_frag.c b/opal/mca/btl/tcp/btl_tcp_frag.c index e27e4cc03da..f5e41bb265d 100644 --- a/opal/mca/btl/tcp/btl_tcp_frag.c +++ b/opal/mca/btl/tcp/btl_tcp_frag.c @@ -47,6 +47,15 @@ #include "btl_tcp_frag.h" #include "btl_tcp_endpoint.h" +static void mca_btl_tcp_frag_common_constructor(mca_btl_tcp_frag_t* frag) +{ + frag->base.des_src = NULL; + frag->base.des_src_cnt = 0; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + frag->next_step = MCA_BTL_TCP_FRAG_STEP_UNDEFINED; +} + static void mca_btl_tcp_frag_eager_constructor(mca_btl_tcp_frag_t* frag) { frag->size = mca_btl_tcp_module.super.btl_eager_limit; @@ -106,6 +115,7 @@ size_t mca_btl_tcp_frag_dump(mca_btl_tcp_frag_t* frag, char* msg, char* buf, siz return used; } + bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd) { int cnt=-1; @@ -271,7 +281,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd) frag->iov[1].iov_len = frag->hdr.size; frag->iov_cnt++; #ifndef __sparc -#if !MCA_BTL_TCP_USES_PROGRESS_THREAD +#if !MCA_BTL_TCP_SUPPORT_PROGRESS_THREAD /* The following cannot be done for sparc code * because it causes alignment errors when accessing * structures later on in the btl and pml code. @@ -306,19 +316,3 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd) return false; } -void mca_btl_tcp_dump_frag( mca_btl_tcp_frag_t* frag, char* msg ) -{ - int i; - - mca_btl_base_err("%s %s frag %p (endpoint %p) size %lu iov_cnt %d iov_idx %d next_step %s rc %d\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg, (void*)frag, (void*)frag->endpoint, frag->size, frag->iov_cnt, frag->iov_idx, - (MCA_BTL_TCP_FRAG_STEP_UNDEFINED == frag->next_step ? "undefined" : - (MCA_BTL_TCP_FRAG_STEP_SEND_COMPLETE == frag->next_step ? "send_complete" : - (MCA_BTL_TCP_FRAG_STEP_RECV_COMPLETE == frag->next_step ? "recv_complete" : "unknown"))), - frag->rc); - for( i = 0; i < frag->iov_cnt; i++ ) { - mca_btl_base_err("%s %s | iov[%d] = {%p, %lu}\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg, i, frag->iov[i].iov_base, frag->iov[i].iov_len); - } - mca_btl_base_err("%s +\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); -} diff --git a/opal/mca/btl/tcp/btl_tcp_frag.h b/opal/mca/btl/tcp/btl_tcp_frag.h index 9802069f4fa..4d9e714a07d 100644 --- a/opal/mca/btl/tcp/btl_tcp_frag.h +++ b/opal/mca/btl/tcp/btl_tcp_frag.h @@ -35,6 +35,7 @@ #include #endif +#include "orte/util/name_fns.h" #include "btl_tcp.h" #include "btl_tcp_hdr.h" @@ -56,6 +57,7 @@ struct mca_btl_tcp_frag_t { size_t iov_cnt; size_t iov_idx; size_t size; + uint16_t next_step; int rc; opal_free_list_t* my_list; /* fake rdma completion */ @@ -126,8 +128,6 @@ do { \ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t*, int sd); bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t*, int sd); - -size_t mca_btl_tcp_frag_dump(mca_btl_tcp_frag_t*, char*, char*, size_t); - +size_t mca_btl_tcp_frag_dump(mca_btl_tcp_frag_t* frag, char* msg, char* buf, size_t length); END_C_DECLS #endif diff --git a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt index 6f7c0e867e1..18c1521072b 100644 --- a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt +++ b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt @@ -56,6 +56,16 @@ most common causes when it does occur are: * The operating system ran out of file descriptors * The operating system ran out of memory +[unsuported progress thread] +WARNING: Support for the TCP progress thread has not been compiled in. +Fall back to the normal progress. + + Local host: %s + Value: %s + Message: %s + +# + Your Open MPI job will likely hang until the failure resason is fixed (e.g., more file descriptors and/or memory becomes available), and may eventually timeout / abort. From f69eba1bc42634d6194ceadca9f537f7c91200a9 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Mon, 21 Mar 2016 14:56:33 -0400 Subject: [PATCH 3/3] Update the copyright and cleanup the code. Per @jsquyres suggestion remove all trailing spaces. Credit to `sed -i.bak 's/ *$//' */[ch]`. --- .../btl_tcp2.c | 29 +------ .../btl_tcp2_endpoint.c | 77 +++++-------------- .../btl_tcp2_proc.c | 6 +- ompi/mca/pml/ob1/pml_ob1.h | 8 +- ompi/mca/pml/ob1/pml_ob1_component.c | 4 +- ompi/mca/pml/ob1/pml_ob1_recvfrag.c | 2 +- ompi/mca/pml/ob1/pml_ob1_recvreq.c | 2 +- opal/mca/btl/btl.h | 2 +- opal/mca/btl/tcp/btl_tcp.h | 40 +--------- opal/mca/btl/tcp/btl_tcp_component.c | 54 +++---------- opal/mca/btl/tcp/btl_tcp_endpoint.c | 2 +- opal/mca/btl/tcp/btl_tcp_endpoint.h | 2 +- opal/mca/btl/tcp/btl_tcp_frag.c | 28 ++----- opal/mca/btl/tcp/btl_tcp_frag.h | 3 +- opal/mca/btl/tcp/btl_tcp_hdr.h | 2 +- opal/mca/btl/tcp/btl_tcp_proc.h | 2 +- opal/mca/btl/tcp/help-mpi-btl-tcp.txt | 3 + 17 files changed, 62 insertions(+), 204 deletions(-) diff --git a/contrib/build-mca-comps-outside-of-tree/btl_tcp2.c b/contrib/build-mca-comps-outside-of-tree/btl_tcp2.c index 59fdc7fe75c..e2188d81383 100644 --- a/contrib/build-mca-comps-outside-of-tree/btl_tcp2.c +++ b/contrib/build-mca-comps-outside-of-tree/btl_tcp2.c @@ -33,7 +33,7 @@ #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/mpool/mpool.h" #include "btl_tcp.h" -#include "btl_tcp_frag.h" +#include "btl_tcp_frag.h" #include "btl_tcp_proc.h" #include "btl_tcp_endpoint.h" @@ -61,7 +61,7 @@ mca_btl_tcp2_module_t mca_btl_tcp2_module = { mca_btl_tcp2_send, NULL, /* send immediate */ mca_btl_tcp_put, - NULL, /* get */ + NULL, /* get */ mca_btl_tcp_dump, NULL, /* mpool */ NULL, /* register error */ @@ -190,11 +190,7 @@ mca_btl_base_descriptor_t* mca_btl_tcp2_alloc( if( OPAL_UNLIKELY(NULL == frag) ) { return NULL; } - -#define GB_DEFINED 0 -#if GB_DEFINED - opal_output(0, "alloc_frag( size = %lu )\n", size); -#endif /* GB_DEFINED */ + frag->segments[0].seg_len = size; frag->segments[0].seg_addr.pval = frag+1; @@ -308,10 +304,6 @@ mca_btl_base_descriptor_t* mca_btl_tcp2_prepare_src( frag->base.des_flags = flags; frag->base.order = MCA_BTL_NO_ORDER; *size = max_data; -#if GB_DEFINED - opal_output(0, "prepare_src( bConverted = %lu, size = %lu\n", - convertor->bConverted, *size); -#endif /* GB_DEFINED */ return &frag->base; } @@ -359,10 +351,6 @@ mca_btl_base_descriptor_t* mca_btl_tcp2_prepare_dst( frag->base.des_dst_cnt = 1; frag->base.des_flags = flags; frag->base.order = MCA_BTL_NO_ORDER; -#if GB_DEFINED - opal_output(0, " prepare_dst( bConverted = %lu, size = %lu\n", - convertor->bConverted, *size); -#endif /* GB_DEFINED */ return &frag->base; } @@ -404,9 +392,6 @@ int mca_btl_tcp2_send( struct mca_btl_base_module_t* btl, frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_SEND; frag->hdr.count = 0; if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr); -#if GB_DEFINED - opal_output(0, "frag_send( size = %u )\n", frag->hdr.size ); -#endif /* GB_DEFINED */ return mca_btl_tcp_endpoint_send(endpoint,frag); } @@ -448,9 +433,6 @@ int mca_btl_tcp2_put( mca_btl_base_module_t* btl, frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_PUT; frag->hdr.count = frag->base.des_dst_cnt; if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr); -#if GB_DEFINED - opal_output(0, "frag_put( size = %u )\n", frag->hdr.size ); -#endif /* GB_DEFINED */ return ((i = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OMPI_SUCCESS : i); } @@ -488,9 +470,6 @@ int mca_btl_tcp2_get( frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_GET; frag->hdr.count = frag->base.des_src_cnt; if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr); -#if GB_DEFINED - opal_output(0, "frag_get( size = %u )\n", frag->hdr.size ); -#endif /* GB_DEFINED */ return ((rc = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OMPI_SUCCESS : rc); } @@ -541,7 +520,7 @@ void mca_btl_tcp_dump(struct mca_btl_base_module_t* base_btl, opal_list_item_t *item; for(item = opal_list_get_first(&btl->tcp_endpoints); - item != opal_list_get_end(&btl->tcp_endpoints); + item != opal_list_get_end(&btl->tcp_endpoints); item = opal_list_get_next(item)) { mca_btl_tcp_endpoint_dump( (mca_btl_base_endpoint_t*)item, "TCP" ); } diff --git a/contrib/build-mca-comps-outside-of-tree/btl_tcp2_endpoint.c b/contrib/build-mca-comps-outside-of-tree/btl_tcp2_endpoint.c index 778769d16ca..a1998de71af 100644 --- a/contrib/build-mca-comps-outside-of-tree/btl_tcp2_endpoint.c +++ b/contrib/build-mca-comps-outside-of-tree/btl_tcp2_endpoint.c @@ -55,7 +55,7 @@ #include "ompi/mca/btl/base/btl_base_error.h" #include "ompi/mca/rte/rte.h" -#include "btl_tcp_endpoint.h" +#include "btl_tcp_endpoint.h" #include "btl_tcp_proc.h" #include "btl_tcp_frag.h" @@ -160,7 +160,7 @@ void mca_btl_tcp_endpoint_dump(mca_btl_base_endpoint_t* btl_endpoint, const char #endif if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) { - BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)", + BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)", strerror(opal_socket_errno), opal_socket_errno)); } @@ -176,7 +176,7 @@ void mca_btl_tcp_endpoint_dump(mca_btl_base_endpoint_t* btl_endpoint, const char #if defined(SO_RCVBUF) obtlen = sizeof(rcvbuf); if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_RCVBUF, (char *)&rcvbuf, &obtlen) < 0) { - BTL_ERROR(("SO_RCVBUF option: %s (%d)", + BTL_ERROR(("SO_RCVBUF option: %s (%d)", strerror(opal_socket_errno), opal_socket_errno)); } #else @@ -185,7 +185,7 @@ void mca_btl_tcp_endpoint_dump(mca_btl_base_endpoint_t* btl_endpoint, const char #if defined(TCP_NODELAY) obtlen = sizeof(nodelay); if(getsockopt(btl_endpoint->endpoint_sd, IPPROTO_TCP, TCP_NODELAY, (char *)&nodelay, &obtlen) < 0) { - BTL_ERROR(("TCP_NODELAY option: %s (%d)", + BTL_ERROR(("TCP_NODELAY option: %s (%d)", strerror(opal_socket_errno), opal_socket_errno)); } #else @@ -193,7 +193,7 @@ void mca_btl_tcp_endpoint_dump(mca_btl_base_endpoint_t* btl_endpoint, const char #endif } - mca_btl_base_err("%s %s: endpoint %p src %s - dst %s nodelay %d sndbuf %d rcvbuf %d flags %08x\n", + mca_btl_base_err("%s %s: endpoint %p src %s - dst %s nodelay %d sndbuf %d rcvbuf %d flags %08x\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg, (void*)btl_endpoint, src, dst, nodelay, sndbuf, rcvbuf, flags); switch(btl_endpoint->endpoint_state) { @@ -222,7 +222,7 @@ void mca_btl_tcp_endpoint_dump(mca_btl_base_endpoint_t* btl_endpoint, const char #endif /* MCA_BTL_TCP_ENDPOINT_CACHE */ (void*)btl_endpoint->endpoint_send_frag, (void*)btl_endpoint->endpoint_recv_frag ); for(item = opal_list_get_first(&btl_endpoint->endpoint_frags); - item != opal_list_get_end(&btl_endpoint->endpoint_frags); + item != opal_list_get_end(&btl_endpoint->endpoint_frags); item = opal_list_get_next(item)) { mca_btl_tcp_dump_frag( (mca_btl_tcp_frag_t*)item, " | send" ); } @@ -239,9 +239,9 @@ static inline void mca_btl_tcp2_endpoint_event_init(mca_btl_base_endpoint_t* btl btl_endpoint->endpoint_cache_pos = btl_endpoint->endpoint_cache; #endif /* MCA_BTL_TCP_ENDPOINT_CACHE */ - opal_event_set(mca_btl_tcp_event_base, &btl_endpoint->endpoint_recv_event, - btl_endpoint->endpoint_sd, - OPAL_EV_READ|OPAL_EV_PERSIST, + opal_event_set(mca_btl_tcp_event_base, &btl_endpoint->endpoint_recv_event, + btl_endpoint->endpoint_sd, + OPAL_EV_READ|OPAL_EV_PERSIST, mca_btl_tcp_endpoint_recv_handler, btl_endpoint ); /** @@ -250,9 +250,9 @@ static inline void mca_btl_tcp2_endpoint_event_init(mca_btl_base_endpoint_t* btl * will be fired only once, and when the endpoint is marked as * CONNECTED the event should be recreated with the correct flags. */ - opal_event_set(mca_btl_tcp_event_base, &btl_endpoint->endpoint_send_event, - btl_endpoint->endpoint_sd, - OPAL_EV_WRITE, + opal_event_set(mca_btl_tcp_event_base, &btl_endpoint->endpoint_send_event, + btl_endpoint->endpoint_sd, + OPAL_EV_WRITE, mca_btl_tcp_endpoint_send_handler, btl_endpoint); } @@ -291,11 +291,6 @@ int mca_btl_tcp2_endpoint_send(mca_btl_base_endpoint_t* btl_endpoint, mca_btl_tc } else { btl_endpoint->endpoint_send_frag = frag; frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; -#define GB_DEFINED 0 -#if GB_DEFINED - opal_output(0, "%s:%d add the send event on socket %d\n", - __FILE__, __LINE__, btl_endpoint->endpoint_sd); /* GB */ -#endif /* GB_DEFINED */ MCA_BTL_TCP_ACTIVATE_EVENT(&btl_endpoint->endpoint_send_event, 0); } } else { @@ -376,7 +371,7 @@ bool mca_btl_tcp2_endpoint_accept(mca_btl_base_endpoint_t* btl_endpoint, OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock); OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock); - cmpval = ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, + cmpval = ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, &endpoint_proc->proc_ompi->proc_name, this_proc); if((btl_endpoint->endpoint_sd < 0) || @@ -394,10 +389,6 @@ bool mca_btl_tcp2_endpoint_accept(mca_btl_base_endpoint_t* btl_endpoint, /* NOT NEEDED if we remove the PERSISTENT flag when we create the * first recv_event. */ -#if GB_DEFINED - opal_output(0, "%s:%d add the recv event on socket %d\n", - __FILE__, __LINE__, btl_endpoint->endpoint_sd); /* GB */ -#endif /* GB_DEFINED */ opal_event_add(&btl_endpoint->endpoint_recv_event, 0); /* TODO */ mca_btl_tcp_endpoint_connected(btl_endpoint); #if OPAL_ENABLE_DEBUG && WANT_PEER_DUMP @@ -452,8 +443,8 @@ static void mca_btl_tcp2_endpoint_connected(mca_btl_base_endpoint_t* btl_endpoin btl_endpoint->endpoint_retries = 0; /* Create the send event in a persistent manner. */ - opal_event_set(mca_btl_tcp_event_base, &btl_endpoint->endpoint_send_event, - btl_endpoint->endpoint_sd, + opal_event_set(mca_btl_tcp_event_base, &btl_endpoint->endpoint_send_event, + btl_endpoint->endpoint_sd, OPAL_EV_WRITE | OPAL_EV_PERSIST, mca_btl_tcp_endpoint_send_handler, btl_endpoint ); @@ -463,10 +454,6 @@ static void mca_btl_tcp2_endpoint_connected(mca_btl_base_endpoint_t* btl_endpoin btl_endpoint->endpoint_send_frag = (mca_btl_tcp_frag_t*) opal_list_remove_first(&btl_endpoint->endpoint_frags); } -#if GB_DEFINED - opal_output(0, "%s:%d add the send event on socket %d\n", - __FILE__, __LINE__, btl_endpoint->endpoint_sd); /* GB */ -#endif /* GB_DEFINED */ opal_event_add(&btl_endpoint->endpoint_send_event, 0); } } @@ -618,10 +605,6 @@ static int mca_btl_tcp2_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endp /* non-blocking so wait for completion */ if(opal_socket_errno == EINPROGRESS || opal_socket_errno == EWOULDBLOCK) { btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECTING; -#if GB_DEFINED - opal_output(0, "%s:%d add the send event on socket %d\n", - __FILE__, __LINE__, btl_endpoint->endpoint_sd); /* GB */ -#endif /* GB_DEFINED */ MCA_BTL_TCP_ACTIVATE_EVENT(&btl_endpoint->endpoint_send_event, 0); return OMPI_SUCCESS; } @@ -641,10 +624,6 @@ static int mca_btl_tcp2_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endp /* send our globally unique process identifier to the endpoint */ if((rc = mca_btl_tcp2_endpoint_send_connect_ack(btl_endpoint)) == OMPI_SUCCESS) { btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK; -#if GB_DEFINED - opal_output(0, "%s:%d add the recv event on socket %d\n", - __FILE__, __LINE__, btl_endpoint->endpoint_sd); /* GB */ -#endif /* GB_DEFINED */ MCA_BTL_TCP_ACTIVATE_EVENT(&btl_endpoint->endpoint_recv_event, 0); } else { mca_btl_tcp2_endpoint_close(btl_endpoint); @@ -667,10 +646,6 @@ static void mca_btl_tcp2_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_ mca_btl_tcp2_proc_tosocks(btl_endpoint->endpoint_addr, &endpoint_addr); /* unregister from receiving event notifications */ -#if GB_DEFINED - opal_output(0, "%s:%d remove the send event on socket %d\n", - __FILE__, __LINE__, btl_endpoint->endpoint_sd); /* GB */ -#endif /* GB_DEFINED */ opal_event_del(&btl_endpoint->endpoint_send_event); /* check connect completion status */ @@ -682,10 +657,6 @@ static void mca_btl_tcp2_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_ return; } if(so_error == EINPROGRESS || so_error == EWOULDBLOCK) { -#if GB_DEFINED - opal_output(0, "%s:%d add the send event on socket %d\n", - __FILE__, __LINE__, btl_endpoint->endpoint_sd); /* GB */ -#endif /* GB_DEFINED */ opal_event_add(&btl_endpoint->endpoint_send_event, 0); return; } @@ -699,10 +670,6 @@ static void mca_btl_tcp2_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_ if(mca_btl_tcp2_endpoint_send_connect_ack(btl_endpoint) == OMPI_SUCCESS) { btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK; -#if GB_DEFINED - opal_output(0, "%s:%d add the recv event on socket %d\n", - __FILE__, __LINE__, btl_endpoint->endpoint_sd); /* GB */ -#endif /* GB_DEFINED */ opal_event_add(&btl_endpoint->endpoint_recv_event, 0); } else { mca_btl_tcp2_endpoint_close(btl_endpoint); @@ -752,8 +719,8 @@ static void mca_btl_tcp2_endpoint_recv_handler(int sd, short flags, void* user) data_still_pending_on_endpoint: if(NULL == frag) { - if(mca_btl_tcp_module.super.btl_max_send_size > - mca_btl_tcp_module.super.btl_eager_limit) { + if(mca_btl_tcp_module.super.btl_max_send_size > + mca_btl_tcp_module.super.btl_eager_limit) { MCA_BTL_TCP_FRAG_ALLOC_MAX(frag); } else { MCA_BTL_TCP_FRAG_ALLOC_EAGER(frag); @@ -771,7 +738,7 @@ static void mca_btl_tcp2_endpoint_recv_handler(int sd, short flags, void* user) btl_endpoint->endpoint_recv_frag = frag; } else { btl_endpoint->endpoint_recv_frag = NULL; - + TODO_MCA_BTL_TCP_RECV_TRIGGER_CB(frag); #if MCA_BTL_TCP_ENDPOINT_CACHE @@ -853,19 +820,11 @@ static void mca_btl_tcp2_endpoint_send_handler(int sd, short flags, void* user) /* if no more data to send unregister the send notifications */ if(NULL == btl_endpoint->endpoint_send_frag) { -#if GB_DEFINED - opal_output(0, "%s:%d remove the send event on socket %d\n", - __FILE__, __LINE__, sd); /* GB */ -#endif /* GB_DEFINED */ opal_event_del(&btl_endpoint->endpoint_send_event); } break; default: BTL_ERROR(("invalid connection state (%d)", btl_endpoint->endpoint_state)); -#if GB_DEFINED - opal_output(0, "%s:%d remove the send event on socket %d\n", - __FILE__, __LINE__, sd); /* GB */ -#endif /* GB_DEFINED */ opal_event_del(&btl_endpoint->endpoint_send_event); break; } diff --git a/contrib/build-mca-comps-outside-of-tree/btl_tcp2_proc.c b/contrib/build-mca-comps-outside-of-tree/btl_tcp2_proc.c index 2834faa45a3..e81818e81ee 100644 --- a/contrib/build-mca-comps-outside-of-tree/btl_tcp2_proc.c +++ b/contrib/build-mca-comps-outside-of-tree/btl_tcp2_proc.c @@ -79,7 +79,7 @@ void mca_btl_tcp2_proc_destruct(mca_btl_tcp2_proc_t* tcp_proc) { /* remove from list of all proc instances */ MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_component.tcp_lock); - opal_hash_table_remove_value_uint64(&mca_btl_tcp_component.tcp_procs, + opal_hash_table_remove_value_uint64(&mca_btl_tcp_component.tcp_procs, ompi_rte_hash_name(&tcp_proc->proc_ompi->proc_name)); MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_component.tcp_lock); @@ -105,7 +105,7 @@ mca_btl_tcp2_proc_t* mca_btl_tcp2_proc_create(ompi_proc_t* ompi_proc) mca_btl_tcp2_proc_t* btl_proc; MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_component.tcp_lock); - rc = opal_hash_table_get_value_uint64(&mca_btl_tcp_component.tcp_procs, + rc = opal_hash_table_get_value_uint64(&mca_btl_tcp_component.tcp_procs, hash, (void**)&btl_proc); if(OMPI_SUCCESS == rc) { MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_component.tcp_lock); @@ -713,7 +713,7 @@ mca_btl_tcp2_proc_t* mca_btl_tcp2_proc_lookup(const orte_process_name_t *name) { mca_btl_tcp_proc_t* proc = NULL; MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_component.tcp_lock); - opal_hash_table_get_value_uint64(&mca_btl_tcp_component.tcp_procs, + opal_hash_table_get_value_uint64(&mca_btl_tcp_component.tcp_procs, ompi_rte_hash_name(name), (void**)&proc); MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_component.tcp_lock); return proc; diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h index aaa581e3282..1f78c512076 100644 --- a/ompi/mca/pml/ob1/pml_ob1.h +++ b/ompi/mca/pml/ob1/pml_ob1.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2016 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -267,7 +267,7 @@ do { \ opal_mutex_lock(lock); \ } \ else { OPAL_THREAD_LOCK(lock); } \ - } while(0) + } while(0) #define OB1_MATCHING_UNLOCK(lock) \ @@ -276,10 +276,10 @@ do { \ opal_mutex_unlock(lock); \ } \ else { OPAL_THREAD_UNLOCK(lock); } \ - } while(0) + } while(0) + - int mca_pml_ob1_send_fin(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, opal_ptr_t hdr_frag, uint64_t size, uint8_t order, int status); diff --git a/ompi/mca/pml/ob1/pml_ob1_component.c b/ompi/mca/pml/ob1/pml_ob1_component.c index 65152ea5139..5445f4b8211 100644 --- a/ompi/mca/pml/ob1/pml_ob1_component.c +++ b/ompi/mca/pml/ob1/pml_ob1_component.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2009 The University of Tennessee and The University + * Copyright (c) 2004-2016 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -286,7 +286,7 @@ mca_pml_ob1_component_init( int* priority, mca_pml_ob1.super.pml_flags |= MCA_PML_BASE_FLAG_REQUIRE_WORLD; break; } - + } /* Set this here (vs in component_open()) because diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index 45758eb0d27..5f3f8fdc484 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2016 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index cc665e9c607..9d1d402e165 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2016 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 11ef04cc018..fb23a095506 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2008 The University of Tennessee and The University + * Copyright (c) 2004-2016 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/opal/mca/btl/tcp/btl_tcp.h b/opal/mca/btl/tcp/btl_tcp.h index ed3214c6f79..f2c8917b588 100644 --- a/opal/mca/btl/tcp/btl_tcp.h +++ b/opal/mca/btl/tcp/btl_tcp.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2012 The University of Tennessee and The University + * Copyright (c) 2004-2016 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -92,23 +92,6 @@ extern int mca_btl_tcp_progress_thread_trigger; #define MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(name) \ opal_mutex_atomic_unlock((name)) -#define TODO_MCA_BTL_TCP_COMPLETE_FRAG_SEND(frag) \ - do { \ - (frag)->next_step = MCA_BTL_TCP_FRAG_STEP_SEND_COMPLETE; \ - MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_ready_frag_mutex); \ - opal_list_append(&mca_btl_tcp_ready_frag_pending_queue, \ - (opal_list_item_t*)frag); \ - MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_ready_frag_mutex); \ - } while (0) - -#define TODO_MCA_BTL_TCP_RECV_TRIGGER_CB(frag) \ - do { \ - (frag)->next_step = MCA_BTL_TCP_FRAG_STEP_RECV_COMPLETE; \ - MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_ready_frag_mutex); \ - opal_list_append(&mca_btl_tcp_ready_frag_pending_queue, \ - (opal_list_item_t*)frag); \ - MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_ready_frag_mutex); \ - } while (0) #define MCA_BTL_TCP_ACTIVATE_EVENT(event, value) \ do { \ if(0 < mca_btl_tcp_progress_thread_trigger) { \ @@ -123,10 +106,6 @@ extern int mca_btl_tcp_progress_thread_trigger; #else #define MCA_BTL_TCP_CRITICAL_SECTION_ENTER(name) #define MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(name) -#define TODO_MCA_BTL_TCP_COMPLETE_FRAG_SEND(frag) \ - MCA_BTL_TCP_COMPLETE_FRAG_SEND(frag) -#define TODO_MCA_BTL_TCP_RECV_TRIGGER_CB(frag) \ - MCA_BTL_TCP_RECV_TRIGGER_CB(frag) #define MCA_BTL_TCP_ACTIVATE_EVENT(event, value) \ do { \ opal_event_add(event, (value)); \ @@ -235,13 +214,6 @@ extern mca_btl_base_module_t** mca_btl_tcp_component_init( ); -/** - * TCP component progress. - */ -extern int mca_btl_tcp_component_progress(void); - - - /** * Cleanup any resources held by the BTL. * @@ -382,16 +354,6 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src( uint32_t flags ); -extern mca_btl_base_descriptor_t* mca_btl_tcp_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* peer, - struct mca_mpool_base_registration_t*, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags); - extern void mca_btl_tcp_dump(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, diff --git a/opal/mca/btl/tcp/btl_tcp_component.c b/opal/mca/btl/tcp/btl_tcp_component.c index 19ae4f8a148..a2c141833c3 100644 --- a/opal/mca/btl/tcp/btl_tcp_component.c +++ b/opal/mca/btl/tcp/btl_tcp_component.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2016 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -73,7 +73,7 @@ #include "ompi/constants.h" #include "opal/mca/btl/btl.h" -#include "opal/mca/btl/base/base.h" +#include "opal/mca/btl/base/base.h" #include "opal/mca/btl/base/btl_base_error.h" #include "btl_tcp.h" #include "btl_tcp_addr.h" @@ -118,11 +118,7 @@ mca_btl_tcp_component_t mca_btl_tcp_component = { }, .btl_init = mca_btl_tcp_component_init, -//#if MCA_BTL_TCP_USES_PROGRESS_THREAD -// mca_btl_tcp_component_progress, -//#else - NULL, -//#endif /* MCA_BTL_TCP_USES_PROGRESS_THREAD */ + .btl_progress = NULL, } }; @@ -397,7 +393,7 @@ static int mca_btl_tcp_component_close(void) if (NULL != mca_btl_tcp_component.tcp_btls) { free(mca_btl_tcp_component.tcp_btls); } - + if (mca_btl_tcp_component.tcp_listen_sd >= 0) { opal_event_del(&mca_btl_tcp_component.tcp_recv_event); CLOSE_THE_SOCKET(mca_btl_tcp_component.tcp_listen_sd); @@ -414,7 +410,7 @@ static int mca_btl_tcp_component_close(void) /* cleanup any pending events */ MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_component.tcp_lock); for(item = opal_list_get_first(&mca_btl_tcp_component.tcp_events); - item != opal_list_get_end(&mca_btl_tcp_component.tcp_events); + item != opal_list_get_end(&mca_btl_tcp_component.tcp_events); item = next) { mca_btl_tcp_event_t* event = (mca_btl_tcp_event_t*)item; next = opal_list_get_next(item); @@ -438,7 +434,7 @@ static int mca_btl_tcp_component_close(void) OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_eager_mutex); OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_max_mutex); - if( (NULL != mca_btl_tcp_event_base) && + if( (NULL != mca_btl_tcp_event_base) && (mca_btl_tcp_event_base != opal_sync_event_base) ) { /* Turn of the progress thread before moving forward */ if( -1 != mca_btl_tcp_progress_thread_trigger ) { @@ -790,35 +786,6 @@ static void mca_btl_tcp_component_event_async_handler(int fd, short unused, void opal_event_add(event, 0); } } -/* -int mca_btl_tcp_component_progress( void ) -{ - mca_btl_tcp_frag_t* frag; - int count = 0; - - for( ;; ) { - MCA_BTL_TCP_CRITICAL_SECTION_ENTER(&mca_btl_tcp_ready_frag_mutex); - frag = (mca_btl_tcp_frag_t*)opal_list_remove_first(&mca_btl_tcp_ready_frag_pending_queue); - MCA_BTL_TCP_CRITICAL_SECTION_LEAVE(&mca_btl_tcp_ready_frag_mutex); - if( NULL == frag ) break; - - switch(frag->next_step) { - case MCA_BTL_TCP_FRAG_STEP_SEND_COMPLETE: - MCA_BTL_TCP_COMPLETE_FRAG_SEND(frag); - count++; - break; - case MCA_BTL_TCP_FRAG_STEP_RECV_COMPLETE: - MCA_BTL_TCP_RECV_TRIGGER_CB(frag); - MCA_BTL_TCP_FRAG_RETURN(frag); - count++; - break; - default: - break; - } - } - return count; -} -*/ #endif /* @@ -1010,12 +977,12 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family) } /* setup the receiving end of the pipe as non-blocking */ if((flags = fcntl(mca_btl_tcp_pipe_to_progress[0], F_GETFL, 0)) < 0) { - BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)", + BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)", strerror(opal_socket_errno), opal_socket_errno)); } else { flags |= O_NONBLOCK; if(fcntl(mca_btl_tcp_pipe_to_progress[0], F_SETFL, flags) < 0) - BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)", + BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)", strerror(opal_socket_errno), opal_socket_errno)); } /* Progress thread event */ @@ -1233,8 +1200,8 @@ mca_btl_base_module_t** mca_btl_tcp_component_init(int *num_btl_modules, } /* Register the btl to support the progress_thread */ - if (0 < mca_btl_tcp_progress_thread_trigger){ - for(i=0;isuper.btl_flags |= MCA_BTL_FLAGS_BTL_PROGRESS_THREAD_ENABLED; } } @@ -1242,6 +1209,7 @@ mca_btl_base_module_t** mca_btl_tcp_component_init(int *num_btl_modules, #if OPAL_CUDA_SUPPORT mca_common_cuda_stage_one_init(); #endif /* OPAL_CUDA_SUPPORT */ + memcpy(btls, mca_btl_tcp_component.tcp_btls, mca_btl_tcp_component.tcp_num_btls*sizeof(mca_btl_tcp_module_t*)); *num_btl_modules = mca_btl_tcp_component.tcp_num_btls; return btls; diff --git a/opal/mca/btl/tcp/btl_tcp_endpoint.c b/opal/mca/btl/tcp/btl_tcp_endpoint.c index 6efc805f5ad..6fd5546af39 100644 --- a/opal/mca/btl/tcp/btl_tcp_endpoint.c +++ b/opal/mca/btl/tcp/btl_tcp_endpoint.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2015 The University of Tennessee and The University + * Copyright (c) 2004-2016 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/opal/mca/btl/tcp/btl_tcp_endpoint.h b/opal/mca/btl/tcp/btl_tcp_endpoint.h index 0798f3b0814..28b45ddfe01 100644 --- a/opal/mca/btl/tcp/btl_tcp_endpoint.h +++ b/opal/mca/btl/tcp/btl_tcp_endpoint.h @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2012 The University of Tennessee and The University + * Copyright (c) 2004-2016 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/opal/mca/btl/tcp/btl_tcp_frag.c b/opal/mca/btl/tcp/btl_tcp_frag.c index f5e41bb265d..85eee6be42c 100644 --- a/opal/mca/btl/tcp/btl_tcp_frag.c +++ b/opal/mca/btl/tcp/btl_tcp_frag.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2016 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -47,18 +47,9 @@ #include "btl_tcp_frag.h" #include "btl_tcp_endpoint.h" -static void mca_btl_tcp_frag_common_constructor(mca_btl_tcp_frag_t* frag) -{ - frag->base.des_src = NULL; - frag->base.des_src_cnt = 0; - frag->base.des_dst = NULL; - frag->base.des_dst_cnt = 0; - frag->next_step = MCA_BTL_TCP_FRAG_STEP_UNDEFINED; -} - -static void mca_btl_tcp_frag_eager_constructor(mca_btl_tcp_frag_t* frag) -{ - frag->size = mca_btl_tcp_module.super.btl_eager_limit; +static void mca_btl_tcp_frag_eager_constructor(mca_btl_tcp_frag_t* frag) +{ + frag->size = mca_btl_tcp_module.super.btl_eager_limit; frag->my_list = &mca_btl_tcp_component.tcp_frag_eager; } @@ -115,7 +106,6 @@ size_t mca_btl_tcp_frag_dump(mca_btl_tcp_frag_t* frag, char* msg, char* buf, siz return used; } - bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd) { int cnt=-1; @@ -160,11 +150,9 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd) frag->iov_ptr->iov_base = (opal_iov_base_ptr_t) (((unsigned char*)frag->iov_ptr->iov_base) + cnt); frag->iov_ptr->iov_len -= cnt; -#define GB_DEFINED 0 -#if GB_DEFINED - opal_output(0, "%s:%d write %lu bytes from socket %d\n", - __FILE__, __LINE__, cnt, sd); /* GB */ -#endif /* GB_DEFINED */ + OPAL_OUTPUT_VERBOSE((100, opal_btl_base_framework.framework_output, + "%s:%d write %d bytes on socket %d\n", + __FILE__, __LINE__, cnt, sd)); break; } } @@ -282,7 +270,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd) frag->iov_cnt++; #ifndef __sparc #if !MCA_BTL_TCP_SUPPORT_PROGRESS_THREAD - /* The following cannot be done for sparc code + /* The following cannot be done for sparc code * because it causes alignment errors when accessing * structures later on in the btl and pml code. */ diff --git a/opal/mca/btl/tcp/btl_tcp_frag.h b/opal/mca/btl/tcp/btl_tcp_frag.h index 4d9e714a07d..b73da8f6edb 100644 --- a/opal/mca/btl/tcp/btl_tcp_frag.h +++ b/opal/mca/btl/tcp/btl_tcp_frag.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2016 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -35,7 +35,6 @@ #include #endif -#include "orte/util/name_fns.h" #include "btl_tcp.h" #include "btl_tcp_hdr.h" diff --git a/opal/mca/btl/tcp/btl_tcp_hdr.h b/opal/mca/btl/tcp/btl_tcp_hdr.h index 70467e43713..b7977762012 100644 --- a/opal/mca/btl/tcp/btl_tcp_hdr.h +++ b/opal/mca/btl/tcp/btl_tcp_hdr.h @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2012 The University of Tennessee and The University + * Copyright (c) 2004-2005 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/opal/mca/btl/tcp/btl_tcp_proc.h b/opal/mca/btl/tcp/btl_tcp_proc.h index 064e81fa2da..5e56cd7deab 100644 --- a/opal/mca/btl/tcp/btl_tcp_proc.h +++ b/opal/mca/btl/tcp/btl_tcp_proc.h @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2012 The University of Tennessee and The University + * Copyright (c) 2004-2006 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, diff --git a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt index 18c1521072b..b3d19cea28f 100644 --- a/opal/mca/btl/tcp/help-mpi-btl-tcp.txt +++ b/opal/mca/btl/tcp/help-mpi-btl-tcp.txt @@ -1,6 +1,9 @@ # -*- text -*- # # Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2015-2016 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. # $COPYRIGHT$ # # Additional copyrights may follow