From 2f91ce728151e26096105da51232c0c8886f9b03 Mon Sep 17 00:00:00 2001 From: Alex Mikheev Date: Tue, 25 Oct 2016 18:07:16 +0300 Subject: [PATCH 01/14] OSHMEM: mxm versions less than 2.0 are no longer supported Signed-off-by: Alex Mikheev --- oshmem/mca/atomic/mxm/atomic_mxm.h | 3 +- oshmem/mca/atomic/mxm/atomic_mxm_cswap.c | 9 - oshmem/mca/atomic/mxm/atomic_mxm_fadd.c | 14 -- oshmem/mca/spml/ikrit/spml_ikrit.c | 214 +------------------ oshmem/mca/spml/ikrit/spml_ikrit.h | 9 - oshmem/mca/spml/ikrit/spml_ikrit_component.c | 27 +-- 6 files changed, 10 insertions(+), 266 deletions(-) diff --git a/oshmem/mca/atomic/mxm/atomic_mxm.h b/oshmem/mca/atomic/mxm/atomic_mxm.h index 8d06cb3fcf0..c2a98b892f5 100644 --- a/oshmem/mca/atomic/mxm/atomic_mxm.h +++ b/oshmem/mca/atomic/mxm/atomic_mxm.h @@ -62,7 +62,7 @@ OBJ_CLASS_DECLARATION(mca_atomic_mxm_module_t); END_C_DECLS -#if MXM_API >= MXM_VERSION(2,0) +/* move to spml/ikrit */ static inline mxm_mem_key_t *to_mxm_mkey(sshmem_mkey_t *mkey) { if (0 == mkey->len) { @@ -70,6 +70,5 @@ static inline mxm_mem_key_t *to_mxm_mkey(sshmem_mkey_t *mkey) { } return (mxm_mem_key_t *)mkey->u.data; } -#endif #endif /* MCA_ATOMIC_MXM_H */ diff --git a/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c b/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c index 8e56a1014a0..70a6f96248c 100644 --- a/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c +++ b/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c @@ -87,23 +87,14 @@ int mca_atomic_mxm_cswap(void *target, sreq.base.data.buffer.memh = MXM_INVALID_MEM_HANDLE; sreq.op.atomic.remote_vaddr = (uintptr_t) remote_addr; -#if MXM_API < MXM_VERSION(2,0) - sreq.base.flags = 0; - sreq.op.atomic.remote_memh = MXM_INVALID_MEM_HANDLE; -#else sreq.flags = 0; sreq.op.atomic.remote_mkey = to_mxm_mkey(r_mkey); -#endif sreq.op.atomic.order = nlong_order; if (NULL == cond) { sreq.opcode = MXM_REQ_OP_ATOMIC_SWAP; } else { -#if MXM_API < MXM_VERSION(2,0) - memcpy(&sreq.op.atomic.value8, cond, nlong); -#else memcpy(&sreq.op.atomic.value, cond, nlong); 
-#endif sreq.opcode = MXM_REQ_OP_ATOMIC_CSWAP; } diff --git a/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c b/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c index 2c2accd322b..8173b218828 100644 --- a/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c +++ b/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c @@ -84,13 +84,8 @@ int mca_atomic_mxm_fadd(void *target, sreq.base.data_type = MXM_REQ_DATA_BUFFER; sreq.op.atomic.remote_vaddr = (uintptr_t) remote_addr; -#if MXM_API < MXM_VERSION(2,0) - sreq.op.atomic.remote_memh = MXM_INVALID_MEM_HANDLE; - memcpy(&sreq.op.atomic.value8, value, nlong); -#else sreq.op.atomic.remote_mkey = to_mxm_mkey(r_mkey); memcpy(&sreq.op.atomic.value, value, nlong); -#endif sreq.op.atomic.order = nlong_order; /* Do we need atomic 'add' or atomic 'fetch and add'? */ @@ -98,22 +93,13 @@ int mca_atomic_mxm_fadd(void *target, sreq.base.data.buffer.ptr = dummy_buf; sreq.base.data.buffer.length = nlong; sreq.base.data.buffer.memh = MXM_INVALID_MEM_HANDLE; -#if MXM_API < MXM_VERSION(2,0) - sreq.base.flags = MXM_REQ_FLAG_SEND_SYNC; - sreq.opcode = MXM_REQ_OP_ATOMIC_ADD; -#else sreq.flags = 0; sreq.opcode = MXM_REQ_OP_ATOMIC_FADD; -#endif } else { sreq.base.data.buffer.ptr = prev; sreq.base.data.buffer.length = nlong; sreq.base.data.buffer.memh = MXM_INVALID_MEM_HANDLE; -#if MXM_API < MXM_VERSION(2,0) - sreq.base.flags = 0; -#else sreq.flags = 0; -#endif sreq.opcode = MXM_REQ_OP_ATOMIC_FADD; } diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.c b/oshmem/mca/spml/ikrit/spml_ikrit.c index 3e5ecc5313b..1303d894ee4 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit.c +++ b/oshmem/mca/spml/ikrit/spml_ikrit.c @@ -70,29 +70,6 @@ struct mca_spml_ikrit_put_request { typedef struct mca_spml_ikrit_put_request mca_spml_ikrit_put_request_t; OBJ_CLASS_DECLARATION(mca_spml_ikrit_put_request_t); -#if MXM_API < MXM_VERSION(2,0) -static int spml_ikrit_get_ep_address(spml_ikrit_mxm_ep_conn_info_t *ep_info, - mxm_ptl_id_t ptlid) -{ - size_t addrlen; - mxm_error_t err; - - addrlen = 
sizeof(ep_info->addr.ptl_addr[ptlid]); - err = mxm_ep_address(mca_spml_ikrit.mxm_ep, - ptlid, - (struct sockaddr *) &ep_info->addr.ptl_addr[ptlid], - &addrlen); - if (MXM_OK != err) { - orte_show_help("help-oshmem-spml-ikrit.txt", - "unable to get endpoint address", - true, - mxm_error_string(err)); - return OSHMEM_ERROR; - } - - return OSHMEM_SUCCESS; -} -#else static inline mxm_mem_key_t *to_mxm_mkey(sshmem_mkey_t *mkey) { if (0 == mkey->len) { @@ -100,8 +77,6 @@ static inline mxm_mem_key_t *to_mxm_mkey(sshmem_mkey_t *mkey) { } return (mxm_mem_key_t *)mkey->u.data; } -#endif - static inline void mca_spml_irkit_req_wait(mxm_req_base_t *req) { @@ -234,32 +209,6 @@ mca_spml_ikrit_t mca_spml_ikrit = { } }; -#if MXM_API < MXM_VERSION(2,0) -void mca_spml_ikrit_dump_stats(void); -void mca_spml_ikrit_dump_stats() -{ - int num_procs; - int i; - char sbuf[1024]; - FILE *fp; - - fp = fmemopen(sbuf, sizeof(sbuf), "rw"); - num_procs = oshmem_num_procs(); - for (i = 0; i < num_procs; i++) { - mxm_print_conn_state(mca_spml_ikrit.mxm_peers[i]->mxm_conn, - MXM_STATE_DETAIL_LEVEL_DATA, - "", - fp); - printf("=========== pe:%d conn:%p stats:\n %s==================\n", - i, - mca_spml_ikrit.mxm_peers[i]->mxm_conn, - sbuf); - rewind(fp); - } - fclose(fp); -} -#endif - static inline mca_spml_ikrit_put_request_t *alloc_put_req(void) { mca_spml_ikrit_put_request_t *req; @@ -341,12 +290,7 @@ static int create_ptl_idx(int dst_pe) return OSHMEM_ERROR; OSHMEM_PROC_DATA(proc)->num_transports = 1; -#if MXM_API < MXM_VERSION(2,0) - if (oshmem_my_proc_id() == dst_pe) - OSHMEM_PROC_DATA(proc)->transport_ids[0] = MXM_PTL_SELF; - else -#endif - OSHMEM_PROC_DATA(proc)->transport_ids[0] = MXM_PTL_RDMA; + OSHMEM_PROC_DATA(proc)->transport_ids[0] = MXM_PTL_RDMA; return OSHMEM_SUCCESS; } @@ -382,11 +326,9 @@ int mca_spml_ikrit_del_procs(ompi_proc_t** procs, size_t nprocs) int my_rank = oshmem_my_proc_id(); oshmem_shmem_barrier(); -#if MXM_API >= MXM_VERSION(2,0) if (mca_spml_ikrit.bulk_disconnect) { 
mxm_ep_powerdown(mca_spml_ikrit.mxm_ep); } -#endif while (NULL != opal_list_remove_first(&mca_spml_ikrit.active_peers)) { }; @@ -412,12 +354,7 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) spml_ikrit_mxm_ep_conn_info_t *ep_info = NULL; spml_ikrit_mxm_ep_conn_info_t *ep_hw_rdma_info = NULL; spml_ikrit_mxm_ep_conn_info_t my_ep_info = {{0}}; -#if MXM_API < MXM_VERSION(2,0) - mxm_conn_req_t *conn_reqs; - int timeout; -#else size_t mxm_addr_len = MXM_MAX_ADDR_LEN; -#endif mxm_error_t err; size_t i, n; int rc = OSHMEM_ERROR; @@ -426,14 +363,6 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) OBJ_CONSTRUCT(&mca_spml_ikrit.active_peers, opal_list_t); /* Allocate connection requests */ -#if MXM_API < MXM_VERSION(2,0) - conn_reqs = malloc(nprocs * sizeof(mxm_conn_req_t)); - if (NULL == conn_reqs) { - rc = OSHMEM_ERR_OUT_OF_RESOURCE; - goto bail; - } - memset(conn_reqs, 0x0, sizeof(mxm_conn_req_t)); -#endif ep_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs); if (NULL == ep_info) { rc = OSHMEM_ERR_OUT_OF_RESOURCE; @@ -455,18 +384,6 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) goto bail; } -#if MXM_API < MXM_VERSION(2,0) - if (OSHMEM_SUCCESS - != spml_ikrit_get_ep_address(&my_ep_info, MXM_PTL_SELF)) { - rc = OSHMEM_ERROR; - goto bail; - } - if (OSHMEM_SUCCESS - != spml_ikrit_get_ep_address(&my_ep_info, MXM_PTL_RDMA)) { - rc = OSHMEM_ERROR; - goto bail; - } -#else if (mca_spml_ikrit.hw_rdma_channel) { err = mxm_ep_get_address(mca_spml_ikrit.mxm_hw_rdma_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len); if (MXM_OK != err) { @@ -485,7 +402,7 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) rc = OSHMEM_ERROR; goto bail; } -#endif + oshmem_shmem_allgather(&my_ep_info, ep_info, sizeof(spml_ikrit_mxm_ep_conn_info_t)); @@ -504,13 +421,6 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) } mca_spml_ikrit.mxm_peers[i]->pe = i; -#if MXM_API < MXM_VERSION(2,0) - 
conn_reqs[i].ptl_addr[MXM_PTL_SELF] = - (struct sockaddr *) &ep_info[i].addr.ptl_addr[MXM_PTL_SELF]; - conn_reqs[i].ptl_addr[MXM_PTL_SHM] = NULL; - conn_reqs[i].ptl_addr[MXM_PTL_RDMA] = - (struct sockaddr *) &ep_info[i].addr.ptl_addr[MXM_PTL_RDMA]; -#else err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, ep_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i]->mxm_conn); if (MXM_OK != err) { SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); @@ -528,55 +438,18 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) } else { mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn = mca_spml_ikrit.mxm_peers[i]->mxm_conn; } -#endif } -#if MXM_API < MXM_VERSION(2,0) - /* Connect to remote peers */ - if (mxm_get_version() < MXM_VERSION(1,5)) { - timeout = 1000; - } else { - timeout = -1; - } - err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, conn_reqs, nprocs, timeout); - if (MXM_OK != err) { - SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); - for (i = 0; i < nprocs; ++i) { - if (MXM_OK != conn_reqs[i].error) { - SPML_ERROR("MXM EP connect to %s error: %s\n", - procs[i]->proc_hostname, mxm_error_string(conn_reqs[i].error)); - } - } - rc = OSHMEM_ERR_CONNECTION_FAILED; - goto bail; - } - - /* Save returned connections */ - for (i = 0; i < nprocs; ++i) { - mca_spml_ikrit.mxm_peers[i]->mxm_conn = conn_reqs[i].conn; - if (OSHMEM_SUCCESS != create_ptl_idx(i)) { - rc = OSHMEM_ERR_CONNECTION_FAILED; - goto bail; - } - - mxm_conn_ctx_set(conn_reqs[i].conn, mca_spml_ikrit.mxm_peers[i]); - } - - if (conn_reqs) - free(conn_reqs); -#endif if (ep_info) free(ep_info); if (ep_hw_rdma_info) free(ep_hw_rdma_info); -#if MXM_API >= MXM_VERSION(2,0) if (mca_spml_ikrit.bulk_connect) { /* Need a barrier to ensure remote peers already created connection */ oshmem_shmem_barrier(); mxm_ep_wireup(mca_spml_ikrit.mxm_ep); } -#endif proc_self = oshmem_proc_group_find(oshmem_group_all, my_rank); /* identify local processes and change transport to SHM */ @@ -598,10 
+471,6 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) return OSHMEM_SUCCESS; bail: -#if MXM_API < MXM_VERSION(2,0) - if (conn_reqs) - free(conn_reqs); -#endif if (ep_info) free(ep_info); if (ep_hw_rdma_info) @@ -619,10 +488,8 @@ sshmem_mkey_t *mca_spml_ikrit_register(void* addr, { int i; sshmem_mkey_t *mkeys; -#if MXM_API >= MXM_VERSION(2,0) mxm_error_t err; mxm_mem_key_t *m_key; -#endif *count = 0; mkeys = (sshmem_mkey_t *) calloc(1, MXM_PTL_LAST * sizeof(*mkeys)); @@ -643,19 +510,10 @@ sshmem_mkey_t *mca_spml_ikrit_register(void* addr, } mkeys[i].spml_context = 0; break; -#if MXM_API < MXM_VERSION(2,0) - case MXM_PTL_SELF: - mkeys[i].len = 0; - mkeys[i].spml_context = 0; - mkeys[i].va_base = addr; - break; -#endif case MXM_PTL_RDMA: mkeys[i].va_base = addr; mkeys[i].spml_context = 0; -#if MXM_API < MXM_VERSION(2,0) - mkeys[i].len = 0; -#else + if (mca_spml_ikrit.ud_only) { mkeys[i].len = 0; break; @@ -681,7 +539,6 @@ sshmem_mkey_t *mca_spml_ikrit_register(void* addr, SPML_ERROR("Failed to get memory key: %s", mxm_error_string(err)); goto error_out; } -#endif break; default: @@ -714,16 +571,12 @@ int mca_spml_ikrit_deregister(sshmem_mkey_t *mkeys) for (i = 0; i < MXM_PTL_LAST; i++) { switch (i) { -#if MXM_API < MXM_VERSION(2,0) - case MXM_PTL_SELF: -#endif case MXM_PTL_SHM: break; case MXM_PTL_RDMA: /* dereg memory */ if (!mkeys[i].spml_context) break; -#if MXM_API >= MXM_VERSION(2,0) mxm_mem_unmap(mca_spml_ikrit.mxm_context, (void *)mkeys[i].va_base, (unsigned long)mkeys[i].spml_context, @@ -731,7 +584,6 @@ int mca_spml_ikrit_deregister(sshmem_mkey_t *mkeys) if (0 < mkeys[i].len) { free(mkeys[i].u.data); } -#endif break; } } @@ -765,14 +617,6 @@ int mca_spml_ikrit_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) if (ptl != MXM_PTL_RDMA) return OSHMEM_ERROR; -#if MXM_API < MXM_VERSION(2,0) - if (seg > 1) - return OSHMEM_ERROR; - - mkeys[ptl].len = 0; - mkeys[ptl].u.key = MAP_SEGMENT_SHM_INVALID; - return OSHMEM_SUCCESS; -#else /* we 
are actually registering memory in 2.0 and later. * So can only skip mkey exchange when ud is the only transport */ @@ -783,7 +627,6 @@ int mca_spml_ikrit_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) } return OSHMEM_ERROR; -#endif } static int mca_spml_ikrit_get_helper(mxm_send_req_t *sreq, @@ -824,12 +667,7 @@ static int mca_spml_ikrit_get_helper(mxm_send_req_t *sreq, sreq->base.data_type = MXM_REQ_DATA_BUFFER; sreq->base.data.buffer.ptr = dst_addr; sreq->base.data.buffer.length = size; -#if MXM_API < MXM_VERSION(2,0) - sreq->base.data.buffer.memh = NULL; - sreq->op.mem.remote_memh = NULL; -#else sreq->op.mem.remote_mkey = to_mxm_mkey(r_mkey); -#endif sreq->opcode = MXM_REQ_OP_GET; sreq->op.mem.remote_vaddr = (intptr_t) rva; sreq->base.state = MXM_REQ_NEW; @@ -957,11 +795,7 @@ int mca_spml_ikrit_get_async(void *src_addr, return OSHMEM_ERROR; } -#if MXM_API < MXM_VERSION(2,0) - get_req->mxm_req.base.flags = 0; -#else get_req->mxm_req.flags = 0; -#endif get_req->mxm_req.base.completed_cb = get_completion_cb; get_req->mxm_req.base.context = get_req; OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_gets, 1); @@ -997,10 +831,6 @@ static int mca_spml_ikrit_mxm_fence(int dst) fence_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq; fence_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; -#if MXM_API < MXM_VERSION(2,0) - fence_req->mxm_req.opcode = MXM_REQ_OP_FENCE; - fence_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_SYNC; -#else fence_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC; fence_req->mxm_req.flags = MXM_REQ_SEND_FLAG_FENCE; fence_req->mxm_req.op.mem.remote_vaddr = 0; @@ -1008,7 +838,6 @@ static int mca_spml_ikrit_mxm_fence(int dst) fence_req->mxm_req.base.data_type = MXM_REQ_DATA_BUFFER; fence_req->mxm_req.base.data.buffer.ptr = 0; fence_req->mxm_req.base.data.buffer.length = 0; -#endif fence_req->mxm_req.base.state = MXM_REQ_NEW; fence_req->mxm_req.base.completed_cb = fence_completion_cb; fence_req->mxm_req.base.context = fence_req; @@ -1041,19 
+870,11 @@ static inline void put_completion_cb(void *ctx) if (0 < peer->n_active_puts) { peer->n_active_puts--; -#if MXM_API < MXM_VERSION(2,0) - if (0 == peer->n_active_puts && - (put_req->mxm_req.base.flags & MXM_REQ_FLAG_SEND_SYNC)) { - opal_list_remove_item(&mca_spml_ikrit.active_peers, &peer->super); - peer->need_fence = 0; - } -#else if (0 == peer->n_active_puts && (put_req->mxm_req.opcode == MXM_REQ_OP_PUT_SYNC)) { opal_list_remove_item(&mca_spml_ikrit.active_peers, &peer->super); peer->need_fence = 0; } -#endif } put_req->req_put.req_base.req_spml_complete = true; @@ -1137,7 +958,7 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, put_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq; /* request immediate responce if we are getting low on send buffers. We only get responce from remote on ack timeout. * Also request explicit ack once in a while */ -#if MXM_API < MXM_VERSION(2,0) +#if 0 put_req->mxm_req.opcode = MXM_REQ_OP_PUT; if (mca_spml_ikrit.free_list_max - mca_spml_ikrit.n_active_puts <= SPML_IKRIT_PUT_LOW_WATER || (mca_spml_ikrit.mxm_peers[dst]->n_active_puts + 1) % SPML_IKRIT_PACKETS_PER_SYNC == 0) { @@ -1146,7 +967,7 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, } else { put_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_LAZY|MXM_REQ_FLAG_SEND_SYNC; } -#else +#endif put_req->mxm_req.flags = 0; if (mca_spml_ikrit.free_list_max - mca_spml_ikrit.n_active_puts <= SPML_IKRIT_PUT_LOW_WATER || (int)opal_list_get_size(&mca_spml_ikrit.active_peers) > mca_spml_ikrit.unsync_conn_max || @@ -1163,7 +984,6 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, put_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC; } } -#endif put_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; put_req->mxm_req.base.data_type = MXM_REQ_DATA_BUFFER; @@ -1175,12 +995,7 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, put_req->mxm_req.base.state = MXM_REQ_NEW; put_req->pe = dst; -#if MXM_API < MXM_VERSION(2,0) - 
put_req->mxm_req.base.data.buffer.memh = NULL; - put_req->mxm_req.op.mem.remote_memh = NULL; -#else put_req->mxm_req.op.mem.remote_mkey = to_mxm_mkey(r_mkey); -#endif OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, 1); if (mca_spml_ikrit.mxm_peers[dst]->need_fence == 0) { @@ -1262,11 +1077,7 @@ int mca_spml_ikrit_put_simple(void* dst_addr, /* fill out request */ mxm_req.base.mq = mca_spml_ikrit.mxm_mq; -#if MXM_API < MXM_VERSION(2,0) - mxm_req.base.flags = MXM_REQ_FLAG_BLOCKING; -#else mxm_req.flags = MXM_REQ_SEND_FLAG_BLOCKING; -#endif mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; mxm_req.base.data_type = MXM_REQ_DATA_BUFFER; mxm_req.base.data.buffer.ptr = src_addr; @@ -1278,12 +1089,7 @@ int mca_spml_ikrit_put_simple(void* dst_addr, mxm_req.base.state = MXM_REQ_NEW; mxm_req.base.error = MXM_OK; -#if MXM_API < MXM_VERSION(2, 0) - mxm_req.base.data.buffer.memh = NULL; - mxm_req.op.mem.remote_memh = NULL; -#else mxm_req.op.mem.remote_mkey = to_mxm_mkey(r_mkey); -#endif if (mca_spml_ikrit.mxm_peers[dst]->need_fence == 0) { opal_list_append(&mca_spml_ikrit.active_peers, @@ -1371,6 +1177,7 @@ int mca_spml_ikrit_fence(void) oshmem_request_wait_any_completion(); } + SPML_VERBOSE(20, "fence completed"); return OSHMEM_SUCCESS; } @@ -1392,9 +1199,6 @@ int mca_spml_ikrit_recv(void* buf, size_t size, int src) req.base.state = MXM_REQ_NEW; req.base.mq = mca_spml_ikrit.mxm_mq; req.base.conn = NULL; -#if MXM_API < MXM_VERSION(2,0) - req.base.flags = MXM_REQ_FLAG_BLOCKING; -#endif req.base.completed_cb = NULL; req.base.data_type = MXM_REQ_DATA_BUFFER; @@ -1436,11 +1240,7 @@ int mca_spml_ikrit_send(void* buf, req.base.state = MXM_REQ_NEW; req.base.mq = mca_spml_ikrit.mxm_mq; req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; -#if MXM_API < MXM_VERSION(2,0) - req.base.flags = MXM_REQ_FLAG_BLOCKING; -#else - req.flags = MXM_REQ_SEND_FLAG_BLOCKING; -#endif + req.flags = MXM_REQ_SEND_FLAG_BLOCKING; req.base.completed_cb = NULL; req.base.data_type = 
MXM_REQ_DATA_BUFFER; diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.h b/oshmem/mca/spml/ikrit/spml_ikrit.h index 45117b500d7..33d7333b31d 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit.h +++ b/oshmem/mca/spml/ikrit/spml_ikrit.h @@ -40,11 +40,6 @@ #define MXM_VERSION(major, minor) (((major)< -#include -#endif - #define MXM_SHMEM_MQ_ID 0x7119 /* start request explicit ack once our buffer pool is less than watermark */ @@ -103,9 +98,7 @@ struct mca_spml_ikrit_t { int hw_rdma_channel; /* true if we provide separate channel that has true one sided capability */ int np; -#if MXM_API >= MXM_VERSION(2,0) int unsync_conn_max; -#endif size_t put_zcopy_threshold; /* enable zcopy in put if message size is greater than the threshold */ }; @@ -114,11 +107,9 @@ typedef struct mca_spml_ikrit_t mca_spml_ikrit_t; #define MXM_MAX_ADDR_LEN 512 -#if MXM_API >= MXM_VERSION(2,0) #define MXM_PTL_SHM 0 #define MXM_PTL_RDMA 1 #define MXM_PTL_LAST 2 -#endif typedef struct spml_ikrit_mxm_ep_conn_info_t { union { diff --git a/oshmem/mca/spml/ikrit/spml_ikrit_component.c b/oshmem/mca/spml/ikrit/spml_ikrit_component.c index e698cfa8d82..45cba8eb4b4 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit_component.c +++ b/oshmem/mca/spml/ikrit/spml_ikrit_component.c @@ -233,17 +233,11 @@ static int mca_spml_ikrit_component_register(void) &mca_spml_ikrit.mxm_tls); mca_spml_ikrit_param_register_int("np", -#if MXM_API <= MXM_VERSION(2,0) - 128, -#else - 0, -#endif - "[integer] Minimal allowed job's NP to activate ikrit", &mca_spml_ikrit.np); -#if MXM_API >= MXM_VERSION(2,0) + 0, + "[integer] Minimal allowed job's NP to activate ikrit", &mca_spml_ikrit.np); mca_spml_ikrit_param_register_int("unsync_conn_max", 8, "[integer] Max number of connections that do not require notification of PUT operation remote completion. 
Increasing this number improves efficiency of p2p communication but increases overhead of shmem_fence/shmem_quiet/shmem_barrier", &mca_spml_ikrit.unsync_conn_max); -#endif mca_spml_ikrit_param_register_size_t("put_zcopy_threshold", 16384ULL, "[size_t] Use zero copy put if message size is greater than the threshold", @@ -312,10 +306,6 @@ static int mca_spml_ikrit_component_open(void) return OSHMEM_ERROR; } -#if MXM_API < MXM_VERSION(2,0) - mca_spml_ikrit.ud_only = 1; - mca_spml_ikrit.mxm_ctx_opts->ptl_bitmap = (MXM_BIT(MXM_PTL_SELF) | MXM_BIT(MXM_PTL_RDMA)); -#endif SPML_VERBOSE(5, "UD only mode is %s", mca_spml_ikrit.ud_only ? "enabled" : "disabled"); @@ -354,15 +344,10 @@ static int mca_spml_ikrit_component_close(void) } if (mca_spml_ikrit.mxm_context) { mxm_cleanup(mca_spml_ikrit.mxm_context); -#if MXM_API < MXM_VERSION(2,0) - mxm_config_free(mca_spml_ikrit.mxm_ep_opts); - mxm_config_free(mca_spml_ikrit.mxm_ctx_opts); -#else mxm_config_free_ep_opts(mca_spml_ikrit.mxm_ep_opts); mxm_config_free_context_opts(mca_spml_ikrit.mxm_ctx_opts); if (mca_spml_ikrit.hw_rdma_channel) mxm_config_free_ep_opts(mca_spml_ikrit.mxm_ep_hw_rdma_opts); -#endif } mca_spml_ikrit.mxm_mq = NULL; mca_spml_ikrit.mxm_context = NULL; @@ -373,14 +358,6 @@ static int spml_ikrit_mxm_init(void) { mxm_error_t err; -#if MXM_API < MXM_VERSION(2,0) - /* Only relevant for SHM PTL - ignore */ - mca_spml_ikrit.mxm_ep_opts->job_id = 0; - mca_spml_ikrit.mxm_ep_opts->local_rank = 0; - mca_spml_ikrit.mxm_ep_opts->num_local_procs = 0; - mca_spml_ikrit.mxm_ep_opts->rdma.drain_cq = 1; -#endif - /* Open MXM endpoint */ err = mxm_ep_create(mca_spml_ikrit.mxm_context, mca_spml_ikrit.mxm_ep_opts, From 0826e63363051b830481b45603eb01e5d0d23bd9 Mon Sep 17 00:00:00 2001 From: Alex Mikheev Date: Tue, 25 Oct 2016 18:20:58 +0300 Subject: [PATCH 02/14] OSHMEM: spml_ikrit: makes quiet wait for get_nbi requests shmem_quiet() shall complete all outstanding get_nbi() requests Signed-off-by: Alex Mikheev --- 
oshmem/mca/spml/ikrit/spml_ikrit.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.c b/oshmem/mca/spml/ikrit/spml_ikrit.c index 1303d894ee4..aa77eb7b449 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit.c +++ b/oshmem/mca/spml/ikrit/spml_ikrit.c @@ -1177,6 +1177,9 @@ int mca_spml_ikrit_fence(void) oshmem_request_wait_any_completion(); } + while (0 < mca_spml_ikrit.n_active_gets) { + oshmem_request_wait_any_completion(); + } SPML_VERBOSE(20, "fence completed"); return OSHMEM_SUCCESS; From b5c7c7de7835d7d3d81f6168be6b1d9af409e363 Mon Sep 17 00:00:00 2001 From: Alex Mikheev Date: Wed, 26 Oct 2016 15:10:29 +0300 Subject: [PATCH 03/14] OSHMEM: memheap: disable oob if allgather mkey exchange is used In this case there is no point to add another progress callback Signed-off-by: Alex Mikheev --- oshmem/mca/memheap/base/memheap_base_mkey.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/oshmem/mca/memheap/base/memheap_base_mkey.c b/oshmem/mca/memheap/base/memheap_base_mkey.c index 5e230f3aef2..09fe07965de 100644 --- a/oshmem/mca/memheap/base/memheap_base_mkey.c +++ b/oshmem/mca/memheap/base/memheap_base_mkey.c @@ -53,6 +53,7 @@ struct oob_comm { int mkeys_rcvd; oob_comm_request_t req_pool[MEMHEAP_RECV_REQS_MAX]; opal_list_t req_list; + int is_inited; }; mca_memheap_map_t* memheap_map = NULL; @@ -435,6 +436,7 @@ int memheap_oob_init(mca_memheap_map_t *map) } opal_progress_register(oshmem_mkey_recv_cb); + memheap_oob.is_inited = 1; return rc; } @@ -444,6 +446,10 @@ void memheap_oob_destruct(void) int i; oob_comm_request_t *r; + if (!memheap_oob.is_inited) { + return; + } + opal_progress_unregister(oshmem_mkey_recv_cb); for (i = 0; i < MEMHEAP_RECV_REQS_MAX; i++) { @@ -455,6 +461,7 @@ void memheap_oob_destruct(void) OBJ_DESTRUCT(&memheap_oob.req_list); OBJ_DESTRUCT(&memheap_oob.lck); OBJ_DESTRUCT(&memheap_oob.cond); + memheap_oob.is_inited = 0; } static int send_buffer(int pe, opal_buffer_t *msg) @@ -699,6 +706,10 @@ 
sshmem_mkey_t * mca_memheap_base_get_cached_mkey_slow(map_segment_t *s, int rc; sshmem_mkey_t *mkey; + if (!memheap_oob.is_inited) { + return NULL; + } + s->mkeys_cache[pe] = (sshmem_mkey_t *) calloc(memheap_map->num_transports, sizeof(sshmem_mkey_t)); if (!s->mkeys_cache[pe]) From df74d549dcbcc7f49566e801570d0db38a51b2d7 Mon Sep 17 00:00:00 2001 From: Alex Mikheev Date: Wed, 26 Oct 2016 17:14:59 +0300 Subject: [PATCH 04/14] OSHMEM: spml ikrit: changes mxm_peers layout use single array instead of array of pointers Signed-off-by: Alex Mikheev --- oshmem/mca/atomic/mxm/atomic_mxm_cswap.c | 2 +- oshmem/mca/atomic/mxm/atomic_mxm_fadd.c | 2 +- oshmem/mca/spml/ikrit/spml_ikrit.c | 56 +++++++++++------------- oshmem/mca/spml/ikrit/spml_ikrit.h | 6 +-- 4 files changed, 31 insertions(+), 35 deletions(-) diff --git a/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c b/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c index 70a6f96248c..4b3c4896bbd 100644 --- a/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c +++ b/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c @@ -77,7 +77,7 @@ int mca_atomic_mxm_cswap(void *target, /* mxm request init */ sreq.base.state = MXM_REQ_NEW; sreq.base.mq = mca_atomic_mxm_spml_self->mxm_mq; - sreq.base.conn = mca_atomic_mxm_spml_self->mxm_peers[pe]->mxm_hw_rdma_conn; + sreq.base.conn = mca_atomic_mxm_spml_self->mxm_peers[pe].mxm_hw_rdma_conn; sreq.base.completed_cb = NULL; sreq.base.data_type = MXM_REQ_DATA_BUFFER; diff --git a/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c b/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c index 8173b218828..65206d8f69c 100644 --- a/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c +++ b/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c @@ -79,7 +79,7 @@ int mca_atomic_mxm_fadd(void *target, /* mxm request init */ sreq.base.state = MXM_REQ_NEW; sreq.base.mq = mca_atomic_mxm_spml_self->mxm_mq; - sreq.base.conn = mca_atomic_mxm_spml_self->mxm_peers[pe]->mxm_hw_rdma_conn; + sreq.base.conn = mca_atomic_mxm_spml_self->mxm_peers[pe].mxm_hw_rdma_conn; sreq.base.completed_cb = NULL; 
sreq.base.data_type = MXM_REQ_DATA_BUFFER; diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.c b/oshmem/mca/spml/ikrit/spml_ikrit.c index aa77eb7b449..b5928c7cfd7 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit.c +++ b/oshmem/mca/spml/ikrit/spml_ikrit.c @@ -336,13 +336,13 @@ int mca_spml_ikrit_del_procs(ompi_proc_t** procs, size_t nprocs) for (n = 0; n < nprocs; n++) { i = (my_rank + n) % nprocs; - mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i]->mxm_conn); + mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i].mxm_conn); if (mca_spml_ikrit.hw_rdma_channel) { - assert(mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn != mca_spml_ikrit.mxm_peers[i]->mxm_conn); - mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn); + assert(mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn != mca_spml_ikrit.mxm_peers[i].mxm_conn); + mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn); } destroy_ptl_idx(i); - OBJ_RELEASE(mca_spml_ikrit.mxm_peers[i]); + OBJ_DESTRUCT(&mca_spml_ikrit.mxm_peers[i]); } free(mca_spml_ikrit.mxm_peers); @@ -377,8 +377,7 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) } } - mca_spml_ikrit.mxm_peers = (mxm_peer_t **) malloc(nprocs - * sizeof(*(mca_spml_ikrit.mxm_peers))); + mca_spml_ikrit.mxm_peers = (mxm_peer_t *) malloc(nprocs * sizeof(mxm_peer_t)); if (NULL == mca_spml_ikrit.mxm_peers) { rc = OSHMEM_ERR_OUT_OF_RESOURCE; goto bail; @@ -414,29 +413,26 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) /* mxm 2.0 keeps its connections on a list. 
Make sure * that list have different order on every rank */ i = (my_rank + n) % nprocs; - mca_spml_ikrit.mxm_peers[i] = OBJ_NEW(mxm_peer_t); - if (NULL == mca_spml_ikrit.mxm_peers[i]) { - rc = OSHMEM_ERR_OUT_OF_RESOURCE; - goto bail; - } - mca_spml_ikrit.mxm_peers[i]->pe = i; + OBJ_CONSTRUCT(&mca_spml_ikrit.mxm_peers[i], mxm_peer_t); - err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, ep_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i]->mxm_conn); + mca_spml_ikrit.mxm_peers[i].pe = i; + + err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, ep_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i].mxm_conn); if (MXM_OK != err) { SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); goto bail; } if (OSHMEM_SUCCESS != create_ptl_idx(i)) goto bail; - mxm_conn_ctx_set(mca_spml_ikrit.mxm_peers[i]->mxm_conn, mca_spml_ikrit.mxm_peers[i]); + mxm_conn_ctx_set(mca_spml_ikrit.mxm_peers[i].mxm_conn, &mca_spml_ikrit.mxm_peers[i]); if (mca_spml_ikrit.hw_rdma_channel) { - err = mxm_ep_connect(mca_spml_ikrit.mxm_hw_rdma_ep, ep_hw_rdma_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn); + err = mxm_ep_connect(mca_spml_ikrit.mxm_hw_rdma_ep, ep_hw_rdma_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn); if (MXM_OK != err) { SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); goto bail; } } else { - mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn = mca_spml_ikrit.mxm_peers[i]->mxm_conn; + mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn = mca_spml_ikrit.mxm_peers[i].mxm_conn; } } @@ -663,7 +659,7 @@ static int mca_spml_ikrit_get_helper(mxm_send_req_t *sreq, /* mxm does not really cares for get lkey */ sreq->base.mq = mca_spml_ikrit.mxm_mq; - sreq->base.conn = mca_spml_ikrit.mxm_peers[src]->mxm_conn; + sreq->base.conn = mca_spml_ikrit.mxm_peers[src].mxm_conn; sreq->base.data_type = MXM_REQ_DATA_BUFFER; sreq->base.data.buffer.ptr = dst_addr; sreq->base.data.buffer.length = size; @@ -830,7 +826,7 @@ static int mca_spml_ikrit_mxm_fence(int 
dst) } fence_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq; - fence_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; + fence_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn; fence_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC; fence_req->mxm_req.flags = MXM_REQ_SEND_FLAG_FENCE; fence_req->mxm_req.op.mem.remote_vaddr = 0; @@ -853,7 +849,7 @@ static inline void put_completion_cb(void *ctx) mxm_peer_t *peer; OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, -1); - peer = mca_spml_ikrit.mxm_peers[put_req->pe]; + peer = &mca_spml_ikrit.mxm_peers[put_req->pe]; /* this was last put in progress. Remove peer from the list so that we do not need explicit fence */ #if SPML_IKRIT_PUT_DEBUG == 1 @@ -971,7 +967,7 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, put_req->mxm_req.flags = 0; if (mca_spml_ikrit.free_list_max - mca_spml_ikrit.n_active_puts <= SPML_IKRIT_PUT_LOW_WATER || (int)opal_list_get_size(&mca_spml_ikrit.active_peers) > mca_spml_ikrit.unsync_conn_max || - (mca_spml_ikrit.mxm_peers[dst]->n_active_puts + 1) % SPML_IKRIT_PACKETS_PER_SYNC == 0) { + (mca_spml_ikrit.mxm_peers[dst].n_active_puts + 1) % SPML_IKRIT_PACKETS_PER_SYNC == 0) { need_progress = 1; put_req->mxm_req.opcode = MXM_REQ_OP_PUT_SYNC; } else { @@ -985,7 +981,7 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, } } - put_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; + put_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn; put_req->mxm_req.base.data_type = MXM_REQ_DATA_BUFFER; put_req->mxm_req.base.data.buffer.ptr = src_addr; put_req->mxm_req.base.data.buffer.length = size; @@ -998,13 +994,13 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, put_req->mxm_req.op.mem.remote_mkey = to_mxm_mkey(r_mkey); OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, 1); - if (mca_spml_ikrit.mxm_peers[dst]->need_fence == 0) { + if (mca_spml_ikrit.mxm_peers[dst].need_fence == 0) { 
opal_list_append(&mca_spml_ikrit.active_peers, - &mca_spml_ikrit.mxm_peers[dst]->super); - mca_spml_ikrit.mxm_peers[dst]->need_fence = 1; + &mca_spml_ikrit.mxm_peers[dst].super); + mca_spml_ikrit.mxm_peers[dst].need_fence = 1; } - mca_spml_ikrit.mxm_peers[dst]->n_active_puts++; + mca_spml_ikrit.mxm_peers[dst].n_active_puts++; SPML_IKRIT_MXM_POST_SEND(put_req->mxm_req); @@ -1078,7 +1074,7 @@ int mca_spml_ikrit_put_simple(void* dst_addr, /* fill out request */ mxm_req.base.mq = mca_spml_ikrit.mxm_mq; mxm_req.flags = MXM_REQ_SEND_FLAG_BLOCKING; - mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; + mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn; mxm_req.base.data_type = MXM_REQ_DATA_BUFFER; mxm_req.base.data.buffer.ptr = src_addr; mxm_req.base.data.buffer.length = size; @@ -1091,10 +1087,10 @@ int mca_spml_ikrit_put_simple(void* dst_addr, mxm_req.op.mem.remote_mkey = to_mxm_mkey(r_mkey); - if (mca_spml_ikrit.mxm_peers[dst]->need_fence == 0) { + if (mca_spml_ikrit.mxm_peers[dst].need_fence == 0) { opal_list_append(&mca_spml_ikrit.active_peers, - &mca_spml_ikrit.mxm_peers[dst]->super); - mca_spml_ikrit.mxm_peers[dst]->need_fence = 1; + &mca_spml_ikrit.mxm_peers[dst].super); + mca_spml_ikrit.mxm_peers[dst].need_fence = 1; } SPML_IKRIT_MXM_POST_SEND(mxm_req); @@ -1242,7 +1238,7 @@ int mca_spml_ikrit_send(void* buf, req.base.state = MXM_REQ_NEW; req.base.mq = mca_spml_ikrit.mxm_mq; - req.base.conn = mca_spml_ikrit.mxm_peers[dst]->mxm_conn; + req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn; req.flags = MXM_REQ_SEND_FLAG_BLOCKING; req.base.completed_cb = NULL; diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.h b/oshmem/mca/spml/ikrit/spml_ikrit.h index 33d7333b31d..f4efed4e8ad 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit.h +++ b/oshmem/mca/spml/ikrit/spml_ikrit.h @@ -74,7 +74,7 @@ struct mca_spml_ikrit_t { mxm_ep_h mxm_ep; mxm_ep_h mxm_hw_rdma_ep; mxm_mq_h mxm_mq; - mxm_peer_t **mxm_peers; + mxm_peer_t *mxm_peers; int32_t n_active_puts; int32_t 
n_active_gets; @@ -107,8 +107,8 @@ typedef struct mca_spml_ikrit_t mca_spml_ikrit_t; #define MXM_MAX_ADDR_LEN 512 -#define MXM_PTL_SHM 0 -#define MXM_PTL_RDMA 1 +#define MXM_PTL_RDMA 0 +#define MXM_PTL_SHM 1 #define MXM_PTL_LAST 2 typedef struct spml_ikrit_mxm_ep_conn_info_t { From 23c3dc83459b14156315732b02675a74afd947bf Mon Sep 17 00:00:00 2001 From: Alex Mikheev Date: Thu, 27 Oct 2016 14:00:46 +0300 Subject: [PATCH 05/14] OSHMEM: mxm: optimize mxm_peer layout. Signed-off-by: Alex Mikheev --- oshmem/mca/atomic/mxm/atomic_mxm_cswap.c | 8 +- oshmem/mca/atomic/mxm/atomic_mxm_fadd.c | 8 +- oshmem/mca/spml/base/base.h | 2 + oshmem/mca/spml/ikrit/spml_ikrit.c | 156 ++++++++--------------- oshmem/mca/spml/ikrit/spml_ikrit.h | 27 ++-- 5 files changed, 71 insertions(+), 130 deletions(-) diff --git a/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c b/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c index 4b3c4896bbd..e75dc3a8f5b 100644 --- a/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c +++ b/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c @@ -34,13 +34,11 @@ int mca_atomic_mxm_cswap(void *target, unsigned my_pe; uint8_t nlong_order; void *remote_addr; - int ptl_id; mxm_send_req_t sreq; mxm_error_t mxm_err; sshmem_mkey_t *r_mkey; my_pe = oshmem_my_proc_id(); - ptl_id = -1; mxm_err = MXM_OK; switch (nlong) { @@ -62,11 +60,7 @@ int mca_atomic_mxm_cswap(void *target, return OSHMEM_ERR_BAD_PARAM; } - ptl_id = OSHMEM_PROC_DATA(oshmem_proc_group_all(pe))->transport_ids[0]; - if (MXM_PTL_SHM == ptl_id) { - ptl_id = MXM_PTL_RDMA; - } - r_mkey = mca_memheap_base_get_cached_mkey(pe, target, ptl_id, &remote_addr); + r_mkey = mca_memheap_base_get_cached_mkey(pe, target, MXM_PTL_RDMA, &remote_addr); if (!r_mkey) { ATOMIC_ERROR("[#%d] %p is not address of symmetric variable", my_pe, target); diff --git a/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c b/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c index 65206d8f69c..afde70dcadd 100644 --- a/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c +++ b/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c 
@@ -35,14 +35,12 @@ int mca_atomic_mxm_fadd(void *target, unsigned my_pe; uint8_t nlong_order; void *remote_addr; - int ptl_id; mxm_send_req_t sreq; mxm_error_t mxm_err; sshmem_mkey_t *r_mkey; static char dummy_buf[8]; my_pe = oshmem_my_proc_id(); - ptl_id = -1; mxm_err = MXM_OK; switch (nlong) { @@ -64,11 +62,7 @@ int mca_atomic_mxm_fadd(void *target, return OSHMEM_ERR_BAD_PARAM; } - ptl_id = OSHMEM_PROC_DATA(oshmem_proc_group_all(pe))->transport_ids[0]; - if (MXM_PTL_SHM == ptl_id) { - ptl_id = MXM_PTL_RDMA; - } - r_mkey = mca_memheap_base_get_cached_mkey(pe, target, ptl_id, &remote_addr); + r_mkey = mca_memheap_base_get_cached_mkey(pe, target, MXM_PTL_RDMA, &remote_addr); if (!r_mkey) { ATOMIC_ERROR("[#%d] %p is not address of symmetric variable", my_pe, target); diff --git a/oshmem/mca/spml/base/base.h b/oshmem/mca/spml/base/base.h index af2ad32f432..b53894b21c7 100644 --- a/oshmem/mca/spml/base/base.h +++ b/oshmem/mca/spml/base/base.h @@ -104,6 +104,8 @@ OSHMEM_DECLSPEC extern mca_base_framework_t oshmem_spml_base_framework; #define SPML_VERBOSE(level, ...) #endif +#define SPML_VERBOSE_FASTPATH(level, ...) + #define SPML_ERROR(...) 
\ oshmem_output(oshmem_spml_base_framework.framework_output, \ "Error %s:%d - %s()", __SPML_FILE__, __LINE__, __func__, __VA_ARGS__) diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.c b/oshmem/mca/spml/ikrit/spml_ikrit.c index b5928c7cfd7..638eb2b9653 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit.c +++ b/oshmem/mca/spml/ikrit/spml_ikrit.c @@ -55,6 +55,11 @@ do { \ } \ } while(0) +static int mca_spml_ikrit_get_async(void *src_addr, + size_t size, + void *dst_addr, + int src); + typedef struct spml_ikrit_am_hdr { uint64_t va; } spml_ikrit_am_hdr_t; @@ -279,47 +284,20 @@ int mca_spml_ikrit_enable(bool enable) return OSHMEM_SUCCESS; } -static int create_ptl_idx(int dst_pe) -{ - ompi_proc_t *proc; - - proc = oshmem_proc_group_find(oshmem_group_all, dst_pe); - - OSHMEM_PROC_DATA(proc)->transport_ids = (char *) malloc(MXM_PTL_LAST * sizeof(char)); - if (NULL == OSHMEM_PROC_DATA(proc)->transport_ids) - return OSHMEM_ERROR; - - OSHMEM_PROC_DATA(proc)->num_transports = 1; - OSHMEM_PROC_DATA(proc)->transport_ids[0] = MXM_PTL_RDMA; - return OSHMEM_SUCCESS; -} - -static void destroy_ptl_idx(int dst_pe) -{ - ompi_proc_t *proc; - - proc = oshmem_proc_group_find(oshmem_group_all, dst_pe); - if (NULL != OSHMEM_PROC_DATA(proc)->transport_ids) - free(OSHMEM_PROC_DATA(proc)->transport_ids); -} - static void mxm_peer_construct(mxm_peer_t *p) { - p->pe = -1; + p->pe = -1; p->n_active_puts = 0; - p->need_fence = 0; + p->need_fence = 0; + p->ptl_id = MXM_PTL_RDMA; + OBJ_CONSTRUCT(&p->link, opal_list_item_t); } static void mxm_peer_destruct(mxm_peer_t *p) { - /* may be we need to remov item from list */ + OBJ_DESTRUCT(&p->link); } -OBJ_CLASS_INSTANCE( mxm_peer_t, - opal_list_item_t, - mxm_peer_construct, - mxm_peer_destruct); - int mca_spml_ikrit_del_procs(ompi_proc_t** procs, size_t nprocs) { size_t i, n; @@ -341,8 +319,7 @@ int mca_spml_ikrit_del_procs(ompi_proc_t** procs, size_t nprocs) assert(mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn != mca_spml_ikrit.mxm_peers[i].mxm_conn); 
mxm_ep_disconnect(mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn); } - destroy_ptl_idx(i); - OBJ_DESTRUCT(&mca_spml_ikrit.mxm_peers[i]); + mxm_peer_destruct(&mca_spml_ikrit.mxm_peers[i]); } free(mca_spml_ikrit.mxm_peers); @@ -377,7 +354,7 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) } } - mca_spml_ikrit.mxm_peers = (mxm_peer_t *) malloc(nprocs * sizeof(mxm_peer_t)); + mca_spml_ikrit.mxm_peers = (mxm_peer_t *) calloc(nprocs , sizeof(mxm_peer_t)); if (NULL == mca_spml_ikrit.mxm_peers) { rc = OSHMEM_ERR_OUT_OF_RESOURCE; goto bail; @@ -413,8 +390,7 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) /* mxm 2.0 keeps its connections on a list. Make sure * that list have different order on every rank */ i = (my_rank + n) % nprocs; - OBJ_CONSTRUCT(&mca_spml_ikrit.mxm_peers[i], mxm_peer_t); - + mxm_peer_construct(&mca_spml_ikrit.mxm_peers[i]); mca_spml_ikrit.mxm_peers[i].pe = i; err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, ep_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i].mxm_conn); @@ -422,8 +398,6 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); goto bail; } - if (OSHMEM_SUCCESS != create_ptl_idx(i)) - goto bail; mxm_conn_ctx_set(mca_spml_ikrit.mxm_peers[i].mxm_conn, &mca_spml_ikrit.mxm_peers[i]); if (mca_spml_ikrit.hw_rdma_channel) { err = mxm_ep_connect(mca_spml_ikrit.mxm_hw_rdma_ep, ep_hw_rdma_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn); @@ -457,10 +431,8 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) if (procs[i] == proc_self) continue; - /* use zcopy for put/get via sysv shared memory */ - OSHMEM_PROC_DATA(procs[i])->transport_ids[0] = MXM_PTL_SHM; - OSHMEM_PROC_DATA(procs[i])->transport_ids[1] = MXM_PTL_RDMA; - OSHMEM_PROC_DATA(procs[i])->num_transports = 2; + /* use zcopy for put/get via sysv shared memory with fallback to RDMA */ + mca_spml_ikrit.mxm_peers[i].ptl_id = MXM_PTL_SHM; } 
SPML_VERBOSE(50, "*** ADDED PROCS ***"); @@ -591,16 +563,7 @@ int mca_spml_ikrit_deregister(sshmem_mkey_t *mkeys) static inline int get_ptl_id(int dst) { - ompi_proc_t *proc; - - /* get endpoint and btl */ - proc = oshmem_proc_group_all(dst); - if (!proc) { - SPML_ERROR("Can not find destination proc for pe=%d", dst); - oshmem_shmem_abort(-1); - return -1; - } - return OSHMEM_PROC_DATA(proc)->transport_ids[0]; + return mca_spml_ikrit.mxm_peers[dst].ptl_id; } int mca_spml_ikrit_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) @@ -625,27 +588,22 @@ int mca_spml_ikrit_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) return OSHMEM_ERROR; } -static int mca_spml_ikrit_get_helper(mxm_send_req_t *sreq, - void *src_addr, - size_t size, - void *dst_addr, - int src) +static inline int mca_spml_ikrit_get_helper(mxm_send_req_t *sreq, + void *src_addr, + size_t size, + void *dst_addr, + int src) { /* shmem spec states that get() operations are blocking. So it is enough to have single mxm request. Also we count on mxm doing copy */ void *rva; sshmem_mkey_t *r_mkey; - int ptl_id; - ptl_id = get_ptl_id(src); /* already tried to send via shm and failed. go via rdma */ - if (ptl_id == MXM_PTL_SHM) - ptl_id = MXM_PTL_RDMA; - /** * Get the address to the remote rkey. **/ - r_mkey = mca_memheap_base_get_cached_mkey(src, src_addr, ptl_id, &rva); + r_mkey = mca_memheap_base_get_cached_mkey(src, src_addr, MXM_PTL_RDMA, &rva); if (!r_mkey) { SPML_ERROR("pe=%d: %p is not address of shared variable", src, src_addr); @@ -653,9 +611,10 @@ static int mca_spml_ikrit_get_helper(mxm_send_req_t *sreq, return OSHMEM_ERROR; } - SPML_VERBOSE(100, - "get: pe:%d ptl=%d src=%p -> dst: %p sz=%d. src_rva=%p, %s", - src, ptl_id, src_addr, dst_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); + SPML_VERBOSE_FASTPATH(100, + "get: pe:%d ptl=%d src=%p -> dst: %p sz=%d. 
src_rva=%p, %s", + src, MXM_PTL_RDMA, src_addr, dst_addr, (int)size, (void *)rva, + mca_spml_base_mkey2str(r_mkey)); /* mxm does not really cares for get lkey */ sreq->base.mq = mca_spml_ikrit.mxm_mq; @@ -698,9 +657,10 @@ static inline int mca_spml_ikrit_get_shm(void *src_addr, if (!mca_memheap_base_can_local_copy(r_mkey, src_addr)) return OSHMEM_ERROR; - SPML_VERBOSE(100, - "shm get: pe:%d src=%p -> dst: %p sz=%d. src_rva=%p, %s", - src, src_addr, dst_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); + SPML_VERBOSE_FASTPATH(100, + "shm get: pe:%d src=%p -> dst: %p sz=%d. src_rva=%p, %s", + src, src_addr, dst_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); + memcpy(dst_addr, (void *) (unsigned long) rva, size); opal_progress(); return OSHMEM_SUCCESS; @@ -763,11 +723,10 @@ static inline void get_completion_cb(void *ctx) oshmem_request_free((oshmem_request_t**) &get_req); } -/* extension. used 4 fence implementation b4 fence was added to mxm */ -int mca_spml_ikrit_get_async(void *src_addr, - size_t size, - void *dst_addr, - int src) +static inline int mca_spml_ikrit_get_async(void *src_addr, + size_t size, + void *dst_addr, + int src) { mca_spml_ikrit_get_request_t *get_req; @@ -868,7 +827,7 @@ static inline void put_completion_cb(void *ctx) peer->n_active_puts--; if (0 == peer->n_active_puts && (put_req->mxm_req.opcode == MXM_REQ_OP_PUT_SYNC)) { - opal_list_remove_item(&mca_spml_ikrit.active_peers, &peer->super); + opal_list_remove_item(&mca_spml_ikrit.active_peers, &peer->link); peer->need_fence = 0; } } @@ -911,13 +870,11 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, return OSHMEM_ERROR; } -#if SPML_IKRIT_PUT_DEBUG == 1 - - SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", - dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); -#endif - if (ptl_id == MXM_PTL_SHM) { + SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. 
dst_rva=%p, %s", + dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, + mca_spml_base_mkey2str(r_mkey)); + if (OPAL_UNLIKELY(MXM_PTL_SHM == ptl_id)) { if (mca_memheap_base_can_local_copy(r_mkey, dst_addr)) { memcpy((void *) (unsigned long) rva, src_addr, size); /* call progress as often as we would have with regular put */ @@ -926,8 +883,7 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, return OSHMEM_SUCCESS; } /* segment not mapped - fallback to rmda */ - ptl_id = MXM_PTL_RDMA; - r_mkey = mca_memheap_base_get_cached_mkey(dst, dst_addr, ptl_id, &rva); + r_mkey = mca_memheap_base_get_cached_mkey(dst, dst_addr, MXM_PTL_RDMA, &rva); if (!r_mkey) { SPML_ERROR("pe=%d: %p is not address of shared variable", dst, dst_addr); @@ -936,10 +892,9 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, } } -#if SPML_IKRIT_PUT_DEBUG == 1 - SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", - dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); -#endif + SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", + dst, MXM_PTL_RDMA, dst_addr, src_addr, (int)size, (void *)rva, + mca_spml_base_mkey2str(r_mkey)); put_req = alloc_put_req(); if (NULL == put_req) { @@ -996,7 +951,7 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, 1); if (mca_spml_ikrit.mxm_peers[dst].need_fence == 0) { opal_list_append(&mca_spml_ikrit.active_peers, - &mca_spml_ikrit.mxm_peers[dst].super); + &mca_spml_ikrit.mxm_peers[dst].link); mca_spml_ikrit.mxm_peers[dst].need_fence = 1; } @@ -1038,12 +993,11 @@ int mca_spml_ikrit_put_simple(void* dst_addr, return OSHMEM_ERROR; } -#if SPML_IKRIT_PUT_DEBUG == 1 - SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. 
dst_rva=%p, %s", - dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); -#endif - if (ptl_id == MXM_PTL_SHM) { + SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", + dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, + mca_spml_base_mkey2str(r_mkey)); + if (MXM_PTL_SHM == ptl_id) { if (mca_memheap_base_can_local_copy(r_mkey, dst_addr)) { memcpy((void *) (unsigned long) rva, src_addr, size); /* call progress as often as we would have with regular put */ @@ -1052,11 +1006,10 @@ int mca_spml_ikrit_put_simple(void* dst_addr, return OSHMEM_SUCCESS; } /* segment not mapped - fallback to rmda */ - ptl_id = MXM_PTL_RDMA; r_mkey = mca_memheap_base_get_cached_mkey(dst, //(unsigned long) dst_addr, dst_addr, - ptl_id, + MXM_PTL_RDMA, &rva); if (!r_mkey) { SPML_ERROR("pe=%d: %p is not address of shared variable", @@ -1066,10 +1019,9 @@ int mca_spml_ikrit_put_simple(void* dst_addr, } } -#if SPML_IKRIT_PUT_DEBUG == 1 - SPML_VERBOSE(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", - dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); -#endif + SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", + dst, MXM_PTL_RDMA, dst_addr, src_addr, (int)size, (void *)rva, + mca_spml_base_mkey2str(r_mkey)); /* fill out request */ mxm_req.base.mq = mca_spml_ikrit.mxm_mq; @@ -1089,7 +1041,7 @@ int mca_spml_ikrit_put_simple(void* dst_addr, if (mca_spml_ikrit.mxm_peers[dst].need_fence == 0) { opal_list_append(&mca_spml_ikrit.active_peers, - &mca_spml_ikrit.mxm_peers[dst].super); + &mca_spml_ikrit.mxm_peers[dst].link); mca_spml_ikrit.mxm_peers[dst].need_fence = 1; } @@ -1163,7 +1115,7 @@ int mca_spml_ikrit_fence(void) /* puts(unless are send sync) are completed by remote side lazily. That is either when remote decides to * ack window which can take hundreds of ms. 
So speed things up by doing fence */ while (NULL != (item = opal_list_remove_first(&mca_spml_ikrit.active_peers))) { - peer = (mxm_peer_t *) item; + peer = spml_ikrit_container_of(item, mxm_peer_t, link); peer->n_active_puts = 0; peer->need_fence = 0; mca_spml_ikrit_mxm_fence(peer->pe); diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.h b/oshmem/mca/spml/ikrit/spml_ikrit.h index f4efed4e8ad..8580a24c05e 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit.h +++ b/oshmem/mca/spml/ikrit/spml_ikrit.h @@ -47,22 +47,31 @@ /* request explicit ack (SYNC) per every X put requests per connection */ #define SPML_IKRIT_PACKETS_PER_SYNC 64 +#define spml_ikrit_container_of(ptr, type, member) ( \ + (type *)( ((char *)(ptr)) - offsetof(type,member) )) + +#define MXM_MAX_ADDR_LEN 512 + +#define MXM_PTL_RDMA 0 +#define MXM_PTL_SHM 1 +#define MXM_PTL_LAST 2 + BEGIN_C_DECLS /** * UD MXM SPML module */ struct mxm_peer { - opal_list_item_t super; mxm_conn_h mxm_conn; mxm_conn_h mxm_hw_rdma_conn; - int pe; + uint8_t ptl_id; + opal_list_item_t link; int32_t n_active_puts; - int need_fence; + uint32_t pe; + uint8_t need_fence; }; typedef struct mxm_peer mxm_peer_t; -OBJ_CLASS_DECLARATION(mxm_peer_t); struct mca_spml_ikrit_t { mca_spml_base_module_t super; @@ -105,11 +114,6 @@ struct mca_spml_ikrit_t { typedef struct mca_spml_ikrit_t mca_spml_ikrit_t; -#define MXM_MAX_ADDR_LEN 512 - -#define MXM_PTL_RDMA 0 -#define MXM_PTL_SHM 1 -#define MXM_PTL_LAST 2 typedef struct spml_ikrit_mxm_ep_conn_info_t { union { @@ -130,11 +134,6 @@ extern int mca_spml_ikrit_get_nb(void* src_addr, void* dst_addr, int src, void **handle); -/* extension. 
used 4 fence implementation b4 fence was added to mxm */ -extern int mca_spml_ikrit_get_async(void *src_addr, - size_t size, - void *dst_addr, - int src); extern int mca_spml_ikrit_put(void* dst_addr, size_t size, From 61bd59a3693fa56d239f76d8fdea81e4c6eceb21 Mon Sep 17 00:00:00 2001 From: Alex Mikheev Date: Thu, 27 Oct 2016 14:15:35 +0300 Subject: [PATCH 06/14] OSHMEM: fixes addr_accessible() to check every possible transport Signed-off-by: Alex Mikheev --- oshmem/mca/memheap/base/base.h | 5 +++++ oshmem/shmem/c/shmem_addr_accessible.c | 10 ++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/oshmem/mca/memheap/base/base.h b/oshmem/mca/memheap/base/base.h index 34e92346d54..b8da84f4d9f 100644 --- a/oshmem/mca/memheap/base/base.h +++ b/oshmem/mca/memheap/base/base.h @@ -226,6 +226,11 @@ static inline sshmem_mkey_t *mca_memheap_base_get_cached_mkey(int pe, return mca_memheap_base_get_cached_mkey_slow(s, pe, va, btl_id, rva); } +static inline int mca_memheap_base_num_transports(void) +{ + return memheap_map->num_transports; +} + END_C_DECLS #endif /* MCA_MEMHEAP_BASE_H */ diff --git a/oshmem/shmem/c/shmem_addr_accessible.c b/oshmem/shmem/c/shmem_addr_accessible.c index 7d1a2486f86..8d44ff41818 100644 --- a/oshmem/shmem/c/shmem_addr_accessible.c +++ b/oshmem/shmem/c/shmem_addr_accessible.c @@ -26,10 +26,16 @@ int shmem_addr_accessible(const void *addr, int pe) { void* rva; sshmem_mkey_t *mkey; + int i; RUNTIME_CHECK_INIT(); - mkey = mca_memheap_base_get_cached_mkey(pe, (void *)addr, oshmem_get_transport_id(pe), &rva); + for (i = 0; i < mca_memheap_base_num_transports(); i++) { + mkey = mca_memheap_base_get_cached_mkey(pe, (void *)addr, i, &rva); + if (mkey) { + return 1; + } + } - return mkey ? 
1 : 0; + return 0; } From defcc3ddc1e4655a6c48b50ae31cb9398155b183 Mon Sep 17 00:00:00 2001 From: Alex Mikheev Date: Thu, 27 Oct 2016 17:00:47 +0300 Subject: [PATCH 07/14] OSHMEM: spml ikrit: get/put request cleanup Signed-off-by: Alex Mikheev --- oshmem/mca/spml/ikrit/spml_ikrit.c | 180 ++++++----------------------- oshmem/mca/spml/ikrit/spml_ikrit.h | 3 +- 2 files changed, 36 insertions(+), 147 deletions(-) diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.c b/oshmem/mca/spml/ikrit/spml_ikrit.c index 638eb2b9653..c8409979ef7 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit.c +++ b/oshmem/mca/spml/ikrit/spml_ikrit.c @@ -60,20 +60,13 @@ static int mca_spml_ikrit_get_async(void *src_addr, void *dst_addr, int src); -typedef struct spml_ikrit_am_hdr { - uint64_t va; -} spml_ikrit_am_hdr_t; - struct mca_spml_ikrit_put_request { - mca_spml_base_put_request_t req_put; - mxm_send_req_t mxm_req; - int pe; - mxm_req_buffer_t iov[2]; - spml_ikrit_am_hdr_t am_pkt; + opal_free_list_item_t link; /* must be a first member */ + mxm_send_req_t mxm_req; + int pe; }; typedef struct mca_spml_ikrit_put_request mca_spml_ikrit_put_request_t; -OBJ_CLASS_DECLARATION(mca_spml_ikrit_put_request_t); static inline mxm_mem_key_t *to_mxm_mkey(sshmem_mkey_t *mkey) { @@ -94,95 +87,55 @@ static inline void mca_spml_irkit_req_wait(mxm_req_base_t *req) } while (!mxm_req_test(req)); } -static int mca_spml_ikrit_put_request_free(struct oshmem_request_t** request) +static inline void free_put_req(mca_spml_ikrit_put_request_t *put_req) { - mca_spml_ikrit_put_request_t *put_req = - *(mca_spml_ikrit_put_request_t **) request; - - OPAL_THREAD_LOCK(&oshmem_request_lock); - assert(false == put_req->req_put.req_base.req_free_called); - put_req->req_put.req_base.req_free_called = true; opal_free_list_return (&mca_spml_base_put_requests, (opal_free_list_item_t*)put_req); opal_memchecker_base_mem_noaccess(put_req, sizeof(*put_req)); - OPAL_THREAD_UNLOCK(&oshmem_request_lock); - - *request = SHMEM_REQUEST_NULL; 
/*MPI_REQUEST_NULL;*/ - - return OSHMEM_SUCCESS; } -static int mca_spml_ikrit_put_request_cancel(struct oshmem_request_t * request, - int complete) +static inline mca_spml_ikrit_put_request_t *alloc_put_req(void) { - return OSHMEM_SUCCESS; -} + mca_spml_ikrit_put_request_t *req; + opal_free_list_item_t* item; -static void mca_spml_ikrit_put_request_construct(mca_spml_ikrit_put_request_t* req) -{ - req->req_put.req_base.req_type = MCA_SPML_REQUEST_PUT; - req->req_put.req_base.req_oshmem.req_free = mca_spml_ikrit_put_request_free; - req->req_put.req_base.req_oshmem.req_cancel = - mca_spml_ikrit_put_request_cancel; -} + item = opal_free_list_wait (&mca_spml_base_put_requests); + assert(item != NULL); -static void mca_spml_ikrit_put_request_destruct(mca_spml_ikrit_put_request_t* req) -{ + req = (mca_spml_ikrit_put_request_t *) item; + opal_memchecker_base_mem_undefined(req, sizeof(*req)); + + return req; } -OBJ_CLASS_INSTANCE( mca_spml_ikrit_put_request_t, - mca_spml_base_put_request_t, - mca_spml_ikrit_put_request_construct, - mca_spml_ikrit_put_request_destruct); struct mca_spml_ikrit_get_request { - mca_spml_base_get_request_t req_get; - mxm_send_req_t mxm_req; + opal_free_list_item_t link; /* must be a first member */ + mxm_send_req_t mxm_req; }; typedef struct mca_spml_ikrit_get_request mca_spml_ikrit_get_request_t; -OBJ_CLASS_DECLARATION(mca_spml_ikrit_get_request_t); -static int mca_spml_ikrit_get_request_free(struct oshmem_request_t** request) +static inline void free_get_req(mca_spml_ikrit_get_request_t *get_req) { - mca_spml_ikrit_get_request_t *get_req = - *(mca_spml_ikrit_get_request_t **) request; - - OPAL_THREAD_LOCK(&oshmem_request_lock); - assert(false == get_req->req_get.req_base.req_free_called); - get_req->req_get.req_base.req_free_called = true; opal_free_list_return (&mca_spml_base_get_requests, (opal_free_list_item_t*)get_req); opal_memchecker_base_mem_noaccess(get_req, sizeof(*get_req)); - OPAL_THREAD_UNLOCK(&oshmem_request_lock); - - *request = 
SHMEM_REQUEST_NULL; /*MPI_REQUEST_NULL;*/ - - return OSHMEM_SUCCESS; } -static int mca_spml_ikrit_get_request_cancel(struct oshmem_request_t * request, - int complete) +static inline mca_spml_ikrit_get_request_t *alloc_get_req(void) { - return OSHMEM_SUCCESS; -} + mca_spml_ikrit_get_request_t *req; + opal_free_list_item_t* item; -static void mca_spml_ikrit_get_request_construct(mca_spml_ikrit_get_request_t* req) -{ - req->req_get.req_base.req_type = MCA_SPML_REQUEST_GET; - req->req_get.req_base.req_oshmem.req_free = mca_spml_ikrit_get_request_free; - req->req_get.req_base.req_oshmem.req_cancel = - mca_spml_ikrit_get_request_cancel; -} + item = opal_free_list_wait (&mca_spml_base_get_requests); + assert(item != NULL); -static void mca_spml_ikrit_get_request_destruct(mca_spml_ikrit_get_request_t* req) -{ + req = (mca_spml_ikrit_get_request_t *) item; + opal_memchecker_base_mem_undefined(req, sizeof(*req)); + return req; } -OBJ_CLASS_INSTANCE( mca_spml_ikrit_get_request_t, - mca_spml_base_get_request_t, - mca_spml_ikrit_get_request_construct, - mca_spml_ikrit_get_request_destruct); int mca_spml_ikrit_put_simple(void* dst_addr, size_t size, @@ -214,42 +167,6 @@ mca_spml_ikrit_t mca_spml_ikrit = { } }; -static inline mca_spml_ikrit_put_request_t *alloc_put_req(void) -{ - mca_spml_ikrit_put_request_t *req; - opal_free_list_item_t* item; - - item = opal_free_list_wait (&mca_spml_base_put_requests); - - req = (mca_spml_ikrit_put_request_t *) item; - opal_memchecker_base_mem_undefined(req, sizeof(*req)); - opal_memchecker_base_mem_defined(&req->req_put.req_base, - sizeof(req->req_put.req_base)); - - req->req_put.req_base.req_free_called = false; - req->req_put.req_base.req_oshmem.req_complete = false; - - return req; -} - -static inline mca_spml_ikrit_get_request_t *alloc_get_req(void) -{ - mca_spml_ikrit_get_request_t *req; - opal_free_list_item_t* item; - - item = opal_free_list_wait (&mca_spml_base_get_requests); - - req = (mca_spml_ikrit_get_request_t *) item; - 
opal_memchecker_base_mem_undefined(req, sizeof(*req)); - opal_memchecker_base_mem_defined(&req->req_get.req_base, - sizeof(req->req_get.req_base)); - - req->req_get.req_base.req_free_called = false; - req->req_get.req_base.req_oshmem.req_complete = false; - - return req; -} - int mca_spml_ikrit_enable(bool enable) { SPML_VERBOSE(50, "*** ikrit ENABLED ****"); @@ -260,7 +177,7 @@ int mca_spml_ikrit_enable(bool enable) opal_free_list_init (&mca_spml_base_put_requests, sizeof(mca_spml_ikrit_put_request_t), opal_cache_line_size, - OBJ_CLASS(mca_spml_ikrit_put_request_t), + OBJ_CLASS(opal_free_list_item_t), 0, opal_cache_line_size, mca_spml_ikrit.free_list_num, @@ -271,7 +188,7 @@ int mca_spml_ikrit_enable(bool enable) opal_free_list_init (&mca_spml_base_get_requests, sizeof(mca_spml_ikrit_get_request_t), opal_cache_line_size, - OBJ_CLASS(mca_spml_ikrit_get_request_t), + OBJ_CLASS(opal_free_list_item_t), 0, opal_cache_line_size, mca_spml_ikrit.free_list_num, @@ -286,7 +203,6 @@ int mca_spml_ikrit_enable(bool enable) static void mxm_peer_construct(mxm_peer_t *p) { - p->pe = -1; p->n_active_puts = 0; p->need_fence = 0; p->ptl_id = MXM_PTL_RDMA; @@ -391,7 +307,6 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) * that list have different order on every rank */ i = (my_rank + n) % nprocs; mxm_peer_construct(&mca_spml_ikrit.mxm_peers[i]); - mca_spml_ikrit.mxm_peers[i].pe = i; err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, ep_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i].mxm_conn); if (MXM_OK != err) { @@ -716,11 +631,7 @@ static inline void get_completion_cb(void *ctx) mca_spml_ikrit_get_request_t *get_req = (mca_spml_ikrit_get_request_t *) ctx; OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_gets, -1); - get_req->req_get.req_base.req_spml_complete = true; - get_req->req_get.req_base.req_oshmem.req_status.SHMEM_ERROR = - OSHMEM_SUCCESS; - oshmem_request_complete(&get_req->req_get.req_base.req_oshmem, 1); - oshmem_request_free((oshmem_request_t**) &get_req); + 
free_get_req(get_req); } static inline int mca_spml_ikrit_get_async(void *src_addr, @@ -734,11 +645,6 @@ static inline int mca_spml_ikrit_get_async(void *src_addr, return OSHMEM_SUCCESS; get_req = alloc_get_req(); - if (NULL == get_req) { - SPML_ERROR("out of get requests - aborting"); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } if (OSHMEM_SUCCESS != mca_spml_ikrit_get_helper(&get_req->mxm_req, @@ -766,11 +672,7 @@ static inline void fence_completion_cb(void *ctx) (mca_spml_ikrit_get_request_t *) ctx; OPAL_THREAD_ADD32(&mca_spml_ikrit.n_mxm_fences, -1); - fence_req->req_get.req_base.req_spml_complete = true; - fence_req->req_get.req_base.req_oshmem.req_status.SHMEM_ERROR = - OSHMEM_SUCCESS; - oshmem_request_complete(&fence_req->req_get.req_base.req_oshmem, 1); - oshmem_request_free((oshmem_request_t**) &fence_req); + free_get_req(fence_req); } static int mca_spml_ikrit_mxm_fence(int dst) @@ -778,11 +680,6 @@ static int mca_spml_ikrit_mxm_fence(int dst) mca_spml_ikrit_get_request_t *fence_req; fence_req = alloc_get_req(); - if (NULL == fence_req) { - SPML_ERROR("out of get requests - aborting"); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } fence_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq; fence_req->mxm_req.base.conn = mca_spml_ikrit.mxm_peers[dst].mxm_conn; @@ -808,6 +705,7 @@ static inline void put_completion_cb(void *ctx) mxm_peer_t *peer; OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, -1); + /* TODO: keep pointer to peer in the request */ peer = &mca_spml_ikrit.mxm_peers[put_req->pe]; /* this was last put in progress. 
Remove peer from the list so that we do not need explicit fence */ @@ -832,11 +730,7 @@ static inline void put_completion_cb(void *ctx) } } - put_req->req_put.req_base.req_spml_complete = true; - put_req->req_put.req_base.req_oshmem.req_status.SHMEM_ERROR = - OSHMEM_SUCCESS; - oshmem_request_complete(&put_req->req_put.req_base.req_oshmem, 1); - oshmem_request_free((oshmem_request_t**) &put_req); + free_put_req(put_req); } /** @@ -882,7 +776,7 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, mxm_progress(mca_spml_ikrit.mxm_context); return OSHMEM_SUCCESS; } - /* segment not mapped - fallback to rmda */ + /* segment not mapped - fallback to rdma */ r_mkey = mca_memheap_base_get_cached_mkey(dst, dst_addr, MXM_PTL_RDMA, &rva); if (!r_mkey) { SPML_ERROR("pe=%d: %p is not address of shared variable", @@ -897,11 +791,7 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, mca_spml_base_mkey2str(r_mkey)); put_req = alloc_put_req(); - if (NULL == put_req) { - SPML_ERROR("out of put requests - aborting"); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } + if (handle) *handle = put_req; @@ -1118,15 +1008,15 @@ int mca_spml_ikrit_fence(void) peer = spml_ikrit_container_of(item, mxm_peer_t, link); peer->n_active_puts = 0; peer->need_fence = 0; - mca_spml_ikrit_mxm_fence(peer->pe); + mca_spml_ikrit_mxm_fence(peer - mca_spml_ikrit.mxm_peers); } while (0 < mca_spml_ikrit.n_mxm_fences) { - oshmem_request_wait_any_completion(); + opal_progress(); } while (0 < mca_spml_ikrit.n_active_gets) { - oshmem_request_wait_any_completion(); + opal_progress(); } SPML_VERBOSE(20, "fence completed"); diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.h b/oshmem/mca/spml/ikrit/spml_ikrit.h index 8580a24c05e..1a42b91e723 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit.h +++ b/oshmem/mca/spml/ikrit/spml_ikrit.h @@ -59,7 +59,7 @@ BEGIN_C_DECLS /** - * UD MXM SPML module + * MXM SPML module */ struct mxm_peer { mxm_conn_h mxm_conn; @@ -67,7 +67,6 @@ struct mxm_peer { uint8_t 
ptl_id; opal_list_item_t link; int32_t n_active_puts; - uint32_t pe; uint8_t need_fence; }; From 7caa736533752fbca1a7736870191af041fc0f3f Mon Sep 17 00:00:00 2001 From: Alex Mikheev Date: Wed, 2 Nov 2016 18:57:10 +0200 Subject: [PATCH 08/14] OSHMEM: fixes potential deadlock in shmem_lock() Signed-off-by: Alex Mikheev --- oshmem/shmem/c/shmem_lock.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/oshmem/shmem/c/shmem_lock.c b/oshmem/shmem/c/shmem_lock.c index 3d167b61d57..dc469461acf 100644 --- a/oshmem/shmem/c/shmem_lock.c +++ b/oshmem/shmem/c/shmem_lock.c @@ -270,7 +270,11 @@ static uint64_t shmem_lock_cswap(void *target, prev_value = prev_value_32; } - + /* function is used to busy wait for the value. + * Call opal_progress() so that ompi will not deadlock + * (for example may need to respond to rkey requests) + */ + opal_progress(); return prev_value; } From 5c2f807ef8078c5365f71715a16b4cd14344f443 Mon Sep 17 00:00:00 2001 From: Alex Mikheev Date: Wed, 2 Nov 2016 18:57:51 +0200 Subject: [PATCH 09/14] OSHMEM: fixes verbosity log level cal Signed-off-by: Alex Mikheev --- oshmem/util/oshmem_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oshmem/util/oshmem_util.c b/oshmem/util/oshmem_util.c index f97309906cd..ac64d42a0ab 100644 --- a/oshmem/util/oshmem_util.c +++ b/oshmem/util/oshmem_util.c @@ -23,7 +23,7 @@ void oshmem_output_verbose(int level, int output_id, const char* prefix, char *buff, *str; int ret = 0; - if (level < opal_output_get_verbosity(output_id)) { + if (level <= opal_output_get_verbosity(output_id)) { UNREFERENCED_PARAMETER(ret); va_start(args, format); From ff5095e533e9921adbd27c43c73fb591c87a873a Mon Sep 17 00:00:00 2001 From: Alex Mikheev Date: Wed, 2 Nov 2016 18:59:06 +0200 Subject: [PATCH 10/14] OSHMEM: adds support for mkey caching by spml It improves cpu cache hit ratio. 
Signed-off-by: Alex Mikheev --- oshmem/mca/atomic/mxm/atomic_mxm.h | 74 ++++++- oshmem/mca/atomic/mxm/atomic_mxm_cswap.c | 65 +----- oshmem/mca/atomic/mxm/atomic_mxm_fadd.c | 72 +----- oshmem/mca/memheap/base/base.h | 100 +++++++-- oshmem/mca/memheap/base/memheap_base_frame.c | 2 +- oshmem/mca/memheap/base/memheap_base_mkey.c | 80 +++---- .../mca/memheap/base/memheap_base_register.c | 16 +- oshmem/mca/memheap/base/memheap_base_select.c | 4 +- oshmem/mca/memheap/base/memheap_base_static.c | 8 +- oshmem/mca/spml/base/base.h | 2 +- oshmem/mca/spml/base/spml_base.c | 4 +- oshmem/mca/spml/ikrit/spml_ikrit.c | 206 +++++++++--------- oshmem/mca/spml/ikrit/spml_ikrit.h | 39 +++- oshmem/mca/spml/spml.h | 6 +- oshmem/mca/spml/ucx/spml_ucx.c | 58 +++-- oshmem/mca/spml/ucx/spml_ucx.h | 44 ++-- oshmem/mca/sshmem/mmap/sshmem_mmap_module.c | 14 +- oshmem/mca/sshmem/sshmem_types.h | 29 ++- oshmem/mca/sshmem/sysv/sshmem_sysv_module.c | 10 +- oshmem/mca/sshmem/verbs/sshmem_verbs_module.c | 8 +- 20 files changed, 451 insertions(+), 390 deletions(-) diff --git a/oshmem/mca/atomic/mxm/atomic_mxm.h b/oshmem/mca/atomic/mxm/atomic_mxm.h index c2a98b892f5..64478b2b66a 100644 --- a/oshmem/mca/atomic/mxm/atomic_mxm.h +++ b/oshmem/mca/atomic/mxm/atomic_mxm.h @@ -19,6 +19,7 @@ /* This component does uses SPML:IKRIT */ #include "oshmem/mca/spml/ikrit/spml_ikrit.h" +#include "oshmem/runtime/runtime.h" BEGIN_C_DECLS @@ -60,15 +61,76 @@ struct mca_atomic_mxm_module_t { typedef struct mca_atomic_mxm_module_t mca_atomic_mxm_module_t; OBJ_CLASS_DECLARATION(mca_atomic_mxm_module_t); -END_C_DECLS -/* move to spml/ikrit */ -static inline mxm_mem_key_t *to_mxm_mkey(sshmem_mkey_t *mkey) { +static inline uint8_t mca_atomic_mxm_order(size_t nlong) +{ + if (OPAL_LIKELY(8 == nlong)) { + return 3; + } + + if (OPAL_LIKELY(4 == nlong)) { + return 2; + } + + if (2 == nlong) { + return 1; + } - if (0 == mkey->len) { - return &mxm_empty_mem_key; + if (1 == nlong) { + return 0; } - return (mxm_mem_key_t 
*)mkey->u.data; + + ATOMIC_ERROR("Type size must be 1/2/4 or 8 bytes."); + oshmem_shmem_abort(-1); + return OSHMEM_ERR_BAD_PARAM; } +static inline void mca_atomic_mxm_req_init(mxm_send_req_t *sreq, int pe, void *target, size_t nlong) +{ + uint8_t nlong_order; + void *remote_addr; + mxm_mem_key_t *mkey; + + nlong_order = mca_atomic_mxm_order(nlong); + + mkey = mca_spml_ikrit_get_mkey(pe, target, MXM_PTL_RDMA, &remote_addr); + + /* mxm request init */ + sreq->base.state = MXM_REQ_NEW; + sreq->base.mq = mca_atomic_mxm_spml_self->mxm_mq; + sreq->base.conn = mca_atomic_mxm_spml_self->mxm_peers[pe].mxm_hw_rdma_conn; + sreq->base.completed_cb = NULL; + sreq->base.data_type = MXM_REQ_DATA_BUFFER; + + sreq->base.data.buffer.memh = MXM_INVALID_MEM_HANDLE; + sreq->base.data.buffer.length = nlong; + + sreq->op.atomic.remote_vaddr = (uintptr_t) remote_addr; + sreq->op.atomic.remote_mkey = mkey; + sreq->op.atomic.order = nlong_order; + + sreq->flags = 0; +} + +static inline void mca_atomic_mxm_post(mxm_send_req_t *sreq) +{ + mxm_error_t mxm_err; + + mxm_err = mxm_req_send(sreq); + if (OPAL_UNLIKELY(MXM_OK != mxm_err)) { + ATOMIC_ERROR("mxm_req_send failed, mxm_error = %d", + mxm_err); + oshmem_shmem_abort(-1); + } + + mxm_req_wait(&sreq->base); + if (OPAL_UNLIKELY(MXM_OK != sreq->base.error)) { + ATOMIC_ERROR("mxm_req_wait got non MXM_OK error: %d", + sreq->base.error); + oshmem_shmem_abort(-1); + } +} + +END_C_DECLS + #endif /* MCA_ATOMIC_MXM_H */ diff --git a/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c b/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c index e75dc3a8f5b..bb6c675a03c 100644 --- a/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c +++ b/oshmem/mca/atomic/mxm/atomic_mxm_cswap.c @@ -31,60 +31,11 @@ int mca_atomic_mxm_cswap(void *target, size_t nlong, int pe) { - unsigned my_pe; - uint8_t nlong_order; - void *remote_addr; mxm_send_req_t sreq; - mxm_error_t mxm_err; - sshmem_mkey_t *r_mkey; - my_pe = oshmem_my_proc_id(); - mxm_err = MXM_OK; + mca_atomic_mxm_req_init(&sreq, pe, target, 
nlong); - switch (nlong) { - case 1: - nlong_order = 0; - break; - case 2: - nlong_order = 1; - break; - case 4: - nlong_order = 2; - break; - case 8: - nlong_order = 3; - break; - default: - ATOMIC_ERROR("[#%d] Type size must be 1/2/4 or 8 bytes.", my_pe); - oshmem_shmem_abort(-1); - return OSHMEM_ERR_BAD_PARAM; - } - - r_mkey = mca_memheap_base_get_cached_mkey(pe, target, MXM_PTL_RDMA, &remote_addr); - if (!r_mkey) { - ATOMIC_ERROR("[#%d] %p is not address of symmetric variable", - my_pe, target); - oshmem_shmem_abort(-1); - return OSHMEM_ERR_BAD_PARAM; - } - - /* mxm request init */ - sreq.base.state = MXM_REQ_NEW; - sreq.base.mq = mca_atomic_mxm_spml_self->mxm_mq; - sreq.base.conn = mca_atomic_mxm_spml_self->mxm_peers[pe].mxm_hw_rdma_conn; - sreq.base.completed_cb = NULL; - sreq.base.data_type = MXM_REQ_DATA_BUFFER; - - /* set data */ sreq.base.data.buffer.ptr = (void *) value; - sreq.base.data.buffer.length = nlong; - sreq.base.data.buffer.memh = MXM_INVALID_MEM_HANDLE; - - sreq.op.atomic.remote_vaddr = (uintptr_t) remote_addr; - sreq.flags = 0; - sreq.op.atomic.remote_mkey = to_mxm_mkey(r_mkey); - sreq.op.atomic.order = nlong_order; - if (NULL == cond) { sreq.opcode = MXM_REQ_OP_ATOMIC_SWAP; } else { @@ -92,20 +43,8 @@ int mca_atomic_mxm_cswap(void *target, sreq.opcode = MXM_REQ_OP_ATOMIC_CSWAP; } - if (MXM_OK != (mxm_err = mxm_req_send(&sreq))) { - ATOMIC_ERROR("[#%d] mxm_req_send failed, mxm_error = %d", - my_pe, mxm_err); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } + mca_atomic_mxm_post(&sreq); - mxm_req_wait(&sreq.base); - if (MXM_OK != sreq.base.error) { - ATOMIC_ERROR("[#%d] mxm_req_wait got non MXM_OK error: %d", - my_pe, sreq.base.error); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } memcpy(prev, value, nlong); return OSHMEM_SUCCESS; diff --git a/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c b/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c index afde70dcadd..54676ceacec 100644 --- a/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c +++ 
b/oshmem/mca/atomic/mxm/atomic_mxm_fadd.c @@ -32,86 +32,20 @@ int mca_atomic_mxm_fadd(void *target, int pe, struct oshmem_op_t *op) { - unsigned my_pe; - uint8_t nlong_order; - void *remote_addr; mxm_send_req_t sreq; - mxm_error_t mxm_err; - sshmem_mkey_t *r_mkey; static char dummy_buf[8]; - my_pe = oshmem_my_proc_id(); - mxm_err = MXM_OK; + mca_atomic_mxm_req_init(&sreq, pe, target, nlong); - switch (nlong) { - case 1: - nlong_order = 0; - break; - case 2: - nlong_order = 1; - break; - case 4: - nlong_order = 2; - break; - case 8: - nlong_order = 3; - break; - default: - ATOMIC_ERROR("[#%d] Type size must be 1/2/4 or 8 bytes.", my_pe); - oshmem_shmem_abort(-1); - return OSHMEM_ERR_BAD_PARAM; - } - - r_mkey = mca_memheap_base_get_cached_mkey(pe, target, MXM_PTL_RDMA, &remote_addr); - if (!r_mkey) { - ATOMIC_ERROR("[#%d] %p is not address of symmetric variable", - my_pe, target); - oshmem_shmem_abort(-1); - return OSHMEM_ERR_BAD_PARAM; - } - - /* mxm request init */ - sreq.base.state = MXM_REQ_NEW; - sreq.base.mq = mca_atomic_mxm_spml_self->mxm_mq; - sreq.base.conn = mca_atomic_mxm_spml_self->mxm_peers[pe].mxm_hw_rdma_conn; - sreq.base.completed_cb = NULL; - sreq.base.data_type = MXM_REQ_DATA_BUFFER; - - sreq.op.atomic.remote_vaddr = (uintptr_t) remote_addr; - sreq.op.atomic.remote_mkey = to_mxm_mkey(r_mkey); memcpy(&sreq.op.atomic.value, value, nlong); - sreq.op.atomic.order = nlong_order; - - /* Do we need atomic 'add' or atomic 'fetch and add'? 
*/ + sreq.opcode = MXM_REQ_OP_ATOMIC_FADD; if (NULL == prev) { sreq.base.data.buffer.ptr = dummy_buf; - sreq.base.data.buffer.length = nlong; - sreq.base.data.buffer.memh = MXM_INVALID_MEM_HANDLE; - sreq.flags = 0; - sreq.opcode = MXM_REQ_OP_ATOMIC_FADD; } else { sreq.base.data.buffer.ptr = prev; - sreq.base.data.buffer.length = nlong; - sreq.base.data.buffer.memh = MXM_INVALID_MEM_HANDLE; - sreq.flags = 0; - - sreq.opcode = MXM_REQ_OP_ATOMIC_FADD; } - if (MXM_OK != (mxm_err = mxm_req_send(&sreq))) { - ATOMIC_ERROR("[#%d] mxm_req_send failed, mxm_error = %d", - my_pe, mxm_err); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } - - mxm_req_wait(&sreq.base); - if (MXM_OK != sreq.base.error) { - ATOMIC_ERROR("[#%d] mxm_req_wait got non MXM_OK error: %d", - my_pe, sreq.base.error); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } + mca_atomic_mxm_post(&sreq); return OSHMEM_SUCCESS; } diff --git a/oshmem/mca/memheap/base/base.h b/oshmem/mca/memheap/base/base.h index b8da84f4d9f..222cfd9e813 100644 --- a/oshmem/mca/memheap/base/base.h +++ b/oshmem/mca/memheap/base/base.h @@ -44,9 +44,13 @@ extern char* mca_memheap_base_exclude; extern int mca_memheap_base_already_opened; extern int mca_memheap_base_key_exchange; -#define MCA_MEMHEAP_MAX_SEGMENTS 256 +#define MCA_MEMHEAP_MAX_SEGMENTS 4 #define HEAP_SEG_INDEX 0 #define SYMB_SEG_INDEX 1 +#define MCA_MEMHEAP_SEG_COUNT (SYMB_SEG_INDEX+1) + +#define MEMHEAP_SEG_INVALID 0xFFFF + typedef struct mca_memheap_map { map_segment_t mem_segs[MCA_MEMHEAP_MAX_SEGMENTS]; /* TODO: change into pointer array */ @@ -158,41 +162,97 @@ extern int mca_memheap_seg_cmp(const void *k, const void *v); extern mca_memheap_map_t* memheap_map; +static inline int map_segment_is_va_in(map_base_segment_t *s, const void *va) +{ + return ((uintptr_t)va >= (uintptr_t)s->va_base && + (uintptr_t)va < (uintptr_t)s->va_end); +} + +static inline map_segment_t *memheap_find_seg(const int segno) +{ + return &mca_memheap_base_map.mem_segs[segno]; +} + +static 
inline int memheap_is_va_in_segment(const void *va, const int segno) +{ + + return map_segment_is_va_in(&memheap_find_seg(segno)->super, va); +} + +static inline int memheap_find_segnum(const void *va) +{ + if (OPAL_LIKELY(memheap_is_va_in_segment(va, SYMB_SEG_INDEX))) { + return SYMB_SEG_INDEX; + } else if (memheap_is_va_in_segment(va, HEAP_SEG_INDEX)) { + return HEAP_SEG_INDEX; + } + return MEMHEAP_SEG_INVALID; +} + +static inline void* memheap_va2rva(const void* va, const void* local_base, const void* remote_base) +{ + return (void*) (remote_base > local_base ? + (uintptr_t)va + ((uintptr_t)remote_base - (uintptr_t)local_base) : + (uintptr_t)va - ((uintptr_t)local_base - (uintptr_t)remote_base)); +} + +static inline void *map_segment_va2rva(mkey_segment_t *seg, const void *va) +{ + return memheap_va2rva(va, seg->super.va_base, seg->rva_base); +} + +static inline map_base_segment_t *map_segment_find_va(map_base_segment_t *segs, size_t elem_size, const void *va) +{ + map_base_segment_t *rseg; + + rseg = (map_base_segment_t *)((char *)segs + elem_size * HEAP_SEG_INDEX); + if (OPAL_LIKELY(map_segment_is_va_in(rseg, va))) { + return rseg; + } + + rseg = (map_base_segment_t *)((char *)segs + elem_size * SYMB_SEG_INDEX); + if (OPAL_LIKELY(map_segment_is_va_in(rseg, va))) { + return rseg; + } + + return NULL; +} + +void mkey_segment_init(mkey_segment_t *seg, sshmem_mkey_t *mkey, uint32_t segno); + + static inline map_segment_t *memheap_find_va(const void* va) { map_segment_t *s; - if (OPAL_LIKELY((uintptr_t)va >= (uintptr_t)memheap_map->mem_segs[HEAP_SEG_INDEX].seg_base_addr && - (uintptr_t)va < (uintptr_t)memheap_map->mem_segs[HEAP_SEG_INDEX].end)) { + /* most probably there will be only two segments: heap and global data */ + if (OPAL_LIKELY(memheap_is_va_in_segment(va, SYMB_SEG_INDEX))) { + s = &memheap_map->mem_segs[SYMB_SEG_INDEX]; + } else if (memheap_is_va_in_segment(va, HEAP_SEG_INDEX)) { s = &memheap_map->mem_segs[HEAP_SEG_INDEX]; - } else { + } else if 
(memheap_map->n_segments - 2 > 0) { s = bsearch(va, - &memheap_map->mem_segs[SYMB_SEG_INDEX], - memheap_map->n_segments - 1, + &memheap_map->mem_segs[SYMB_SEG_INDEX+1], + memheap_map->n_segments - 2, sizeof(*s), mca_memheap_seg_cmp); + } else { + s = NULL; } #if MEMHEAP_BASE_DEBUG == 1 if (s) { MEMHEAP_VERBOSE(5, "match seg#%02ld: 0x%llX - 0x%llX %llu bytes va=%p", s - memheap_map->mem_segs, - (long long)s->seg_base_addr, - (long long)s->end, - (long long)(s->end - s->seg_base_addr), + (long long)s->super.va_base, + (long long)s->super.va_end, + (long long)(s->super.va_end - s->super.va_base), (void *)va); } #endif return s; } -static inline void* memheap_va2rva(void* va, void* local_base, void* remote_base) -{ - return (void*) (remote_base > local_base ? - (uintptr_t)va + ((uintptr_t)remote_base - (uintptr_t)local_base) : - (uintptr_t)va - ((uintptr_t)local_base - (uintptr_t)remote_base)); -} - static inline sshmem_mkey_t *mca_memheap_base_get_cached_mkey(int pe, void* va, int btl_id, @@ -218,7 +278,7 @@ static inline sshmem_mkey_t *mca_memheap_base_get_cached_mkey(int pe, if (OPAL_LIKELY(s->mkeys_cache[pe])) { mkey = &s->mkeys_cache[pe][btl_id]; - *rva = memheap_va2rva(va, s->seg_base_addr, mkey->va_base); + *rva = memheap_va2rva(va, s->super.va_base, mkey->va_base); MEMHEAP_VERBOSE_FASTPATH(10, "rkey: pe=%d va=%p -> (cached) %lx %p", pe, (void *)va, mkey->u.key, (void *)*rva); return mkey; } @@ -231,6 +291,12 @@ static inline int mca_memheap_base_num_transports(void) return memheap_map->num_transports; } +static inline void* mca_memheap_seg2base_va(int seg) +{ + return memheap_map->mem_segs[seg].super.va_base; +} + + END_C_DECLS #endif /* MCA_MEMHEAP_BASE_H */ diff --git a/oshmem/mca/memheap/base/memheap_base_frame.c b/oshmem/mca/memheap/base/memheap_base_frame.c index dc9c51b25bc..578b4eda721 100644 --- a/oshmem/mca/memheap/base/memheap_base_frame.c +++ b/oshmem/mca/memheap/base/memheap_base_frame.c @@ -38,7 +38,7 @@ char* mca_memheap_base_exclude = NULL; 
opal_list_t mca_memheap_base_components_opened = {{0}}; struct mca_memheap_base_module_t* mca_memheap_base_module_initialized = NULL; int mca_memheap_base_already_opened = 0; -mca_memheap_map_t mca_memheap_base_map = {{{0}}}; +mca_memheap_map_t mca_memheap_base_map; static int mca_memheap_base_register(mca_base_register_flag_t flags) { diff --git a/oshmem/mca/memheap/base/memheap_base_mkey.c b/oshmem/mca/memheap/base/memheap_base_mkey.c index 09fe07965de..d072ee053e1 100644 --- a/oshmem/mca/memheap/base/memheap_base_mkey.c +++ b/oshmem/mca/memheap/base/memheap_base_mkey.c @@ -49,6 +49,7 @@ typedef struct oob_comm_request { struct oob_comm { opal_mutex_t lck; opal_condition_t cond; + uint32_t segno; sshmem_mkey_t *mkeys; int mkeys_rcvd; oob_comm_request_t req_pool[MEMHEAP_RECV_REQS_MAX]; @@ -69,54 +70,30 @@ static int memheap_oob_get_mkeys(int pe, uint32_t va_seg_num, sshmem_mkey_t *mkey); -static inline void* mca_memheap_seg2base_va(int seg) -{ - return memheap_map->mem_segs[seg].seg_base_addr; -} - int mca_memheap_seg_cmp(const void *k, const void *v) { uintptr_t va = (uintptr_t) k; map_segment_t *s = (map_segment_t *) v; - if (va < (uintptr_t)s->seg_base_addr) + if (va < (uintptr_t)s->super.va_base) return -1; - if (va >= (uintptr_t)s->end) + if (va >= (uintptr_t)s->super.va_end) return 1; return 0; } -/** - * @param all_trs - * 0 - pack mkeys for transports to given pe - * 1 - pack mkeys for ALL possible transports. 
value of pe is ignored - */ -static int pack_local_mkeys(opal_buffer_t *msg, int pe, int seg, int all_trs) +static int pack_local_mkeys(opal_buffer_t *msg, int pe, int seg) { - ompi_proc_t *proc; int i, n, tr_id; sshmem_mkey_t *mkey; - /* go over all transports to remote pe and pack mkeys */ - if (!all_trs) { - n = oshmem_get_transport_count(pe); - proc = oshmem_proc_group_find(oshmem_group_all, pe); - } - else { - proc = NULL; - n = memheap_map->num_transports; - } - + /* go over all transports and pack mkeys */ + n = memheap_map->num_transports; opal_dss.pack(msg, &n, 1, OPAL_UINT32); MEMHEAP_VERBOSE(5, "found %d transports to %d", n, pe); for (i = 0; i < n; i++) { - if (!all_trs) { - tr_id = OSHMEM_PROC_DATA(proc)->transport_ids[i]; - } - else { - tr_id = i; - } + tr_id = i; mkey = mca_memheap_base_get_mkey(mca_memheap_seg2base_va(seg), tr_id); if (!mkey) { MEMHEAP_ERROR("seg#%d tr_id: %d failed to find local mkey", @@ -203,10 +180,10 @@ static void unpack_remote_mkeys(opal_buffer_t *msg, int remote_pe) } cnt = memheap_oob.mkeys[tr_id].len; opal_dss.unpack(msg, memheap_oob.mkeys[tr_id].u.data, &cnt, OPAL_BYTE); - MCA_SPML_CALL(rmkey_unpack(&memheap_oob.mkeys[tr_id], remote_pe)); } else { memheap_oob.mkeys[tr_id].u.key = MAP_SEGMENT_SHM_INVALID; } + MCA_SPML_CALL(rmkey_unpack(&memheap_oob.mkeys[tr_id], memheap_oob.segno, remote_pe, tr_id)); } MEMHEAP_VERBOSE(5, @@ -250,7 +227,7 @@ static void do_recv(int source_pe, opal_buffer_t* buffer) msg_type = MEMHEAP_RKEY_RESP; opal_dss.pack(msg, &msg_type, 1, OPAL_UINT8); - if (OSHMEM_SUCCESS != pack_local_mkeys(msg, source_pe, seg, 0)) { + if (OSHMEM_SUCCESS != pack_local_mkeys(msg, source_pe, seg)) { OBJ_RELEASE(msg); goto send_fail; } @@ -488,12 +465,11 @@ static int memheap_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) if (OSHMEM_SUCCESS == MCA_SPML_CALL(oob_get_mkeys(pe, seg, mkeys))) { for (i = 0; i < memheap_map->num_transports; i++) { - mkeys[i].va_base = mca_memheap_seg2base_va(seg); MEMHEAP_VERBOSE(5, 
"MKEY CALCULATED BY LOCAL SPML: pe: %d tr_id: %d %s", pe, i, mca_spml_base_mkey2str(&mkeys[i])); } return OSHMEM_SUCCESS; } @@ -501,6 +484,7 @@ static int memheap_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) OPAL_THREAD_LOCK(&memheap_oob.lck); memheap_oob.mkeys = mkeys; + memheap_oob.segno = seg; memheap_oob.mkeys_rcvd = 0; msg = OBJ_NEW(opal_buffer_t); @@ -592,7 +576,7 @@ void mca_memheap_modex_recv_all(void) } for (j = 0; j < memheap_map->n_segments; j++) { - pack_local_mkeys(msg, 0, j, 1); + pack_local_mkeys(msg, 0, j); } /* we assume here that int32_t returned by opal_dss.unload @@ -668,6 +652,7 @@ void mca_memheap_modex_recv_all(void) } } memheap_oob.mkeys = s->mkeys_cache[i]; + memheap_oob.segno = j; unpack_remote_mkeys(msg, i); } } @@ -722,7 +707,7 @@ sshmem_mkey_t * mca_memheap_base_get_cached_mkey_slow(map_segment_t *s, return NULL ; mkey = &s->mkeys_cache[pe][btl_id]; - *rva = memheap_va2rva(va, s->seg_base_addr, mkey->va_base); + *rva = memheap_va2rva(va, s->super.va_base, mkey->va_base); MEMHEAP_VERBOSE_FASTPATH(5, "rkey: pe=%d va=%p -> (remote lookup) %lx %p", pe, (void *)va, mkey->u.key, (void *)*rva); return mkey; @@ -748,7 +733,7 @@ uint64_t mca_memheap_base_find_offset(int pe, s = memheap_find_va(va); if (my_pe == pe) { - return (uintptr_t)va - (uintptr_t)s->seg_base_addr; + return (uintptr_t)va - (uintptr_t)s->super.va_base; } else { return ((s && MAP_SEGMENT_IS_VALID(s)) ? 
((uintptr_t)rva - (uintptr_t)(s->mkeys_cache[pe][tr_id].va_base)) : 0); @@ -770,14 +755,31 @@ int mca_memheap_base_detect_addr_type(void* va) if (s) { if (s->type == MAP_SEGMENT_STATIC) { addr_type = ADDR_STATIC; - } else if ((uintptr_t)va >= (uintptr_t) s->seg_base_addr - && (uintptr_t)va < (uintptr_t) ((uintptr_t)s->seg_base_addr + mca_memheap.memheap_size)) { + } else if ((uintptr_t)va >= (uintptr_t) s->super.va_base + && (uintptr_t)va < (uintptr_t) ((uintptr_t)s->super.va_base + mca_memheap.memheap_size)) { addr_type = ADDR_USER; } else { - assert( (uintptr_t)va >= (uintptr_t) ((uintptr_t)s->seg_base_addr + mca_memheap.memheap_size) && (uintptr_t)va < (uintptr_t)s->end); + assert( (uintptr_t)va >= (uintptr_t) ((uintptr_t)s->super.va_base + mca_memheap.memheap_size) && (uintptr_t)va < (uintptr_t)s->super.va_end); addr_type = ADDR_PRIVATE; } } return addr_type; } + +void mkey_segment_init(mkey_segment_t *seg, sshmem_mkey_t *mkey, uint32_t segno) +{ + map_segment_t *s; + + if (segno >= MCA_MEMHEAP_SEG_COUNT) { + return; + } + + s = memheap_find_seg(segno); + assert(NULL != s); + + seg->super.va_base = s->super.va_base; + seg->super.va_end = s->super.va_end; + seg->rva_base = mkey->va_base; +} + diff --git a/oshmem/mca/memheap/base/memheap_base_register.c b/oshmem/mca/memheap/base/memheap_base_register.c index 18da1790f50..ea742b2eb5e 100644 --- a/oshmem/mca/memheap/base/memheap_base_register.c +++ b/oshmem/mca/memheap/base/memheap_base_register.c @@ -32,9 +32,9 @@ int mca_memheap_base_reg(mca_memheap_map_t *memheap_map) MEMHEAP_VERBOSE(5, "register seg#%02d: 0x%p - 0x%p %llu bytes type=0x%X id=0x%X", i, - s->seg_base_addr, - s->end, - (long long)((uintptr_t)s->end - (uintptr_t)s->seg_base_addr), + s->super.va_base, + s->super.va_end, + (long long)((uintptr_t)s->super.va_end - (uintptr_t)s->super.va_base), s->type, s->seg_id); ret = _reg_segment(s, &memheap_map->num_transports); @@ -60,9 +60,9 @@ int mca_memheap_base_dereg(mca_memheap_map_t *memheap_map) 
MEMHEAP_VERBOSE(5, "deregistering segment#%d: %p - %p %llu bytes", i, - s->seg_base_addr, - s->end, - (long long)((uintptr_t)s->end - (uintptr_t)s->seg_base_addr)); + s->super.va_base, + s->super.va_end, + (long long)((uintptr_t)s->super.va_end - (uintptr_t)s->super.va_base)); (void)_dereg_segment(s); } @@ -120,8 +120,8 @@ static int _reg_segment(map_segment_t *s, int *num_btl) } if (!rc) { - s->mkeys = MCA_SPML_CALL(register((void *)(unsigned long)s->seg_base_addr, - (uintptr_t)s->end - (uintptr_t)s->seg_base_addr, + s->mkeys = MCA_SPML_CALL(register((void *)(unsigned long)s->super.va_base, + (uintptr_t)s->super.va_end - (uintptr_t)s->super.va_base, s->seg_id, num_btl)); if (NULL == s->mkeys) { diff --git a/oshmem/mca/memheap/base/memheap_base_select.c b/oshmem/mca/memheap/base/memheap_base_select.c index 95e5eb01f74..b1a52e7a7bb 100644 --- a/oshmem/mca/memheap/base/memheap_base_select.c +++ b/oshmem/mca/memheap/base/memheap_base_select.c @@ -218,10 +218,10 @@ static memheap_context_t* _memheap_create(void) context.user_size = user_size; context.private_size = MEMHEAP_BASE_PRIVATE_SIZE; context.user_base_addr = - (void*) ((unsigned char*) mca_memheap_base_map.mem_segs[HEAP_SEG_INDEX].seg_base_addr + (void*) ((unsigned char*) mca_memheap_base_map.mem_segs[HEAP_SEG_INDEX].super.va_base + 0); context.private_base_addr = - (void*) ((unsigned char*) mca_memheap_base_map.mem_segs[HEAP_SEG_INDEX].seg_base_addr + (void*) ((unsigned char*) mca_memheap_base_map.mem_segs[HEAP_SEG_INDEX].super.va_base + context.user_size); } diff --git a/oshmem/mca/memheap/base/memheap_base_static.c b/oshmem/mca/memheap/base/memheap_base_static.c index ff0a43b7be4..edbb11aa310 100644 --- a/oshmem/mca/memheap/base/memheap_base_static.c +++ b/oshmem/mca/memheap/base/memheap_base_static.c @@ -63,13 +63,13 @@ int mca_memheap_base_static_init(mca_memheap_map_t *map) memset(s, 0, sizeof(*s)); MAP_SEGMENT_RESET_FLAGS(s); s->seg_id = MAP_SEGMENT_SHM_INVALID; - s->seg_base_addr = 
memheap_context.mem_segs[i].start; - s->end = memheap_context.mem_segs[i].end; - s->seg_size = ((uintptr_t)s->end - (uintptr_t)s->seg_base_addr); + s->super.va_base = memheap_context.mem_segs[i].start; + s->super.va_end = memheap_context.mem_segs[i].end; + s->seg_size = ((uintptr_t)s->super.va_end - (uintptr_t)s->super.va_base); s->type = MAP_SEGMENT_STATIC; map->n_segments++; - total_mem += ((uintptr_t)s->end - (uintptr_t)s->seg_base_addr); + total_mem += ((uintptr_t)s->super.va_end - (uintptr_t)s->super.va_base); } MEMHEAP_VERBOSE(1, "Memheap static memory: %llu byte(s), %d segments", diff --git a/oshmem/mca/spml/base/base.h b/oshmem/mca/spml/base/base.h index b53894b21c7..a0fd613d4db 100644 --- a/oshmem/mca/spml/base/base.h +++ b/oshmem/mca/spml/base/base.h @@ -71,7 +71,7 @@ OSHMEM_DECLSPEC int mca_spml_base_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys); -OSHMEM_DECLSPEC void mca_spml_base_rmkey_unpack(sshmem_mkey_t *mkey, int pe); +OSHMEM_DECLSPEC void mca_spml_base_rmkey_unpack(sshmem_mkey_t *mkey, uint32_t seg, int pe, int tr_id); OSHMEM_DECLSPEC void mca_spml_base_rmkey_free(sshmem_mkey_t *mkey); OSHMEM_DECLSPEC int mca_spml_base_put_nb(void *dst_addr, size_t size, diff --git a/oshmem/mca/spml/base/spml_base.c b/oshmem/mca/spml/base/spml_base.c index f43db019b8c..bdaf013438c 100644 --- a/oshmem/mca/spml/base/spml_base.c +++ b/oshmem/mca/spml/base/spml_base.c @@ -153,12 +153,12 @@ int mca_spml_base_wait_nb(void* handle) return OSHMEM_SUCCESS; } -int mca_spml_base_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) +int mca_spml_base_oob_get_mkeys(int pe, uint32_t segno, sshmem_mkey_t *mkeys) { return OSHMEM_ERROR; } -void mca_spml_base_rmkey_unpack(sshmem_mkey_t *mkey, int pe) +void mca_spml_base_rmkey_unpack(sshmem_mkey_t *mkey, uint32_t segno, int pe, int tr_id) { } diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.c b/oshmem/mca/spml/ikrit/spml_ikrit.c index c8409979ef7..909b9137234 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit.c +++ 
b/oshmem/mca/spml/ikrit/spml_ikrit.c @@ -34,6 +34,7 @@ #include "oshmem/mca/spml/base/spml_base_putreq.h" #include "oshmem/runtime/runtime.h" #include "orte/util/show_help.h" +#include "oshmem/mca/sshmem/sshmem.h" #include "oshmem/mca/spml/ikrit/spml_ikrit_component.h" @@ -68,6 +69,12 @@ struct mca_spml_ikrit_put_request { typedef struct mca_spml_ikrit_put_request mca_spml_ikrit_put_request_t; + +static inline int get_ptl_id(int dst) +{ + return mca_spml_ikrit.mxm_peers[dst].ptl_id; +} + static inline mxm_mem_key_t *to_mxm_mkey(sshmem_mkey_t *mkey) { if (0 == mkey->len) { @@ -142,6 +149,8 @@ int mca_spml_ikrit_put_simple(void* dst_addr, void* src_addr, int dst); +static void mca_spml_ikrit_cache_mkeys(sshmem_mkey_t *, uint32_t seg, int remote_pe, int tr_id); + mca_spml_ikrit_t mca_spml_ikrit = { { /* Init mca_spml_base_module_t */ @@ -160,13 +169,71 @@ mca_spml_ikrit_t mca_spml_ikrit = { mca_spml_base_wait, mca_spml_base_wait_nb, mca_spml_ikrit_fence, - mca_spml_base_rmkey_unpack, + mca_spml_ikrit_cache_mkeys, mca_spml_base_rmkey_free, (void*)&mca_spml_ikrit } }; +/* Cache the MXM_PTL_RDMA key of segment seg for dst_pe in the per-peer + * mkeys[] table read by mca_spml_ikrit_get_mkey(); other transports are + * ignored. 
*/ +static void mca_spml_ikrit_cache_mkeys(sshmem_mkey_t *mkey, uint32_t seg, int dst_pe, int tr_id) +{ + mxm_peer_t *peer; + + if (MXM_PTL_RDMA != tr_id) { + return; + } + + /* peer->mkeys[] has only MCA_MEMHEAP_SEG_COUNT slots; any other segment + * number (including MEMHEAP_SEG_INVALID) must not be indexed */ + if (seg >= MCA_MEMHEAP_SEG_COUNT) { + return; + } + + peer = &mca_spml_ikrit.mxm_peers[dst_pe]; + mkey_segment_init(&peer->mkeys[seg].super, mkey, seg); + + if (0 != mkey->len) { + memcpy(&peer->mkeys[seg].key, mkey->u.data, mkey->len); + } else { + memcpy(&peer->mkeys[seg].key, &mxm_empty_mem_key, sizeof(mxm_empty_mem_key)); + } +} + +mxm_mem_key_t *mca_spml_ikrit_get_mkey_slow(int pe, void *va, int ptl_id, void **rva) +{ + sshmem_mkey_t *mkey; + +retry: + mkey = mca_memheap_base_get_cached_mkey(pe, va, ptl_id, rva); + if (NULL == mkey) { + SPML_ERROR("pe=%d: %p is not address of shared variable", pe, va); + oshmem_shmem_abort(-1); + return NULL; + } + + if (MXM_PTL_SHM == ptl_id) { + if (mca_memheap_base_can_local_copy(mkey, va)) { + return NULL; + } + + /* if dst addr is on memheap and 
local copy is not allowed + * disable direct shm transport + */ + if (memheap_is_va_in_segment(va, HEAP_SEG_INDEX)) { + mca_spml_ikrit.mxm_peers[pe].ptl_id = MXM_PTL_RDMA; + } + /* going via mxm must always work */ + ptl_id = MXM_PTL_RDMA; + goto retry; + } + + return to_mxm_mkey(mkey); +} + int mca_spml_ikrit_enable(bool enable) { SPML_VERBOSE(50, "*** ikrit ENABLED ****"); @@ -246,7 +304,7 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) { spml_ikrit_mxm_ep_conn_info_t *ep_info = NULL; spml_ikrit_mxm_ep_conn_info_t *ep_hw_rdma_info = NULL; - spml_ikrit_mxm_ep_conn_info_t my_ep_info = {{0}}; + spml_ikrit_mxm_ep_conn_info_t my_ep_info; size_t mxm_addr_len = MXM_MAX_ADDR_LEN; mxm_error_t err; size_t i, n; @@ -276,6 +334,8 @@ int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs) goto bail; } + memset(&my_ep_info, 0, sizeof(my_ep_info)); + if (mca_spml_ikrit.hw_rdma_channel) { err = mxm_ep_get_address(mca_spml_ikrit.mxm_hw_rdma_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len); if (MXM_OK != err) { @@ -373,6 +433,7 @@ sshmem_mkey_t *mca_spml_ikrit_register(void* addr, sshmem_mkey_t *mkeys; mxm_error_t err; mxm_mem_key_t *m_key; + int my_rank = oshmem_my_proc_id(); *count = 0; mkeys = (sshmem_mkey_t *) calloc(1, MXM_PTL_LAST * sizeof(*mkeys)); @@ -430,9 +491,10 @@ sshmem_mkey_t *mca_spml_ikrit_register(void* addr, } SPML_VERBOSE(5, "rank %d ptl %d addr %p size %llu %s", - oshmem_proc_pe(oshmem_proc_local()), i, addr, (unsigned long long)size, + my_rank, i, addr, (unsigned long long)size, mca_spml_base_mkey2str(&mkeys[i])); + mca_spml_ikrit_cache_mkeys(&mkeys[i], memheap_find_segnum(addr), my_rank, i); } *count = MXM_PTL_LAST; @@ -476,14 +538,10 @@ int mca_spml_ikrit_deregister(sshmem_mkey_t *mkeys) } -static inline int get_ptl_id(int dst) -{ - return mca_spml_ikrit.mxm_peers[dst].ptl_id; -} - int mca_spml_ikrit_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) { int ptl; + ptl = get_ptl_id(pe); if (ptl < 0) return OSHMEM_ERROR; @@ 
-495,8 +553,11 @@ int mca_spml_ikrit_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) * So can only skip mkey exchange when ud is the only transport */ if (mca_spml_ikrit.ud_only) { - mkeys[ptl].len = 0; - mkeys[ptl].u.key = MAP_SEGMENT_SHM_INVALID; + /* assumes that remote has the same va_base as we do */ + mkeys[ptl].len = 0; + mkeys[ptl].va_base = mca_memheap_seg2base_va(seg); + mkeys[ptl].u.key = MAP_SEGMENT_SHM_INVALID; + mca_spml_ikrit_cache_mkeys(&mkeys[ptl], seg, pe, ptl); return OSHMEM_SUCCESS; } @@ -512,24 +573,13 @@ static inline int mca_spml_ikrit_get_helper(mxm_send_req_t *sreq, /* shmem spec states that get() operations are blocking. So it is enough to have single mxm request. Also we count on mxm doing copy */ void *rva; - sshmem_mkey_t *r_mkey; + mxm_mem_key_t *mkey; - /* already tried to send via shm and failed. go via rdma */ - /** - * Get the address to the remote rkey. - **/ - r_mkey = mca_memheap_base_get_cached_mkey(src, src_addr, MXM_PTL_RDMA, &rva); - if (!r_mkey) { - SPML_ERROR("pe=%d: %p is not address of shared variable", - src, src_addr); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } + mkey = mca_spml_ikrit_get_mkey(src, src_addr, MXM_PTL_RDMA, &rva); SPML_VERBOSE_FASTPATH(100, - "get: pe:%d ptl=%d src=%p -> dst: %p sz=%d. src_rva=%p, %s", - src, MXM_PTL_RDMA, src_addr, dst_addr, (int)size, (void *)rva, - mca_spml_base_mkey2str(r_mkey)); + "get: pe:%d ptl=%d src=%p -> dst: %p sz=%d. 
src_rva=%p", + src, MXM_PTL_RDMA, src_addr, dst_addr, (int)size, (void *)rva); /* mxm does not really cares for get lkey */ sreq->base.mq = mca_spml_ikrit.mxm_mq; @@ -537,7 +587,7 @@ static inline int mca_spml_ikrit_get_helper(mxm_send_req_t *sreq, sreq->base.data_type = MXM_REQ_DATA_BUFFER; sreq->base.data.buffer.ptr = dst_addr; sreq->base.data.buffer.length = size; - sreq->op.mem.remote_mkey = to_mxm_mkey(r_mkey); + sreq->op.mem.remote_mkey = mkey; sreq->opcode = MXM_REQ_OP_GET; sreq->op.mem.remote_vaddr = (intptr_t) rva; sreq->base.state = MXM_REQ_NEW; @@ -552,7 +602,6 @@ static inline int mca_spml_ikrit_get_shm(void *src_addr, { int ptl_id; void *rva; - sshmem_mkey_t *r_mkey; ptl_id = get_ptl_id(src); /** @@ -561,20 +610,12 @@ static inline int mca_spml_ikrit_get_shm(void *src_addr, if (ptl_id != MXM_PTL_SHM) return OSHMEM_ERROR; - r_mkey = mca_memheap_base_get_cached_mkey(src, src_addr, ptl_id, &rva); - if (!r_mkey) { - SPML_ERROR("pe=%d: %p is not address of shared variable", - src, src_addr); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } - - if (!mca_memheap_base_can_local_copy(r_mkey, src_addr)) + if (NULL != mca_spml_ikrit_get_mkey(src, src_addr, MXM_PTL_SHM, &rva)) return OSHMEM_ERROR; SPML_VERBOSE_FASTPATH(100, - "shm get: pe:%d src=%p -> dst: %p sz=%d. src_rva=%p, %s", - src, src_addr, dst_addr, (int)size, (void *)rva, mca_spml_base_mkey2str(r_mkey)); + "shm get: pe:%d src=%p -> dst: %p sz=%d. 
src_rva=%p", + src, src_addr, dst_addr, (int)size, (void *)rva); memcpy(dst_addr, (void *) (unsigned long) rva, size); opal_progress(); @@ -746,49 +787,27 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, void *rva; mca_spml_ikrit_put_request_t *put_req; int ptl_id; - sshmem_mkey_t *r_mkey; static int count; int need_progress = 0; + mxm_mem_key_t *mkey; - if (0 >= size) { + if (OPAL_UNLIKELY(0 >= size)) { return OSHMEM_SUCCESS; } ptl_id = get_ptl_id(dst); - /* Get rkey of remote PE (dst proc) which must be on memheap */ - r_mkey = mca_memheap_base_get_cached_mkey(dst, dst_addr, ptl_id, &rva); - if (!r_mkey) { - SPML_ERROR("pe=%d: %p is not address of shared variable", - dst, dst_addr); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } + mkey = mca_spml_ikrit_get_mkey(dst, dst_addr, ptl_id, &rva); - SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. dst_rva=%p, %s", - dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, - mca_spml_base_mkey2str(r_mkey)); - - if (OPAL_UNLIKELY(MXM_PTL_SHM == ptl_id)) { - if (mca_memheap_base_can_local_copy(r_mkey, dst_addr)) { - memcpy((void *) (unsigned long) rva, src_addr, size); - /* call progress as often as we would have with regular put */ - if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0) - mxm_progress(mca_spml_ikrit.mxm_context); - return OSHMEM_SUCCESS; - } - /* segment not mapped - fallback to rdma */ - r_mkey = mca_memheap_base_get_cached_mkey(dst, dst_addr, MXM_PTL_RDMA, &rva); - if (!r_mkey) { - SPML_ERROR("pe=%d: %p is not address of shared variable", - dst, dst_addr); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } + if (OPAL_UNLIKELY(NULL == mkey)) { + memcpy((void *) (unsigned long) rva, src_addr, size); + /* call progress as often as we would have with regular put */ + if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0) + mxm_progress(mca_spml_ikrit.mxm_context); + return OSHMEM_SUCCESS; } SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. 
dst_rva=%p, %s", - dst, MXM_PTL_RDMA, dst_addr, src_addr, (int)size, (void *)rva, - mca_spml_base_mkey2str(r_mkey)); + dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva); put_req = alloc_put_req(); @@ -836,7 +855,7 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, put_req->mxm_req.base.state = MXM_REQ_NEW; put_req->pe = dst; - put_req->mxm_req.op.mem.remote_mkey = to_mxm_mkey(r_mkey); + put_req->mxm_req.op.mem.remote_mkey = mkey; OPAL_THREAD_ADD32(&mca_spml_ikrit.n_active_puts, 1); if (mca_spml_ikrit.mxm_peers[dst].need_fence == 0) { @@ -870,48 +889,25 @@ int mca_spml_ikrit_put_simple(void* dst_addr, mxm_send_req_t mxm_req; mxm_wait_t wait; int ptl_id; - sshmem_mkey_t *r_mkey; + mxm_mem_key_t *mkey; static int count; ptl_id = get_ptl_id(dst); - /* Get rkey of remote PE (dst proc) which must be on memheap */ - r_mkey = mca_memheap_base_get_cached_mkey(dst, dst_addr, ptl_id, &rva); - if (!r_mkey) { - SPML_ERROR("pe=%d: %p is not address of shared variable", - dst, dst_addr); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } + mkey = mca_spml_ikrit_get_mkey(dst, dst_addr, ptl_id, &rva); SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. 
dst_rva=%p, %s", - dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva, - mca_spml_base_mkey2str(r_mkey)); + dst, ptl_id, dst_addr, src_addr, (int)size, (void *)rva); - if (MXM_PTL_SHM == ptl_id) { - if (mca_memheap_base_can_local_copy(r_mkey, dst_addr)) { - memcpy((void *) (unsigned long) rva, src_addr, size); - /* call progress as often as we would have with regular put */ - if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0) - mxm_progress(mca_spml_ikrit.mxm_context); - return OSHMEM_SUCCESS; - } - /* segment not mapped - fallback to rmda */ - r_mkey = mca_memheap_base_get_cached_mkey(dst, - //(unsigned long) dst_addr, - dst_addr, - MXM_PTL_RDMA, - &rva); - if (!r_mkey) { - SPML_ERROR("pe=%d: %p is not address of shared variable", - dst, dst_addr); - oshmem_shmem_abort(-1); - return OSHMEM_ERROR; - } + if (NULL == mkey) { + memcpy((void *) (unsigned long) rva, src_addr, size); + /* call progress as often as we would have with regular put */ + if (++count % SPML_IKRIT_PACKETS_PER_SYNC == 0) + mxm_progress(mca_spml_ikrit.mxm_context); + return OSHMEM_SUCCESS; } SPML_VERBOSE_FASTPATH(100, "put: pe:%d ptl=%d dst=%p <- src: %p sz=%d. 
dst_rva=%p, %s", - dst, MXM_PTL_RDMA, dst_addr, src_addr, (int)size, (void *)rva, - mca_spml_base_mkey2str(r_mkey)); + dst, MXM_PTL_RDMA, dst_addr, src_addr, (int)size, (void *)rva); /* fill out request */ mxm_req.base.mq = mca_spml_ikrit.mxm_mq; @@ -927,7 +923,7 @@ int mca_spml_ikrit_put_simple(void* dst_addr, mxm_req.base.state = MXM_REQ_NEW; mxm_req.base.error = MXM_OK; - mxm_req.op.mem.remote_mkey = to_mxm_mkey(r_mkey); + mxm_req.op.mem.remote_mkey = mkey; if (mca_spml_ikrit.mxm_peers[dst].need_fence == 0) { opal_list_append(&mca_spml_ikrit.active_peers, diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.h b/oshmem/mca/spml/ikrit/spml_ikrit.h index 1a42b91e723..c6810dbce51 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit.h +++ b/oshmem/mca/spml/ikrit/spml_ikrit.h @@ -33,6 +33,7 @@ #include "opal/class/opal_list.h" #include "orte/runtime/orte_globals.h" +#include "oshmem/mca/memheap/base/base.h" #include @@ -61,13 +62,21 @@ BEGIN_C_DECLS /** * MXM SPML module */ +/* TODO: move va_xx to base struct */ +struct spml_ikrit_mkey { + mkey_segment_t super; + mxm_mem_key_t key; +}; +typedef struct spml_ikrit_mkey spml_ikrit_mkey_t; + struct mxm_peer { mxm_conn_h mxm_conn; mxm_conn_h mxm_hw_rdma_conn; uint8_t ptl_id; - opal_list_item_t link; - int32_t n_active_puts; uint8_t need_fence; + int32_t n_active_puts; + opal_list_item_t link; + spml_ikrit_mkey_t mkeys[MCA_MEMHEAP_SEG_COUNT]; }; typedef struct mxm_peer mxm_peer_t; @@ -156,7 +165,7 @@ extern sshmem_mkey_t *mca_spml_ikrit_register(void* addr, int *count); extern int mca_spml_ikrit_deregister(sshmem_mkey_t *mkeys); extern int mca_spml_ikrit_oob_get_mkeys(int pe, - uint32_t seg, + uint32_t segno, sshmem_mkey_t *mkeys); extern int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs); @@ -164,6 +173,30 @@ extern int mca_spml_ikrit_del_procs(ompi_proc_t** procs, size_t nprocs); extern int mca_spml_ikrit_fence(void); extern int spml_ikrit_progress(void); +mxm_mem_key_t *mca_spml_ikrit_get_mkey_slow(int pe, void *va, int 
ptl_id, void **rva); + +/* the function returns NULL if data can be directly copied via shared memory + * else it returns mxm mem key + * + * the function will abort() if va is not symmetric var address. + */ +static inline mxm_mem_key_t *mca_spml_ikrit_get_mkey(int pe, void *va, int ptl_id, void **rva) +{ + spml_ikrit_mkey_t *mkey; + + if (OPAL_UNLIKELY(MXM_PTL_RDMA != ptl_id)) { + return mca_spml_ikrit_get_mkey_slow(pe, va, ptl_id, rva); + } + + mkey = mca_spml_ikrit.mxm_peers[pe].mkeys; + mkey = (spml_ikrit_mkey_t *)map_segment_find_va(&mkey->super.super, sizeof(*mkey), va); + if (OPAL_UNLIKELY(NULL == mkey)) { + return mca_spml_ikrit_get_mkey_slow(pe, va, ptl_id, rva); + } + *rva = map_segment_va2rva(&mkey->super, va); + return &mkey->key; +} + END_C_DECLS #endif diff --git a/oshmem/mca/spml/spml.h b/oshmem/mca/spml/spml.h index ffcc61f411f..f081b8b7b18 100644 --- a/oshmem/mca/spml/spml.h +++ b/oshmem/mca/spml/spml.h @@ -118,7 +118,7 @@ typedef int (*mca_spml_base_module_wait_fn_t)(void* addr, * * @param mkey remote mkey */ -typedef void (*mca_spml_base_module_mkey_unpack_fn_t)(sshmem_mkey_t *, int remote_pe); +typedef void (*mca_spml_base_module_mkey_unpack_fn_t)(sshmem_mkey_t *, uint32_t segno, int remote_pe, int tr_id); /** * free resources used by deserialized remote mkey @@ -149,9 +149,9 @@ typedef int (*mca_spml_base_module_deregister_fn_t)(sshmem_mkey_t *mkeys); /** * try to fill up mkeys that can be used to reach remote pe. 
- * @param pe remote pe + * @param pe remote pe * @param seg 0 - symmetric heap, 1 - static data, everything else are static data in .so - * @param mkeys mkeys array + * @param mkeys mkeys array * * @return OSHMEM_SUCCSESS if keys are found */ diff --git a/oshmem/mca/spml/ucx/spml_ucx.c b/oshmem/mca/spml/ucx/spml_ucx.c index 008e0ed779c..3f49a5ea9a6 100644 --- a/oshmem/mca/spml/ucx/spml_ucx.c +++ b/oshmem/mca/spml/ucx/spml_ucx.c @@ -115,7 +115,6 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs) int my_rank = oshmem_my_proc_id(); size_t num_reqs, max_reqs; void *dreq, **dreqs; - ompi_proc_t *proc; ucp_ep_h ep; size_t i, n; @@ -157,7 +156,7 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs) mca_spml_ucx.ucp_peers[n].ucp_conn = NULL; - if (num_reqs >= mca_spml_ucx.num_disconnect) { + if ((int)num_reqs >= mca_spml_ucx.num_disconnect) { mca_spml_ucx_waitall(dreqs, &num_reqs); } } @@ -322,6 +321,21 @@ int mca_spml_ucx_add_procs(ompi_proc_t** procs, size_t nprocs) } + +spml_ucx_mkey_t * mca_spml_ucx_get_mkey_slow(int pe, void *va, void **rva) +{ + sshmem_mkey_t *r_mkey; + + r_mkey = mca_memheap_base_get_cached_mkey(pe, va, 0, rva); + if (OPAL_UNLIKELY(!r_mkey)) { + SPML_ERROR("pe=%d: %p is not address of symmetric variable", + pe, va); + oshmem_shmem_abort(-1); + return NULL; + } + return (spml_ucx_mkey_t *)(r_mkey->spml_context); +} + void mca_spml_ucx_rmkey_free(sshmem_mkey_t *mkey) { spml_ucx_mkey_t *ucx_mkey; @@ -331,20 +345,23 @@ void mca_spml_ucx_rmkey_free(sshmem_mkey_t *mkey) } ucx_mkey = (spml_ucx_mkey_t *)(mkey->spml_context); ucp_rkey_destroy(ucx_mkey->rkey); - free(ucx_mkey); } -void mca_spml_ucx_rmkey_unpack(sshmem_mkey_t *mkey, int pe) +static void mca_spml_ucx_cache_mkey(sshmem_mkey_t *mkey, uint32_t segno, int dst_pe) +{ + ucp_peer_t *peer; + + peer = &mca_spml_ucx.ucp_peers[dst_pe]; + mkey_segment_init(&peer->mkeys[segno].super, mkey, segno); +} + +void mca_spml_ucx_rmkey_unpack(sshmem_mkey_t *mkey, uint32_t segno, int pe, 
int tr_id) { spml_ucx_mkey_t *ucx_mkey; ucs_status_t err; - - ucx_mkey = (spml_ucx_mkey_t *)malloc(sizeof(*ucx_mkey)); - if (!ucx_mkey) { - SPML_ERROR("not enough memory to allocate mkey"); - goto error_fatal; - } + ucx_mkey = &mca_spml_ucx.ucp_peers[pe].mkeys[segno].key; + err = ucp_ep_rkey_unpack(mca_spml_ucx.ucp_peers[pe].ucp_conn, mkey->u.data, &ucx_mkey->rkey); @@ -354,6 +371,7 @@ void mca_spml_ucx_rmkey_unpack(sshmem_mkey_t *mkey, int pe) } mkey->spml_context = ucx_mkey; + mca_spml_ucx_cache_mkey(mkey, segno, pe); return; error_fatal: @@ -370,23 +388,23 @@ sshmem_mkey_t *mca_spml_ucx_register(void* addr, ucs_status_t err; spml_ucx_mkey_t *ucx_mkey; size_t len; + int my_pe = oshmem_my_proc_id(); + int seg; *count = 0; mkeys = (sshmem_mkey_t *) calloc(1, sizeof(*mkeys)); if (!mkeys) { - return NULL ; + return NULL; } - ucx_mkey = (spml_ucx_mkey_t *)malloc(sizeof(*ucx_mkey)); - if (!ucx_mkey) { - goto error_out; - } + seg = memheap_find_segnum(addr); + ucx_mkey = &mca_spml_ucx.ucp_peers[my_pe].mkeys[seg].key; mkeys[0].spml_context = ucx_mkey; - err = ucp_mem_map(mca_spml_ucx.ucp_context, - &addr, size, 0, &ucx_mkey->mem_h); + + err = ucp_mem_map(mca_spml_ucx.ucp_context, &addr, size, 0, &ucx_mkey->mem_h); if (UCS_OK != err) { - goto error_out1; + goto error_out; } err = ucp_rkey_pack(mca_spml_ucx.ucp_context, ucx_mkey->mem_h, @@ -412,12 +430,11 @@ sshmem_mkey_t *mca_spml_ucx_register(void* addr, mkeys[0].len = len; mkeys[0].va_base = addr; *count = 1; + mca_spml_ucx_cache_mkey(&mkeys[0], seg, my_pe); return mkeys; error_unmap: ucp_mem_unmap(mca_spml_ucx.ucp_context, ucx_mkey->mem_h); -error_out1: - free(ucx_mkey); error_out: free(mkeys); @@ -442,7 +459,6 @@ int mca_spml_ucx_deregister(sshmem_mkey_t *mkeys) ucp_rkey_buffer_release(mkeys[0].u.data); } - free(ucx_mkey); return OSHMEM_SUCCESS; } diff --git a/oshmem/mca/spml/ucx/spml_ucx.h b/oshmem/mca/spml/ucx/spml_ucx.h index 5e828e06374..a6c47ea3799 100644 --- a/oshmem/mca/spml/ucx/spml_ucx.h +++ 
b/oshmem/mca/spml/ucx/spml_ucx.h @@ -40,10 +40,22 @@ BEGIN_C_DECLS /** * UCX SPML module */ -struct ucp_peer { - ucp_ep_h ucp_conn; +struct spml_ucx_mkey { + ucp_rkey_h rkey; + ucp_mem_h mem_h; +}; +typedef struct spml_ucx_mkey spml_ucx_mkey_t; + +struct spml_ucx_cached_mkey { + mkey_segment_t super; + spml_ucx_mkey_t key; }; +typedef struct spml_ucx_cached_mkey spml_ucx_cached_mkey_t; +struct ucp_peer { + ucp_ep_h ucp_conn; + spml_ucx_cached_mkey_t mkeys[MCA_MEMHEAP_SEG_COUNT]; +}; typedef struct ucp_peer ucp_peer_t; struct mca_spml_ucx { @@ -56,16 +68,8 @@ struct mca_spml_ucx { int priority; /* component priority */ bool enabled; }; - typedef struct mca_spml_ucx mca_spml_ucx_t; -struct spml_ucx_mkey { - ucp_rkey_h rkey; - ucp_mem_h mem_h; -}; - -typedef struct spml_ucx_mkey spml_ucx_mkey_t; - extern mca_spml_ucx_t mca_spml_ucx; @@ -103,7 +107,7 @@ extern sshmem_mkey_t *mca_spml_ucx_register(void* addr, int *count); extern int mca_spml_ucx_deregister(sshmem_mkey_t *mkeys); -extern void mca_spml_ucx_rmkey_unpack(sshmem_mkey_t *mkey, int pe); +extern void mca_spml_ucx_rmkey_unpack(sshmem_mkey_t *mkey, uint32_t segno, int pe, int tr_id); extern void mca_spml_ucx_rmkey_free(sshmem_mkey_t *mkey); extern int mca_spml_ucx_add_procs(ompi_proc_t** procs, size_t nprocs); @@ -113,20 +117,20 @@ extern int mca_spml_ucx_quiet(void); extern int spml_ucx_progress(void); +spml_ucx_mkey_t * mca_spml_ucx_get_mkey_slow(int pe, void *va, void **rva); static inline spml_ucx_mkey_t * mca_spml_ucx_get_mkey(int pe, void *va, void **rva) { - sshmem_mkey_t *r_mkey; - - r_mkey = mca_memheap_base_get_cached_mkey(pe, va, 0, rva); - if (OPAL_UNLIKELY(!r_mkey)) { - SPML_ERROR("pe=%d: %p is not address of symmetric variable", - pe, va); - oshmem_shmem_abort(-1); - return NULL; + spml_ucx_cached_mkey_t *mkey; + + mkey = mca_spml_ucx.ucp_peers[pe].mkeys; + mkey = (spml_ucx_cached_mkey_t *)map_segment_find_va(&mkey->super.super, sizeof(*mkey), va); + if (OPAL_UNLIKELY(NULL == mkey)) { + return 
mca_spml_ucx_get_mkey_slow(pe, va, rva); } - return (spml_ucx_mkey_t *)(r_mkey->spml_context); + *rva = map_segment_va2rva(&mkey->super, va); + return &mkey->key; } static inline int ucx_status_to_oshmem(ucs_status_t status) diff --git a/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c b/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c index 8d5ef386dbd..9b8daca0289 100644 --- a/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c +++ b/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c @@ -118,8 +118,8 @@ shmem_ds_reset(map_segment_t *ds_buf) MAP_SEGMENT_RESET_FLAGS(ds_buf); ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID; - ds_buf->seg_base_addr = 0; - ds_buf->end = 0; + ds_buf->super.va_base = 0; + ds_buf->super.va_end = 0; ds_buf->seg_size = 0; ds_buf->type = MAP_SEGMENT_UNKNOWN; unlink(ds_buf->seg_name); @@ -218,9 +218,9 @@ segment_create(map_segment_t *ds_buf, */ ds_buf->seg_id = oshmem_my_proc_id(); } - ds_buf->seg_base_addr = addr; - ds_buf->seg_size = size; - ds_buf->end = (void*)((uintptr_t)ds_buf->seg_base_addr + ds_buf->seg_size); + ds_buf->super.va_base = addr; + ds_buf->seg_size = size; + ds_buf->super.va_end = (void*)((uintptr_t)ds_buf->super.va_base + ds_buf->seg_size); OPAL_OUTPUT_VERBOSE( (70, oshmem_sshmem_base_framework.framework_output, @@ -229,7 +229,7 @@ segment_create(map_segment_t *ds_buf, mca_sshmem_mmap_component.super.base_version.mca_type_name, mca_sshmem_mmap_component.super.base_version.mca_component_name, (rc ? 
"failure" : "successful"), - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); return rc; @@ -344,7 +344,7 @@ segment_detach(map_segment_t *ds_buf, sshmem_mkey_t *mkey) ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); - munmap((void *)ds_buf->seg_base_addr, ds_buf->seg_size); + munmap((void *)ds_buf->super.va_base, ds_buf->seg_size); /* reset the contents of the map_segment_t associated with this * shared memory segment. diff --git a/oshmem/mca/sshmem/sshmem_types.h b/oshmem/mca/sshmem/sshmem_types.h index a89f3396fe6..3081892e5fb 100644 --- a/oshmem/mca/sshmem/sshmem_types.h +++ b/oshmem/mca/sshmem/sshmem_types.h @@ -96,16 +96,25 @@ typedef struct sshmem_mkey { void *spml_context; /* spml module can attach internal structures here */ } sshmem_mkey_t; -typedef struct map_segment_t { - sshmem_mkey_t **mkeys_cache; /* includes remote segment bases in va_base */ - sshmem_mkey_t *mkeys; /* includes local segment bases in va_base */ - segment_flag_t flags; /* enable/disable flag */ - int seg_id; - void* seg_base_addr; /* base address of the segment */ - void* end; /* final address of the segment */ - char seg_name[OPAL_PATH_MAX]; - size_t seg_size; /* length of the segment */ - segment_type_t type; /* type of the segment */ +typedef struct map_base_segment { + void *va_base; /* base address of the segment */ + void *va_end; /* final address of the segment */ +} map_base_segment_t; + +typedef struct mkey_segment { + map_base_segment_t super; + void *rva_base; /* base va on remote pe */ +} mkey_segment_t; + +typedef struct map_segment { + map_base_segment_t super; + sshmem_mkey_t **mkeys_cache; /* includes remote segment bases in va_base */ + sshmem_mkey_t *mkeys; /* includes local segment bases in va_base */ + segment_flag_t flags; /* enable/disable flag */ + int seg_id; + char 
seg_name[OPAL_PATH_MAX]; + size_t seg_size; /* length of the segment */ + segment_type_t type; /* type of the segment */ } map_segment_t; END_C_DECLS diff --git a/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c b/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c index 737051ea30b..726d9aa42a1 100644 --- a/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c +++ b/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c @@ -114,8 +114,8 @@ shmem_ds_reset(map_segment_t *ds_buf) MAP_SEGMENT_RESET_FLAGS(ds_buf); ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID; - ds_buf->seg_base_addr = 0; - ds_buf->end = 0; + ds_buf->super.va_base = 0; + ds_buf->super.va_end = 0; ds_buf->seg_size = 0; ds_buf->type = MAP_SEGMENT_UNKNOWN; memset(ds_buf->seg_name, '\0', sizeof(ds_buf->seg_name)); @@ -225,9 +225,9 @@ segment_create(map_segment_t *ds_buf, ds_buf->type = MAP_SEGMENT_ALLOC_SHM; ds_buf->seg_id = shmid; - ds_buf->seg_base_addr = addr; + ds_buf->super.va_base = addr; ds_buf->seg_size = size; - ds_buf->end = (void*)((uintptr_t)ds_buf->seg_base_addr + ds_buf->seg_size); + ds_buf->super.va_end = (void*)((uintptr_t)ds_buf->super.va_base + ds_buf->seg_size); OPAL_OUTPUT_VERBOSE( (70, oshmem_sshmem_base_framework.framework_output, @@ -236,7 +236,7 @@ segment_create(map_segment_t *ds_buf, mca_sshmem_sysv_component.super.base_version.mca_type_name, mca_sshmem_sysv_component.super.base_version.mca_component_name, (rc ? 
"failure" : "successful"), - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); return rc; diff --git a/oshmem/mca/sshmem/verbs/sshmem_verbs_module.c b/oshmem/mca/sshmem/verbs/sshmem_verbs_module.c index 8f9ed70feef..53d042fe21a 100644 --- a/oshmem/mca/sshmem/verbs/sshmem_verbs_module.c +++ b/oshmem/mca/sshmem/verbs/sshmem_verbs_module.c @@ -110,8 +110,8 @@ shmem_ds_reset(map_segment_t *ds_buf) MAP_SEGMENT_RESET_FLAGS(ds_buf); ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID; - ds_buf->seg_base_addr = 0; - ds_buf->end = 0; + ds_buf->super.va_base = 0; + ds_buf->super.va_end = 0; ds_buf->seg_size = 0; ds_buf->type = MAP_SEGMENT_UNKNOWN; memset(ds_buf->seg_name, '\0', sizeof(ds_buf->seg_name)); @@ -320,9 +320,9 @@ segment_create(map_segment_t *ds_buf, ds_buf->type = MAP_SEGMENT_ALLOC_IBV_NOSHMR; ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID; } - ds_buf->seg_base_addr = ib_mr->addr; + ds_buf->super.va_base = ib_mr->addr; ds_buf->seg_size = size; - ds_buf->end = (void*)((uintptr_t)ds_buf->seg_base_addr + ds_buf->seg_size); + ds_buf->super.va_end = (void*)((uintptr_t)ds_buf->super.va_base + ds_buf->seg_size); } } From f133d9b6c859218a120d4493e84f93078a9c1e0d Mon Sep 17 00:00:00 2001 From: Alex Mikheev Date: Mon, 7 Nov 2016 14:39:46 +0200 Subject: [PATCH 11/14] oshmem: fixes comiplation errors in sshmem Signed-off-by: Alex Mikheev --- oshmem/mca/sshmem/mmap/sshmem_mmap_module.c | 6 +++--- oshmem/mca/sshmem/sysv/sshmem_sysv_module.c | 4 ++-- oshmem/mca/sshmem/verbs/sshmem_verbs_module.c | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c b/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c index 9b8daca0289..07de1e6d581 100644 --- a/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c +++ b/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c @@ -319,7 +319,7 @@ segment_attach(map_segment_t *ds_buf, 
sshmem_mkey_t *mkey) "(id: %d, addr: %p size: %lu, name: %s | va_base: 0x%p len: %d key %llx)\n", mca_sshmem_mmap_component.super.base_version.mca_type_name, mca_sshmem_mmap_component.super.base_version.mca_component_name, - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name, + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name, mkey->va_base, mkey->len, (unsigned long long)mkey->u.key) ); @@ -341,7 +341,7 @@ segment_detach(map_segment_t *ds_buf, sshmem_mkey_t *mkey) "(id: %d, addr: %p size: %lu, name: %s)\n", mca_sshmem_mmap_component.super.base_version.mca_type_name, mca_sshmem_mmap_component.super.base_version.mca_component_name, - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); munmap((void *)ds_buf->super.va_base, ds_buf->seg_size); @@ -366,7 +366,7 @@ segment_unlink(map_segment_t *ds_buf) "(id: %d, addr: %p size: %lu, name: %s)\n", mca_sshmem_mmap_component.super.base_version.mca_type_name, mca_sshmem_mmap_component.super.base_version.mca_component_name, - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); /* don't completely reset. 
in particular, only reset diff --git a/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c b/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c index 726d9aa42a1..625ef445349 100644 --- a/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c +++ b/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c @@ -264,7 +264,7 @@ segment_attach(map_segment_t *ds_buf, sshmem_mkey_t *mkey) "(id: %d, addr: %p size: %lu, name: %s | va_base: 0x%p len: %d key %llx)\n", mca_sshmem_sysv_component.super.base_version.mca_type_name, mca_sshmem_sysv_component.super.base_version.mca_component_name, - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name, + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name, mkey->va_base, mkey->len, (unsigned long long)mkey->u.key) ); @@ -286,7 +286,7 @@ segment_detach(map_segment_t *ds_buf, sshmem_mkey_t *mkey) "(id: %d, addr: %p size: %lu, name: %s)\n", mca_sshmem_sysv_component.super.base_version.mca_type_name, mca_sshmem_sysv_component.super.base_version.mca_component_name, - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); if (ds_buf->seg_id != MAP_SEGMENT_SHM_INVALID) { diff --git a/oshmem/mca/sshmem/verbs/sshmem_verbs_module.c b/oshmem/mca/sshmem/verbs/sshmem_verbs_module.c index 53d042fe21a..f1828764134 100644 --- a/oshmem/mca/sshmem/verbs/sshmem_verbs_module.c +++ b/oshmem/mca/sshmem/verbs/sshmem_verbs_module.c @@ -333,7 +333,7 @@ segment_create(map_segment_t *ds_buf, mca_sshmem_verbs_component.super.base_version.mca_type_name, mca_sshmem_verbs_component.super.base_version.mca_component_name, (rc ? 
"failure" : "successful"), - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); return rc; @@ -398,7 +398,7 @@ segment_attach(map_segment_t *ds_buf, sshmem_mkey_t *mkey) "(id: %d, addr: %p size: %lu, name: %s | va_base: 0x%p len: %d key %llx)\n", mca_sshmem_verbs_component.super.base_version.mca_type_name, mca_sshmem_verbs_component.super.base_version.mca_component_name, - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name, + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name, mkey->va_base, mkey->len, (unsigned long long)mkey->u.key) ); @@ -422,7 +422,7 @@ segment_detach(map_segment_t *ds_buf, sshmem_mkey_t *mkey) "(id: %d, addr: %p size: %lu, name: %s)\n", mca_sshmem_verbs_component.super.base_version.mca_type_name, mca_sshmem_verbs_component.super.base_version.mca_component_name, - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); if (device) { @@ -501,7 +501,7 @@ segment_unlink(map_segment_t *ds_buf) "(id: %d, addr: %p size: %lu, name: %s)\n", mca_sshmem_verbs_component.super.base_version.mca_type_name, mca_sshmem_verbs_component.super.base_version.mca_component_name, - ds_buf->seg_id, ds_buf->seg_base_addr, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) + ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name) ); /* don't completely reset. 
in particular, only reset From bf61961f8b4f0f4cb183e5f01b4ad9e83efbe4d7 Mon Sep 17 00:00:00 2001 From: Alex Mikheev Date: Tue, 8 Nov 2016 15:11:59 +0200 Subject: [PATCH 12/14] oshmem: code review fixes Signed-off-by: Alex Mikheev --- oshmem/mca/memheap/base/base.h | 22 +++++++--------- oshmem/mca/memheap/base/memheap_base_mkey.c | 20 +++++--------- oshmem/mca/spml/ikrit/spml_ikrit.c | 29 +++++---------------- 3 files changed, 22 insertions(+), 49 deletions(-) diff --git a/oshmem/mca/memheap/base/base.h b/oshmem/mca/memheap/base/base.h index 222cfd9e813..c82e03da5a7 100644 --- a/oshmem/mca/memheap/base/base.h +++ b/oshmem/mca/memheap/base/base.h @@ -162,24 +162,22 @@ extern int mca_memheap_seg_cmp(const void *k, const void *v); extern mca_memheap_map_t* memheap_map; -static inline int map_segment_is_va_in(map_base_segment_t *s, const void *va) +static inline int map_segment_is_va_in(map_base_segment_t *s, void *va) { - return ((uintptr_t)va >= (uintptr_t)s->va_base && - (uintptr_t)va < (uintptr_t)s->va_end); + return (va >= s->va_base && va < s->va_end); } -static inline map_segment_t *memheap_find_seg(const int segno) +static inline map_segment_t *memheap_find_seg(int segno) { return &mca_memheap_base_map.mem_segs[segno]; } -static inline int memheap_is_va_in_segment(const void *va, const int segno) +static inline int memheap_is_va_in_segment(void *va, int segno) { - return map_segment_is_va_in(&memheap_find_seg(segno)->super, va); } -static inline int memheap_find_segnum(const void *va) +static inline int memheap_find_segnum(void *va) { if (OPAL_LIKELY(memheap_is_va_in_segment(va, SYMB_SEG_INDEX))) { return SYMB_SEG_INDEX; @@ -189,19 +187,19 @@ static inline int memheap_find_segnum(const void *va) return MEMHEAP_SEG_INVALID; } -static inline void* memheap_va2rva(const void* va, const void* local_base, const void* remote_base) +static inline void* memheap_va2rva(void* va, void* local_base, void* remote_base) { return (void*) (remote_base > local_base ? 
(uintptr_t)va + ((uintptr_t)remote_base - (uintptr_t)local_base) : (uintptr_t)va - ((uintptr_t)local_base - (uintptr_t)remote_base)); } -static inline void *map_segment_va2rva(mkey_segment_t *seg, const void *va) +static inline void *map_segment_va2rva(mkey_segment_t *seg, void *va) { return memheap_va2rva(va, seg->super.va_base, seg->rva_base); } -static inline map_base_segment_t *map_segment_find_va(map_base_segment_t *segs, size_t elem_size, const void *va) +static inline map_base_segment_t *map_segment_find_va(map_base_segment_t *segs, size_t elem_size, void *va) { map_base_segment_t *rseg; @@ -220,8 +218,7 @@ static inline map_base_segment_t *map_segment_find_va(map_base_segment_t *segs, void mkey_segment_init(mkey_segment_t *seg, sshmem_mkey_t *mkey, uint32_t segno); - -static inline map_segment_t *memheap_find_va(const void* va) +static inline map_segment_t *memheap_find_va(void* va) { map_segment_t *s; @@ -296,7 +293,6 @@ static inline void* mca_memheap_seg2base_va(int seg) return memheap_map->mem_segs[seg].super.va_base; } - END_C_DECLS #endif /* MCA_MEMHEAP_BASE_H */ diff --git a/oshmem/mca/memheap/base/memheap_base_mkey.c b/oshmem/mca/memheap/base/memheap_base_mkey.c index d072ee053e1..563fcd0bb99 100644 --- a/oshmem/mca/memheap/base/memheap_base_mkey.c +++ b/oshmem/mca/memheap/base/memheap_base_mkey.c @@ -85,7 +85,7 @@ int mca_memheap_seg_cmp(const void *k, const void *v) static int pack_local_mkeys(opal_buffer_t *msg, int pe, int seg) { - int i, n, tr_id; + int i, n; sshmem_mkey_t *mkey; /* go over all transports and pack mkeys */ @@ -93,14 +93,13 @@ static int pack_local_mkeys(opal_buffer_t *msg, int pe, int seg) opal_dss.pack(msg, &n, 1, OPAL_UINT32); MEMHEAP_VERBOSE(5, "found %d transports to %d", n, pe); for (i = 0; i < n; i++) { - tr_id = i; - mkey = mca_memheap_base_get_mkey(mca_memheap_seg2base_va(seg), tr_id); + mkey = mca_memheap_base_get_mkey(mca_memheap_seg2base_va(seg), i); if (!mkey) { MEMHEAP_ERROR("seg#%d tr_id: %d failed to find local 
mkey", - seg, tr_id); + seg, i); return OSHMEM_ERROR; } - opal_dss.pack(msg, &tr_id, 1, OPAL_UINT32); + opal_dss.pack(msg, &i, 1, OPAL_UINT32); opal_dss.pack(msg, &mkey->va_base, 1, OPAL_UINT64); if (0 == mkey->va_base) { opal_dss.pack(msg, &mkey->u.key, 1, OPAL_UINT64); @@ -112,7 +111,7 @@ static int pack_local_mkeys(opal_buffer_t *msg, int pe, int seg) } MEMHEAP_VERBOSE(5, "seg#%d tr_id: %d %s", - seg, tr_id, mca_spml_base_mkey2str(mkey)); + seg, i, mca_spml_base_mkey2str(mkey)); } return OSHMEM_SUCCESS; } @@ -470,13 +469,6 @@ static int memheap_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) pe, i, mca_spml_base_mkey2str(&mkeys[i])); - int my_pe = oshmem_my_proc_id(); - if (my_pe == 0) - printf( - "MKEY CALCULATED BY LOCAL SPML: pe: %d tr_id: %d %s\n", - pe, - i, - mca_spml_base_mkey2str(&mkeys[i])); } return OSHMEM_SUCCESS; } @@ -742,7 +734,7 @@ uint64_t mca_memheap_base_find_offset(int pe, int mca_memheap_base_is_symmetric_addr(const void* va) { - return (memheap_find_va(va) ? 1 : 0); + return (memheap_find_va((void *)va) ? 
1 : 0); } int mca_memheap_base_detect_addr_type(void* va) diff --git a/oshmem/mca/spml/ikrit/spml_ikrit.c b/oshmem/mca/spml/ikrit/spml_ikrit.c index 909b9137234..1374ceb4557 100644 --- a/oshmem/mca/spml/ikrit/spml_ikrit.c +++ b/oshmem/mca/spml/ikrit/spml_ikrit.c @@ -491,7 +491,7 @@ sshmem_mkey_t *mca_spml_ikrit_register(void* addr, } SPML_VERBOSE(5, "rank %d ptl %d addr %p size %llu %s", - my_rank, i, addr, (unsigned long long)size, + my_rank, i, addr, (unsigned long long)size, mca_spml_base_mkey2str(&mkeys[i])); mca_spml_ikrit_cache_mkeys(&mkeys[i], memheap_find_segnum(addr), my_rank, i); @@ -687,12 +687,11 @@ static inline int mca_spml_ikrit_get_async(void *src_addr, get_req = alloc_get_req(); - if (OSHMEM_SUCCESS - != mca_spml_ikrit_get_helper(&get_req->mxm_req, - src_addr, - size, - dst_addr, - src)) { + if (OSHMEM_SUCCESS != mca_spml_ikrit_get_helper(&get_req->mxm_req, + src_addr, + size, + dst_addr, + src)) { oshmem_shmem_abort(-1); return OSHMEM_ERROR; } @@ -818,16 +817,6 @@ static inline int mca_spml_ikrit_put_internal(void* dst_addr, put_req->mxm_req.base.mq = mca_spml_ikrit.mxm_mq; /* request immediate responce if we are getting low on send buffers. We only get responce from remote on ack timeout. 
* Also request explicit ack once in a while */ -#if 0 - put_req->mxm_req.opcode = MXM_REQ_OP_PUT; - if (mca_spml_ikrit.free_list_max - mca_spml_ikrit.n_active_puts <= SPML_IKRIT_PUT_LOW_WATER || - (mca_spml_ikrit.mxm_peers[dst]->n_active_puts + 1) % SPML_IKRIT_PACKETS_PER_SYNC == 0) { - put_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_SYNC; - need_progress = 1; - } else { - put_req->mxm_req.base.flags = MXM_REQ_FLAG_SEND_LAZY|MXM_REQ_FLAG_SEND_SYNC; - } -#endif put_req->mxm_req.flags = 0; if (mca_spml_ikrit.free_list_max - mca_spml_ikrit.n_active_puts <= SPML_IKRIT_PUT_LOW_WATER || (int)opal_list_get_size(&mca_spml_ikrit.active_peers) > mca_spml_ikrit.unsync_conn_max || @@ -1007,11 +996,7 @@ int mca_spml_ikrit_fence(void) mca_spml_ikrit_mxm_fence(peer - mca_spml_ikrit.mxm_peers); } - while (0 < mca_spml_ikrit.n_mxm_fences) { - opal_progress(); - } - - while (0 < mca_spml_ikrit.n_active_gets) { + while (0 < mca_spml_ikrit.n_mxm_fences || 0 < mca_spml_ikrit.n_active_gets) { opal_progress(); } From 48a7a0bbb9c3a609ed258af8910d5148bc37e80c Mon Sep 17 00:00:00 2001 From: Alex Mikheev Date: Thu, 10 Nov 2016 11:27:24 +0200 Subject: [PATCH 13/14] oshmem: lock: call opal_progress only when busy waiting Signed-off-by: Alex Mikheev --- oshmem/shmem/c/shmem_lock.c | 72 +++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/oshmem/shmem/c/shmem_lock.c b/oshmem/shmem/c/shmem_lock.c index dc469461acf..e7b7d815485 100644 --- a/oshmem/shmem/c/shmem_lock.c +++ b/oshmem/shmem/c/shmem_lock.c @@ -270,10 +270,22 @@ static uint64_t shmem_lock_cswap(void *target, prev_value = prev_value_32; } - /* function is used to busy wait for the value. - * Call opal_progress() so that ompi will no deadlock - * (for example may need to respond to rkey requests) - */ + return prev_value; +} + +/* function is used to busy wait for the value. 
+ * Call opal_progress() so that ompi will not deadlock + * (for example may need to respond to rkey requests) + */ +static uint64_t shmem_lock_cswap_poll(void *target, + int target_size, + uint64_t cond, + uint64_t value, + int pe) +{ + uint64_t prev_value; + + prev_value = shmem_lock_cswap(target, target_size, cond, value, pe); opal_progress(); return prev_value; } @@ -320,11 +332,11 @@ static int pack_first_word(void *lock, extract_second_word(&lock_value, lock_size, &two); pack_2_words(&new_long_value, lock_size, one, &two); while (lock_value - != (temp = shmem_lock_cswap(lock, - lock_size, - lock_value, - new_long_value, - my_pe))) { + != (temp = shmem_lock_cswap_poll(lock, + lock_size, + lock_value, + new_long_value, + my_pe))) { lock_value = temp; extract_second_word(&lock_value, lock_size, &two); pack_2_words(&new_long_value, lock_size, one, &two); @@ -371,11 +383,11 @@ static int pack_second_word(void *lock, extract_first_word(&lock_value, lock_size, &one); pack_2_words(&new_long_value, lock_size, &one, two); while (lock_value - != (temp = shmem_lock_cswap(lock, - lock_size, - lock_value, - new_long_value, - my_pe))) { + != (temp = shmem_lock_cswap_poll(lock, + lock_size, + lock_value, + new_long_value, + my_pe))) { lock_value = temp; extract_first_word(&lock_value, lock_size, &one); pack_2_words(&new_long_value, lock_size, &one, two); @@ -695,11 +707,11 @@ static int shmem_lock_wait_for_ticket(void *lock, new_server_lock = server_lock = temp; lock_pack_pe_last(&new_server_lock, lock_size, &my_pe, 0); } while (server_lock - != (temp = shmem_lock_cswap(lock, - lock_size, - server_lock, - new_server_lock, - server_pe))); + != (temp = shmem_lock_cswap_poll(lock, + lock_size, + server_lock, + new_server_lock, + server_pe))); lock_extract_pe_last(&server_lock, lock_size, pe_last); if (*pe_last == -1) { /* we are first in queue for the lock */ @@ -755,11 +767,11 @@ static int shmem_lock_subscribe_for_informing(void *lock, prev_remote_value += my_pe + 1; while 
(prev_remote_value - != (temp_value = shmem_lock_cswap(lock, - lock_size, - prev_remote_value, - new_remote_value, - pe_last))) { + != (temp_value = shmem_lock_cswap_poll(lock, + lock_size, + prev_remote_value, + new_remote_value, + pe_last))) { prev_remote_value = temp_value; lock_extract_counter(&prev_remote_value, lock_size, @@ -853,11 +865,11 @@ static int shmem_lock_inform_next(void *lock, int lock_size, int pe_next) | (((uint64_t) 1) << (lock_bitwise_size - 1)); while (remote_value - != (temp_value = shmem_lock_cswap(lock, - lock_size, - remote_value, - new_remote_value, - pe_next))) { + != (temp_value = shmem_lock_cswap_poll(lock, + lock_size, + remote_value, + new_remote_value, + pe_next))) { remote_value = temp_value; new_remote_value = remote_value | (((uint64_t) 1) << (lock_bitwise_size - 1)); @@ -942,7 +954,7 @@ static int shmem_lock_try_inform_server(void *lock, int lock_size) &incorrect_pe, &my_pe); return !(remote_value - == shmem_lock_cswap(lock, lock_size, remote_value, zero, server_pe)); + == shmem_lock_cswap_poll(lock, lock_size, remote_value, zero, server_pe)); } /***************************************************************************/ From 864904e8ab8b6aed227380cc565ef8a9128be5b2 Mon Sep 17 00:00:00 2001 From: Alex Mikheev Date: Thu, 10 Nov 2016 11:29:03 +0200 Subject: [PATCH 14/14] oshmem: ucx: check status only if configured --with-oshmem-param-check Current standard says that behaviour in the case of error is undefined Signed-off-by: Alex Mikheev --- oshmem/mca/spml/ucx/spml_ucx.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/oshmem/mca/spml/ucx/spml_ucx.h b/oshmem/mca/spml/ucx/spml_ucx.h index a6c47ea3799..0e0e01b7f1d 100644 --- a/oshmem/mca/spml/ucx/spml_ucx.h +++ b/oshmem/mca/spml/ucx/spml_ucx.h @@ -135,12 +135,20 @@ mca_spml_ucx_get_mkey(int pe, void *va, void **rva) static inline int ucx_status_to_oshmem(ucs_status_t status) { +#if OSHMEM_PARAM_CHECK == 1 return OPAL_LIKELY(UCS_OK == status) ? 
OSHMEM_SUCCESS : OSHMEM_ERROR; +#else + return OSHMEM_SUCCESS; +#endif } static inline int ucx_status_to_oshmem_nb(ucs_status_t status) { +#if OSHMEM_PARAM_CHECK == 1 return OPAL_LIKELY(status >= 0) ? OSHMEM_SUCCESS : OSHMEM_ERROR; +#else + return OSHMEM_SUCCESS; +#endif } END_C_DECLS