diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h
index 6a58c19f86c..85132768e8c 100644
--- a/ompi/mca/osc/rdma/osc_rdma.h
+++ b/ompi/mca/osc/rdma/osc_rdma.h
@@ -148,9 +148,6 @@ struct ompi_osc_rdma_module_t {
     /** value of same_size info key for this window */
     bool same_size;
 
-    /** CPU atomics can be used */
-    bool use_cpu_atomics;
-
     /** passive-target synchronization will not be used in this window */
     bool no_locks;
 
@@ -255,6 +252,21 @@ struct ompi_osc_rdma_module_t {
     /** lock for peer hash table/array */
     opal_mutex_t peer_lock;
 
+    /** flag to indicate whether to use the local leader optimization,
+     * in which one process on each node is designated as the local leader.
+     * The local leader sets up a shared memory region, and all other processes
+     * on the same node map their state into that region. When a process
+     * wants to update a peer's state, it uses atomics on the peer's
+     * local leader to update the peer's state through the shared memory region.
+     * BTLs that use active message RDMA cannot support this optimization,
+     * because active message RDMA uses send/receive to emulate put and
+     * atomics, so the atomics and RMA operations must go through the same
+     * ordered channel.
+     */
+    bool use_local_leader;
+
+    /** array of peer state pointers. Used when the local leader is NOT used */
+    uintptr_t *peer_state_array;
 
     /** BTL(s) in use. Currently this is only used to support RDMA emulation over
      * non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this
@@ -630,11 +642,6 @@ static inline bool ompi_osc_rdma_oor (int rc)
     return (OPAL_SUCCESS != rc && (OPAL_ERR_OUT_OF_RESOURCE == rc || OPAL_ERR_TEMP_OUT_OF_RESOURCE == rc));
 }
 
-__opal_attribute_always_inline__
-static inline mca_btl_base_module_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_module_t *module, uint8_t btl_index) {
-    return module->selected_btls[btl_index];
-}
-
 __opal_attribute_always_inline__
 static inline void ompi_osc_rdma_selected_btl_insert (ompi_osc_rdma_module_t *module, struct mca_btl_base_module_t *btl, uint8_t btl_index) {
     if(btl_index == module->selected_btls_size) {
diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c
index 15f0a80714e..6824668beb1 100644
--- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c
+++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c
@@ -156,7 +156,7 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const
                                               mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req)
 {
     ompi_osc_rdma_module_t *module = sync->module;
-    mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
+    mca_btl_base_module_t *selected_btl = peer->data_btl;
     int32_t atomic_flags = selected_btl->btl_atomic_flags;
     int btl_op, flags;
     int64_t origin;
@@ -176,7 +176,7 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const
 
     origin = (8 == extent) ?
((int64_t *) origin_addr)[0] : ((int32_t *) origin_addr)[0]; - return ompi_osc_rdma_btl_fop (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle, btl_op, origin, flags, + return ompi_osc_rdma_btl_fop (module, peer->data_btl, peer->data_endpoint, target_address, target_handle, btl_op, origin, flags, result_addr, true, NULL, NULL, NULL); } @@ -198,7 +198,7 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using compare-and-swap"); - ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint, address, target_handle, &old_value, 8); + ret = ompi_osc_get_data_blocking (module, peer->data_btl, peer->data_endpoint, address, target_handle, &old_value, 8); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } @@ -213,7 +213,7 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi ompi_op_reduce (op, (void *) ((intptr_t) origin_addr + dt->super.true_lb), (void*)((intptr_t) &new_value + offset), 1, dt); } - ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl_index, peer->data_endpoint, address, target_handle, + ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl, peer->data_endpoint, address, target_handle, old_value, new_value, 0, (int64_t*)&new_value); if (OPAL_SUCCESS != ret || new_value == old_value) { break; @@ -234,7 +234,7 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo ompi_op_t *op, ompi_osc_rdma_request_t *req) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + mca_btl_base_module_t *selected_btl = peer->data_btl; int32_t atomic_flags = selected_btl->btl_atomic_flags; int btl_op, flags; int64_t origin; @@ -262,7 +262,7 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo *((int64_t *) origin_addr)); /* if we locked the peer its best to wait for completion before returning */ - return ompi_osc_rdma_btl_op (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle, btl_op, origin, + return ompi_osc_rdma_btl_op (module, peer->data_btl, peer->data_endpoint, target_address, target_handle, btl_op, origin, flags, true, NULL, NULL, NULL); } @@ -375,7 +375,7 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v /* set up the request */ request->to_free = ptr; - ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint, + ret = ompi_osc_get_data_blocking (module, peer->data_btl, peer->data_endpoint, target_address, target_handle, ptr, len); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; @@ -661,7 +661,7 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + mca_btl_base_module_t *btl = peer->data_btl; int32_t atomic_flags = btl->btl_atomic_flags; const size_t size = datatype->super.size; int64_t compare, source; @@ -679,7 +679,7 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating compare-and-swap using %d-bit btl atomics. 
compare: 0x%" PRIx64 ", origin: 0x%" PRIx64, (int) size * 8, *((int64_t *) compare_addr), *((int64_t *) source_addr)); - ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle, + ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl, peer->data_endpoint, target_address, target_handle, compare, source, flags, result_addr); if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired); @@ -715,7 +715,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, mca_btl_base_registration_handle_t *target_handle, bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + mca_btl_base_module_t *btl = peer->data_btl; unsigned long len = datatype->super.size; mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -728,7 +728,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, ", sync %p", len, target_address, (void *) sync); OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl get..."); - ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint, target_address, + ret = ompi_osc_get_data_blocking (module, peer->data_btl, peer->data_endpoint, target_address, target_handle, result_addr, len); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 449bbea0641..46a012da5dc 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -54,11 +54,11 @@ static void ompi_osc_get_data_complete (struct mca_btl_base_module_t *btl, struc ((bool *) context)[0] = true; } -int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_index, +int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, + struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, uint64_t source_address, mca_btl_base_registration_handle_t *source_handle, void *data, size_t len) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, btl_index); const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment); mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -444,7 +444,7 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee mca_btl_base_registration_handle_t *local_handle, size_t size, mca_btl_base_rdma_completion_fn_t cb, void *context, void *cbdata) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + mca_btl_base_module_t *btl = peer->data_btl; int ret; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating btl put of %lu bytes to remote address %" PRIx64 ", sync " @@ -481,7 +481,7 @@ int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t * ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + mca_btl_base_module_t *btl = peer->data_btl; mca_btl_base_registration_handle_t *local_handle = NULL; mca_btl_base_rdma_completion_fn_t cbfunc = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -600,7 +600,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p ompi_osc_rdma_request_t 
                                            *request)
 {
     ompi_osc_rdma_module_t *module = sync->module;
-    mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
+    mca_btl_base_module_t *btl = peer->data_btl;
     const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment);
     mca_btl_base_registration_handle_t *local_handle = NULL;
     ompi_osc_rdma_frag_t *frag = NULL;
@@ -736,7 +736,7 @@ static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const voi
                                            ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request)
 {
     ompi_osc_rdma_module_t *module = sync->module;
-    mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
+    mca_btl_base_module_t *btl = peer->data_btl;
     mca_btl_base_registration_handle_t *target_handle;
     uint64_t target_address;
     int ret;
@@ -779,7 +779,7 @@ static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *ori
                                            ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request)
 {
     ompi_osc_rdma_module_t *module = sync->module;
-    mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
+    mca_btl_base_module_t *btl = peer->data_btl;
     mca_btl_base_registration_handle_t *source_handle;
     uint64_t source_address;
     ptrdiff_t source_span, source_lb;
diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.h b/ompi/mca/osc/rdma/osc_rdma_comm.h
index efb305a571e..6427272a450 100644
--- a/ompi/mca/osc/rdma/osc_rdma_comm.h
+++ b/ompi/mca/osc/rdma/osc_rdma_comm.h
@@ -103,6 +103,7 @@ int ompi_osc_rdma_rget (void *origin_addr, int origin_count, ompi_datatype_t *or
  * @brief read data from a remote memory region (blocking)
  *
  * @param[in] module         osc rdma module
+ * @param[in] btl            btl module
  * @param[in] endpoint       btl endpoint
  * @param[in] source_address remote address to read from
  * @param[in] source_handle  btl registration handle for remote region (must be valid for the entire region)
@@ -113,7 +114,8 @@ int ompi_osc_rdma_rget (void *origin_addr, int origin_count, ompi_datatype_t *or
  * data that is stored on the remote peer. The peer object does not have to be fully initialized to
  * work. Only the btl endpoint is needed.
  */
-int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_index,
+int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module,
+                                struct mca_btl_base_module_t *btl,
                                 struct mca_btl_base_endpoint_t *endpoint, uint64_t source_address,
                                 mca_btl_base_registration_handle_t *source_handle, void *data, size_t len);
 
diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c
index a2e8dca6fa7..5c347996c1a 100644
--- a/ompi/mca/osc/rdma/osc_rdma_component.c
+++ b/ompi/mca/osc/rdma/osc_rdma_component.c
@@ -446,11 +446,53 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
     return OMPI_SUCCESS;
 }
 
+/**
+ * @brief gather the module->state pointers from all processes in the communicator
+ *
+ * This function is used when the local leader optimization is NOT used.
+ * In that case each process communicates with its peers directly
+ * to update their state counters (instead of communicating with the peer's
+ * local leader), so it needs the pointer to each peer's state counters in
+ * order to update them with atomics.
+ *
+ * Note that state_handle is not gathered because the local leader optimization
+ * is only disabled when active message RDMA is used, and active message
+ * RDMA does not need memory registration.
+ * + * @param module[in,out] ompi osc rdma module + */ +static int gather_peer_state(ompi_osc_rdma_module_t *module) +{ + int ret, comm_size; + + comm_size = ompi_comm_size (module->comm); + + module->peer_state_array = calloc(comm_size, sizeof(uintptr_t)); + if (NULL == module->peer_state_array) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to allocate memory for module state array!"); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + ret = module->comm->c_coll->coll_allgather(&module->state, sizeof(uintptr_t), MPI_BYTE, + module->peer_state_array, sizeof(uintptr_t), MPI_BYTE, + module->comm, module->comm->c_coll->coll_allgather_module); + if (OMPI_SUCCESS != ret) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "module state allgather failed with ompi error code %d", ret); + return ret; + } + + assert (!module->use_memory_registration); + + return 0; +} + + static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, size_t size) { size_t total_size, local_rank_array_size, leader_peer_data_size; ompi_osc_rdma_peer_t *my_peer; int ret, my_rank; + bool use_cpu_atomics; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocating private internal state"); @@ -505,6 +547,13 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s } } + if (!module->use_local_leader) { + ret = gather_peer_state(module); + if (OPAL_UNLIKELY(OMPI_SUCCESS !=ret)) { + return ret; + } + } + ret = ompi_osc_rdma_new_peer (module, my_rank, &my_peer); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; @@ -521,13 +570,15 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE; my_peer->state = (uint64_t) (uintptr_t) module->state; - if (module->use_cpu_atomics) { + assert(NULL != my_peer->data_btl); + use_cpu_atomics = (my_peer->data_btl->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); + if (use_cpu_atomics) { /* all peers are local or it is safe to mix cpu and nic atomics */ my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE; } else { /* use my endpoint handle to modify the peer's state */ my_peer->state_handle = module->state_handle; - my_peer->state_btl_index = my_peer->data_btl_index; + my_peer->state_btl = my_peer->data_btl; my_peer->state_endpoint = my_peer->data_endpoint; } @@ -540,7 +591,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s ex_peer->size = size; } - if (!module->use_cpu_atomics) { + if (!use_cpu_atomics) { if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { /* base is local and cpu atomics are available */ ex_peer->super.base_handle = module->state_handle; @@ -574,6 +625,7 @@ static int synchronize_errorcode(int errorcode, ompi_communicator_t *comm) static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, size_t size) { ompi_communicator_t *shared_comm; + bool use_cpu_atomics_on_peer; unsigned long offset, total_size; unsigned long state_base, data_base; int local_rank, local_size, ret; @@ -592,12 +644,13 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s /* CPU atomics can be used if every process is on the same node or the NIC allows mixing CPU and NIC atomics */ module->single_node = local_size == global_size; - module->use_cpu_atomics = module->single_node; - - if (!module->single_node) { - for (int i = 0 ; i < module->btls_in_use ; ++i) { - module->use_cpu_atomics = module->use_cpu_atomics && !!(module->selected_btls[i]->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); - } + module->use_local_leader = 
true; + for (int i = 0 ; i < module->btls_in_use ; ++i) { + /* the usage of local leader means to use different channels to send data to peer and update peer's state. + * When different channels are used, active message RDMA cannot guarantee that put and atomics are completed + * in the same order. + */ + module->use_local_leader = module->use_local_leader && ! (module->selected_btls[i]->btl_flags &(MCA_BTL_FLAGS_PUT_AM | MCA_BTL_FLAGS_ATOMIC_AM_FOP)); } if (1 == local_size) { @@ -650,9 +703,9 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s if (0 == local_rank) { /* allocate the shared memory segment */ - ret = opal_asprintf (&data_file, "%s" OPAL_PATH_SEP "osc_rdma.%s.%x.%d", + ret = opal_asprintf (&data_file, "%s" OPAL_PATH_SEP "osc_rdma.%s.%x.%d.%d", mca_osc_rdma_component.backing_directory, ompi_process_info.nodename, - OMPI_PROC_MY_NAME->jobid, ompi_comm_get_cid(module->comm)); + OMPI_PROC_MY_NAME->jobid, ompi_comm_get_cid(module->comm), getpid()); if (0 > ret) { ret = OMPI_ERR_OUT_OF_RESOURCE; } else { @@ -751,6 +804,13 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s break; } + if (!module->use_local_leader) { + ret = gather_peer_state(module); + if (OPAL_UNLIKELY(OMPI_SUCCESS !=ret)) { + break; + } + } + offset = data_base; ompi_osc_rdma_peer_t *local_leader; for (int i = 0 ; i < local_size ; ++i) { @@ -772,24 +832,33 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ex_peer = (ompi_osc_rdma_peer_extended_t *) peer; + use_cpu_atomics_on_peer = (peer->data_btl->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); /* set up peer state */ - if (module->use_cpu_atomics) { + if (use_cpu_atomics_on_peer) { /* all peers are local or it is safe to mix cpu and nic atomics */ peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE; peer->state = (osc_rdma_counter_t) peer_state; peer->state_endpoint = NULL; } else { - /* use my endpoint handle to modify the peer's state */ - if (module->use_memory_registration) { - peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data; - } - peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i); - if (i==0) { - peer->state_endpoint = peer->data_endpoint; - peer->state_btl_index = peer->data_btl_index; + + if (module->use_local_leader) { + if (module->use_memory_registration) { + peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data; + } + peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i); + if (i==0) { + peer->state_endpoint = peer->data_endpoint; + peer->state_btl = peer->data_btl; + } else { + peer->state_endpoint = local_leader->state_endpoint; + peer->state_btl = local_leader->state_btl; + } } else { - peer->state_endpoint = local_leader->state_endpoint; - peer->state_btl_index = local_leader->state_btl_index; + assert (!module->use_memory_registration); + assert (NULL != module->peer_state_array); + peer->state = (osc_rdma_counter_t)module->peer_state_array[peer_rank]; + peer->state_endpoint = peer->data_endpoint; + peer->state_btl = peer->data_btl; } } @@ -798,16 +867,16 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s } if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor && MPI_WIN_FLAVOR_CREATE != module->flavor && - !module->use_cpu_atomics && temp[i].size && i > 0) { + !use_cpu_atomics_on_peer && temp[i].size && i > 0) { /* use the local leader's endpoint 
*/ peer->data_endpoint = local_leader->data_endpoint; - peer->data_btl_index = local_leader->data_btl_index; + peer->data_btl = local_leader->data_btl; } ompi_osc_module_add_peer (module, peer); if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) { - if (module->use_cpu_atomics && peer_rank == my_rank) { + if (use_cpu_atomics_on_peer && peer_rank == my_rank) { peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE; } /* nothing more to do */ @@ -823,7 +892,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ex_peer->size = temp[i].size; } - if (module->use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank)) { + if (use_cpu_atomics_on_peer && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank)) { /* base is local and cpu atomics are available */ if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { ex_peer->super.base = (uintptr_t) module->segment_base + offset; diff --git a/ompi/mca/osc/rdma/osc_rdma_dynamic.c b/ompi/mca/osc/rdma/osc_rdma_dynamic.c index 8adfa7f8159..73e80eb8dc7 100644 --- a/ompi/mca/osc/rdma/osc_rdma_dynamic.c +++ b/ompi/mca/osc/rdma/osc_rdma_dynamic.c @@ -392,7 +392,7 @@ static int ompi_osc_rdma_refresh_dynamic_region (ompi_osc_rdma_module_t *module, osc_rdma_counter_t remote_value; source_address = (uint64_t)(intptr_t) peer->super.state + offsetof (ompi_osc_rdma_state_t, region_count); - ret = ompi_osc_get_data_blocking (module, peer->super.state_btl_index, peer->super.state_endpoint, + ret = ompi_osc_get_data_blocking (module, peer->super.state_btl, peer->super.state_endpoint, source_address, peer->super.state_handle, &remote_value, sizeof (remote_value)); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { @@ -433,7 +433,7 @@ static int ompi_osc_rdma_refresh_dynamic_region (ompi_osc_rdma_module_t *module, OMPI_OSC_RDMA_LOCK_EXCLUSIVE); source_address = (uint64_t)(intptr_t) peer->super.state + offsetof (ompi_osc_rdma_state_t, regions); - ret = ompi_osc_get_data_blocking (module, peer->super.state_btl_index, peer->super.state_endpoint, + ret = ompi_osc_get_data_blocking (module, peer->super.state_btl, peer->super.state_endpoint, source_address, peer->super.state_handle, peer->regions, region_len); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_THREAD_UNLOCK(&module->lock); diff --git a/ompi/mca/osc/rdma/osc_rdma_lock.h b/ompi/mca/osc/rdma/osc_rdma_lock.h index 36a30a1cc0b..0ffdcde190f 100644 --- a/ompi/mca/osc/rdma/osc_rdma_lock.h +++ b/ompi/mca/osc/rdma/osc_rdma_lock.h @@ -37,14 +37,14 @@ void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_b void *context, void *data, int status); __opal_attribute_always_inline__ -static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, uint8_t btl_index, +static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, + struct mca_btl_base_module_t *selected_btl, struct mca_btl_base_endpoint_t *endpoint, uint64_t address, mca_btl_base_registration_handle_t *address_handle, int op, int64_t operand, int flags, int64_t *result, const bool wait_for_completion, ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) { ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); int ret = OPAL_ERROR; pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); @@ -110,23 +110,23 @@ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, om int op, ompi_osc_rdma_lock_t operand, ompi_osc_rdma_lock_t *result, const bool wait_for_completion) { - 
return ompi_osc_rdma_btl_fop (module, peer->state_btl_index, peer->state_endpoint, address, peer->state_handle, op, + return ompi_osc_rdma_btl_fop (module, peer->state_btl, peer->state_endpoint, address, peer->state_handle, op, operand, 0, result, wait_for_completion, NULL, NULL, NULL); } __opal_attribute_always_inline__ -static inline int ompi_osc_rdma_btl_op (ompi_osc_rdma_module_t *module, uint8_t btl_index, +static inline int ompi_osc_rdma_btl_op (ompi_osc_rdma_module_t *module, + struct mca_btl_base_module_t *selected_btl, struct mca_btl_base_endpoint_t *endpoint, uint64_t address, mca_btl_base_registration_handle_t *address_handle, int op, int64_t operand, int flags, const bool wait_for_completion, ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) { ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); int ret; if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { - return ompi_osc_rdma_btl_fop (module, btl_index, endpoint, address, address_handle, op, operand, flags, + return ompi_osc_rdma_btl_fop (module, selected_btl, endpoint, address, address_handle, op, operand, flags, NULL, wait_for_completion, cbfunc, cbdata, cbcontext); } @@ -181,18 +181,18 @@ __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_op (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, int op, ompi_osc_rdma_lock_t operand, const bool wait_for_completion) { - return ompi_osc_rdma_btl_op (module, peer->state_btl_index, peer->state_endpoint, address, peer->state_handle, op, + return ompi_osc_rdma_btl_op (module, peer->state_btl, peer->state_endpoint, address, peer->state_handle, op, operand, 0, wait_for_completion, NULL, NULL, NULL); } __opal_attribute_always_inline__ -static inline int ompi_osc_rdma_btl_cswap (ompi_osc_rdma_module_t *module, uint8_t btl_index, +static inline int ompi_osc_rdma_btl_cswap (ompi_osc_rdma_module_t *module, + struct mca_btl_base_module_t *selected_btl, struct mca_btl_base_endpoint_t *endpoint, uint64_t address, mca_btl_base_registration_handle_t *address_handle, int64_t compare, int64_t value, int flags, int64_t *result) { ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); int ret; pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); @@ -244,7 +244,7 @@ __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_cswap (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, ompi_osc_rdma_lock_t compare, ompi_osc_rdma_lock_t value, ompi_osc_rdma_lock_t *result) { - return ompi_osc_rdma_btl_cswap (module, peer->state_btl_index, peer->state_endpoint, address, peer->state_handle, compare, value, + return ompi_osc_rdma_btl_cswap (module, peer->state_btl, peer->state_endpoint, address, peer->state_handle, compare, value, 0, result); } diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.c b/ompi/mca/osc/rdma/osc_rdma_peer.c index c6689d78812..2543ef4eecb 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.c +++ b/ompi/mca/osc/rdma/osc_rdma_peer.c @@ -30,13 +30,15 @@ * * @param[in] module osc rdma module * @param[in] peer_id process rank in the module communicator - * @param[in] module_btl_index btl index to use + * @param[in] btl_out btl to be used + * @param[in] endpoint endpoint to be used * - * @returns NULL on error - * @returns btl endpoint on success + * @returns OMPI_SUCCESS on success + * @returns ompi error code on error */ static int 
ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *module, - int peer_id, uint8_t *btl_index_out, + int peer_id, + struct mca_btl_base_module_t **btl_out, struct mca_btl_base_endpoint_t **endpoint) { ompi_proc_t *proc = ompi_comm_peer_lookup (module->comm, peer_id); @@ -51,7 +53,7 @@ static int ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *modul for (int module_btl_index = 0 ; module_btl_index < module->btls_in_use ; ++module_btl_index) { for (int btl_index = 0 ; btl_index < num_btls ; ++btl_index) { if (bml_endpoint->btl_rdma.bml_btls[btl_index].btl == module->selected_btls[module_btl_index]) { - *btl_index_out = module_btl_index; + *btl_out = bml_endpoint->btl_rdma.bml_btls[btl_index].btl; *endpoint = bml_endpoint->btl_rdma.bml_btls[btl_index].btl_endpoint; return OMPI_SUCCESS; } @@ -64,28 +66,39 @@ static int ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *modul for (int module_btl_index = 0 ; module_btl_index < module->btls_in_use ; ++module_btl_index) { for (int btl_index = 0 ; btl_index < num_btls ; ++btl_index) { if (bml_endpoint->btl_eager.bml_btls[btl_index].btl == module->selected_btls[module_btl_index]) { - *btl_index_out = module_btl_index; + *btl_out = bml_endpoint->btl_eager.bml_btls[btl_index].btl; *endpoint = bml_endpoint->btl_eager.bml_btls[btl_index].btl_endpoint; return OMPI_SUCCESS; } } } - /* unlikely but can happen when creating a peer for self */ + if (peer_id == ompi_comm_rank (module->comm)) { + for (int btl_index = 0 ; btl_index < num_btls ; ++btl_index) { + struct mca_btl_base_module_t *btl; + + btl = bml_endpoint->btl_rdma.bml_btls[btl_index].btl; + if (strcmp(btl->btl_component->btl_version.mca_component_name, "self")==0) { + *btl_out = btl; + *endpoint = bml_endpoint->btl_eager.bml_btls[btl_index].btl_endpoint; + return OMPI_SUCCESS; + } + } + } + return OMPI_ERR_UNREACH; } int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, ompi_osc_rdma_peer_t **peer_out) { struct mca_btl_base_endpoint_t *endpoint; + struct mca_btl_base_module_t *btl; ompi_osc_rdma_peer_t *peer; - uint8_t module_btl_index = UINT8_MAX; *peer_out = NULL; /* find a btl/endpoint to use for this peer */ - int ret = ompi_osc_rdma_peer_btl_endpoint (module, peer_id, &module_btl_index, &endpoint); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret && !((module->selected_btls[0]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) && - peer_id == ompi_comm_rank (module->comm)))) { + int ret = ompi_osc_rdma_peer_btl_endpoint (module, peer_id, &btl, &endpoint); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } @@ -99,7 +112,7 @@ int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, } peer->data_endpoint = endpoint; - peer->data_btl_index = module_btl_index; + peer->data_btl = btl; peer->rank = peer_id; *peer_out = peer; @@ -127,7 +140,7 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd ompi_osc_rdma_rank_data_t rank_data; int registration_handle_size = 0; int node_id, node_rank, array_index; - uint8_t array_btl_index; + struct mca_btl_base_module_t *array_btl; int ret, disp_unit; char *peer_data; @@ -137,49 +150,64 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd registration_handle_size = module->selected_btls[0]->btl_registration_handle_size; } - /* each node is responsible for holding a part of the rank -> node/local rank mapping array. this code - * calculates the node and offset the mapping can be found. 
once the mapping has been read the state - * part of the peer structure can be initialized. */ - node_id = peer->rank / RANK_ARRAY_COUNT(module); - array_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + node_id * module->region_size); + if (module->use_local_leader) { + /* each node is responsible for holding a part of the rank -> node/local rank mapping array. this code + * calculates the node and offset the mapping can be found. once the mapping has been read the state + * part of the peer structure can be initialized. */ + node_id = peer->rank / RANK_ARRAY_COUNT(module); + array_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + node_id * module->region_size); - /* the node leader rank is stored in the length field */ - node_rank = NODE_ID_TO_RANK(module, array_peer_data, node_id); - array_index = peer->rank % RANK_ARRAY_COUNT(module); + /* the node leader rank is stored in the length field */ + node_rank = NODE_ID_TO_RANK(module, array_peer_data, node_id); + array_index = peer->rank % RANK_ARRAY_COUNT(module); - array_pointer = array_peer_data->base + array_index * sizeof (rank_data); + array_pointer = array_peer_data->base + array_index * sizeof (rank_data); - /* lookup the btl endpoint needed to retrieve the mapping */ - ret = ompi_osc_rdma_peer_btl_endpoint (module, node_rank, &array_btl_index, &array_endpoint); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return OMPI_ERR_UNREACH; - } + /* lookup the btl endpoint needed to retrieve the mapping */ + ret = ompi_osc_rdma_peer_btl_endpoint (module, node_rank, &array_btl, &array_endpoint); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return OMPI_ERR_UNREACH; + } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "reading region data for %d from rank: %d, index: %d, pointer: 0x%" PRIx64 - ", size: %lu", peer->rank, node_rank, array_index, array_pointer, sizeof (rank_data)); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "reading region data for %d from rank: %d, index: %d, pointer: 0x%" PRIx64 + ", size: %lu", peer->rank, node_rank, array_index, array_pointer, sizeof (rank_data)); - ret = ompi_osc_get_data_blocking (module, array_btl_index, array_endpoint, array_pointer, - (mca_btl_base_registration_handle_t *) array_peer_data->btl_handle_data, - &rank_data, sizeof (rank_data)); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; - } + ret = ompi_osc_get_data_blocking (module, array_btl, array_endpoint, array_pointer, + (mca_btl_base_registration_handle_t *) array_peer_data->btl_handle_data, + &rank_data, sizeof (rank_data)); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } - /* initialize the state part of the peer object. NTH: for now the state data is for every node is stored on - * every node. this gives a good balance of code complexity and memory usage at this time. we take advantage - * of this by re-using the endpoint and pointer stored in the node_comm_info array. */ - node_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + rank_data.node_id * module->region_size); + /* initialize the state part of the peer object. NTH: for now the state data is for every node is stored on + * every node. this gives a good balance of code complexity and memory usage at this time. we take advantage + * of this by re-using the endpoint and pointer stored in the node_comm_info array. 
*/ + node_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + rank_data.node_id * module->region_size); - peer->state = node_peer_data->base + module->state_offset + module->state_size * rank_data.rank; + peer->state = node_peer_data->base + module->state_offset + module->state_size * rank_data.rank; - if (registration_handle_size) { - peer->state_handle = (mca_btl_base_registration_handle_t *) node_peer_data->btl_handle_data; - } + if (registration_handle_size) { + peer->state_handle = (mca_btl_base_registration_handle_t *) node_peer_data->btl_handle_data; + } - ret = ompi_osc_rdma_peer_btl_endpoint (module, NODE_ID_TO_RANK(module, node_peer_data, rank_data.node_id), - &peer->state_btl_index, &peer->state_endpoint); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return OPAL_ERR_UNREACH; + ret = ompi_osc_rdma_peer_btl_endpoint (module, NODE_ID_TO_RANK(module, node_peer_data, rank_data.node_id), + &peer->state_btl, &peer->state_endpoint); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return OPAL_ERR_UNREACH; + } + } else { + assert(NULL != module->peer_state_array); + peer->state = module->peer_state_array[peer->rank]; + + assert(!module->use_memory_registration); + peer->state_handle = NULL; + + /* when local leader optimization is not used, + * same endpoint were used to transfer data and + * update state + */ + peer->state_btl = peer->data_btl; + peer->state_endpoint = peer->data_endpoint; } /* nothing more to do for dynamic memory windows */ @@ -199,7 +227,7 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd peer_data = alloca (peer_data_size); /* read window data from the end of the target's state structure */ - ret = ompi_osc_get_data_blocking (module, peer->state_btl_index, peer->state_endpoint, + ret = ompi_osc_get_data_blocking (module, peer->state_btl, peer->state_endpoint, peer->state + peer_data_offset, peer->state_handle, peer_data, peer_data_size); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { @@ -248,7 +276,7 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { ex_peer->super.super.data_endpoint = ex_peer->super.super.state_endpoint; - ex_peer->super.super.data_btl_index = ex_peer->super.super.state_btl_index; + ex_peer->super.super.data_btl = ex_peer->super.super.state_btl; } } diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.h b/ompi/mca/osc/rdma/osc_rdma_peer.h index ef8f8e0605c..86a717babf6 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.h +++ b/ompi/mca/osc/rdma/osc_rdma_peer.h @@ -45,11 +45,14 @@ struct ompi_osc_rdma_peer_t { /** peer flags */ opal_atomic_int32_t flags; - /** index into BTL array */ - uint8_t data_btl_index; - - /** index into BTL array */ - uint8_t state_btl_index; + /** btl used for rdma */ + struct mca_btl_base_module_t *data_btl; + + /** btl used for reading/modifying peer state. + * When the local leader optimization is used, + * peer state are read/modified through a different + * btl then the one used for rdma data */ + struct mca_btl_base_module_t *state_btl; }; typedef struct ompi_osc_rdma_peer_t ompi_osc_rdma_peer_t; diff --git a/opal/mca/btl/base/btl_base_am_rdma.c b/opal/mca/btl/base/btl_base_am_rdma.c index 4feeff87c24..474dd1803c3 100644 --- a/opal/mca/btl/base/btl_base_am_rdma.c +++ b/opal/mca/btl/base/btl_base_am_rdma.c @@ -1114,7 +1114,7 @@ int mca_btl_base_am_rdma_init(mca_btl_base_module_t *btl) /* emulated RDMA atomics can support the full range of atomics. for * now only a handful are supported. 
*/ - btl->btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_GLOB | MCA_BTL_ATOMIC_SUPPORTS_CSWAP + btl->btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_CSWAP | MCA_BTL_ATOMIC_SUPPORTS_32BIT | MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_AND | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR | MCA_BTL_ATOMIC_SUPPORTS_SWAP diff --git a/opal/mca/btl/ofi/btl_ofi_module.c b/opal/mca/btl/ofi/btl_ofi_module.c index cffa0c27317..bd0b492bddb 100644 --- a/opal/mca/btl/ofi/btl_ofi_module.c +++ b/opal/mca/btl/ofi/btl_ofi_module.c @@ -393,7 +393,8 @@ mca_btl_ofi_module_t *mca_btl_ofi_module_alloc(int mode) module->super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS | MCA_BTL_FLAGS_RDMA; - module->super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_SWAP + module->super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_GLOB + | MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_SWAP | MCA_BTL_ATOMIC_SUPPORTS_CSWAP | MCA_BTL_ATOMIC_SUPPORTS_32BIT; diff --git a/opal/mca/btl/self/btl_self_component.c b/opal/mca/btl/self/btl_self_component.c index 576716e4d73..c9fc5bdca0b 100644 --- a/opal/mca/btl/self/btl_self_component.c +++ b/opal/mca/btl/self/btl_self_component.c @@ -107,6 +107,7 @@ static int mca_btl_self_component_register(void) mca_btl_self.btl_rdma_pipeline_frag_size = INT_MAX; mca_btl_self.btl_min_rdma_pipeline_size = 0; mca_btl_self.btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_SEND; + mca_btl_self.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_GLOB; mca_btl_self.btl_bandwidth = 100; mca_btl_self.btl_latency = 0; mca_btl_base_param_register(&mca_btl_self_component.super.btl_version, &mca_btl_self);
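Note (not part of the patch): the capability checks the patch introduces are spread across several hunks, so the short standalone C sketch below pulls the two decisions together. The mock_btl_t type, the helper names, and the numeric flag values are illustrative stand-ins for the real OPAL BTL definitions; the logic mirrors what allocate_state_shared() does for module->use_local_leader and what the per-peer use_cpu_atomics checks do with MCA_BTL_ATOMIC_SUPPORTS_GLOB.

/* Hedged sketch, not part of the patch: mock types and illustrative flag
 * values stand in for the real OPAL definitions so the decision logic can
 * be compiled and run standalone. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MCA_BTL_FLAGS_PUT_AM          0x010000u  /* illustrative values only */
#define MCA_BTL_FLAGS_ATOMIC_AM_FOP   0x020000u
#define MCA_BTL_ATOMIC_SUPPORTS_GLOB  0x000100u

typedef struct {
    uint32_t btl_flags;        /* general capability flags */
    uint32_t btl_atomic_flags; /* atomic capability flags */
} mock_btl_t;

/* the local leader optimization is only safe when no selected BTL emulates
 * put or fetch-atomics over active messages (send/receive), because the
 * emulation requires RMA and atomics to travel the same ordered channel */
static bool can_use_local_leader(const mock_btl_t *btls, int nbtls)
{
    bool ok = true;
    for (int i = 0; i < nbtls; ++i) {
        ok = ok && !(btls[i].btl_flags &
                     (MCA_BTL_FLAGS_PUT_AM | MCA_BTL_FLAGS_ATOMIC_AM_FOP));
    }
    return ok;
}

/* CPU atomics may touch a peer's state only if that peer's data BTL says
 * CPU and NIC atomics are globally coherent */
static bool can_use_cpu_atomics(const mock_btl_t *peer_data_btl)
{
    return (peer_data_btl->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) != 0;
}

int main(void)
{
    mock_btl_t sm  = { .btl_flags = 0,
                       .btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_GLOB };
    mock_btl_t tcp = { .btl_flags = MCA_BTL_FLAGS_PUT_AM | MCA_BTL_FLAGS_ATOMIC_AM_FOP,
                       .btl_atomic_flags = 0 };
    mock_btl_t in_use[] = { sm, tcp };   /* e.g. btl/sm + btl/tcp RDMA emulation */

    printf("use_local_leader        = %d\n", can_use_local_leader(in_use, 2)); /* 0 */
    printf("cpu atomics for sm peer = %d\n", can_use_cpu_atomics(&sm));        /* 1 */
    return 0;
}

Tying the sketch back to the patch: the emulated AM RDMA path no longer advertises MCA_BTL_ATOMIC_SUPPORTS_GLOB, while btl/ofi and btl/self now do, so a tcp-like BTL in the selected set disables the local leader path and only peers reached through a GLOB-capable BTL qualify for CPU atomics on their state.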