Skip to content

Commit 18f1037

Browse files
authored
Merge pull request #6152 from ggouaillardet/topic/v4.0.x/ucx_warning
btl/openib: delay UCX warning to add_procs()
2 parents a04f5f0 + 8da4605 commit 18f1037

File tree

4 files changed

+187
-133
lines changed

4 files changed

+187
-133
lines changed

opal/mca/btl/openib/btl_openib.c

Lines changed: 75 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -19,8 +19,8 @@
1919
* Copyright (c) 2009 IBM Corporation. All rights reserved.
2020
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
2121
* Copyright (c) 2013-2015 NVIDIA Corporation. All rights reserved.
22-
* Copyright (c) 2014-2015 Research Organization for Information Science
23-
* and Technology (RIST). All rights reserved.
22+
* Copyright (c) 2014-2018 Research Organization for Information Science
23+
* and Technology (RIST). All rights reserved.
2424
* Copyright (c) 2014 Bull SAS. All rights reserved
2525
* $COPYRIGHT$
2626
*
@@ -1040,6 +1040,15 @@ int mca_btl_openib_add_procs(
10401040
int btl_rank = 0;
10411041
volatile mca_btl_base_endpoint_t* endpoint;
10421042

1043+
1044+
if (! openib_btl->allowed) {
1045+
opal_bitmap_clear_all_bits(reachable);
1046+
opal_show_help("help-mpi-btl-openib.txt", "ib port not selected",
1047+
true, opal_process_info.nodename,
1048+
openib_btl->device_name, openib_btl->port_num);
1049+
return OPAL_SUCCESS;
1050+
}
1051+
10431052
btl_rank = get_openib_btl_params(openib_btl, &lcl_subnet_id_port_cnt);
10441053
if( 0 > btl_rank ){
10451054
return OPAL_ERR_NOT_FOUND;
@@ -1639,79 +1648,81 @@ static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl)
16391648
return OPAL_SUCCESS;
16401649
}
16411650

1642-
/* Release all QPs */
1643-
if (NULL != openib_btl->device->endpoints) {
1644-
for (ep_index=0;
1645-
ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints);
1646-
ep_index++) {
1647-
endpoint=(mca_btl_openib_endpoint_t *)opal_pointer_array_get_item(openib_btl->device->endpoints,
1651+
if (openib_btl->allowed) {
1652+
/* Release all QPs */
1653+
if (NULL != openib_btl->device->endpoints) {
1654+
for (ep_index=0;
1655+
ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints);
1656+
ep_index++) {
1657+
endpoint=(mca_btl_openib_endpoint_t *)opal_pointer_array_get_item(openib_btl->device->endpoints,
16481658
ep_index);
1649-
if(!endpoint) {
1650-
BTL_VERBOSE(("In finalize, got another null endpoint"));
1651-
continue;
1652-
}
1653-
if(endpoint->endpoint_btl != openib_btl) {
1654-
continue;
1655-
}
1656-
for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) {
1657-
if(openib_btl->device->eager_rdma_buffers[i] == endpoint) {
1658-
openib_btl->device->eager_rdma_buffers[i] = NULL;
1659-
OBJ_RELEASE(endpoint);
1659+
if(!endpoint) {
1660+
BTL_VERBOSE(("In finalize, got another null endpoint"));
1661+
continue;
16601662
}
1663+
if(endpoint->endpoint_btl != openib_btl) {
1664+
continue;
1665+
}
1666+
for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) {
1667+
if(openib_btl->device->eager_rdma_buffers[i] == endpoint) {
1668+
openib_btl->device->eager_rdma_buffers[i] = NULL;
1669+
OBJ_RELEASE(endpoint);
1670+
}
1671+
}
1672+
opal_pointer_array_set_item(openib_btl->device->endpoints,
1673+
ep_index, NULL);
1674+
assert(((opal_object_t*)endpoint)->obj_reference_count == 1);
1675+
OBJ_RELEASE(endpoint);
16611676
}
1662-
opal_pointer_array_set_item(openib_btl->device->endpoints,
1663-
ep_index, NULL);
1664-
assert(((opal_object_t*)endpoint)->obj_reference_count == 1);
1665-
OBJ_RELEASE(endpoint);
16661677
}
1667-
}
1668-
1669-
/* Release SRQ resources */
1670-
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
1671-
if(!BTL_OPENIB_QP_TYPE_PP(qp)) {
1672-
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
1673-
&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
1674-
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
1675-
&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
1676-
if (NULL != openib_btl->qps[qp].u.srq_qp.srq) {
1677-
opal_mutex_t *lock =
1678-
&mca_btl_openib_component.srq_manager.lock;
1679-
1680-
opal_hash_table_t *srq_addr_table =
1681-
&mca_btl_openib_component.srq_manager.srq_addr_table;
16821678

1683-
opal_mutex_lock(lock);
1684-
if (OPAL_SUCCESS !=
1685-
opal_hash_table_remove_value_ptr(srq_addr_table,
1686-
&openib_btl->qps[qp].u.srq_qp.srq,
1687-
sizeof(struct ibv_srq *))) {
1688-
BTL_VERBOSE(("Failed to remove SRQ %d entry from hash table.", qp));
1689-
rc = OPAL_ERROR;
1690-
}
1691-
opal_mutex_unlock(lock);
1692-
if (0 != ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)) {
1693-
BTL_VERBOSE(("Failed to close SRQ %d", qp));
1694-
rc = OPAL_ERROR;
1679+
/* Release SRQ resources */
1680+
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
1681+
if(!BTL_OPENIB_QP_TYPE_PP(qp)) {
1682+
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
1683+
&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
1684+
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
1685+
&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
1686+
if (NULL != openib_btl->qps[qp].u.srq_qp.srq) {
1687+
opal_mutex_t *lock =
1688+
&mca_btl_openib_component.srq_manager.lock;
1689+
1690+
opal_hash_table_t *srq_addr_table =
1691+
&mca_btl_openib_component.srq_manager.srq_addr_table;
1692+
1693+
opal_mutex_lock(lock);
1694+
if (OPAL_SUCCESS !=
1695+
opal_hash_table_remove_value_ptr(srq_addr_table,
1696+
&openib_btl->qps[qp].u.srq_qp.srq,
1697+
sizeof(struct ibv_srq *))) {
1698+
BTL_VERBOSE(("Failed to remove SRQ %d entry from hash table.", qp));
1699+
rc = OPAL_ERROR;
1700+
}
1701+
opal_mutex_unlock(lock);
1702+
if (0 != ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)) {
1703+
BTL_VERBOSE(("Failed to close SRQ %d", qp));
1704+
rc = OPAL_ERROR;
1705+
}
16951706
}
1696-
}
16971707

1698-
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
1699-
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
1708+
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
1709+
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
1710+
}
17001711
}
1701-
}
17021712

1703-
/* Finalize the CPC modules on this openib module */
1704-
for (i = 0; i < openib_btl->num_cpcs; ++i) {
1705-
if (NULL != openib_btl->cpcs[i]->cbm_finalize) {
1706-
openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]);
1713+
/* Finalize the CPC modules on this openib module */
1714+
for (i = 0; i < openib_btl->num_cpcs; ++i) {
1715+
if (NULL != openib_btl->cpcs[i]->cbm_finalize) {
1716+
openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]);
1717+
}
1718+
free(openib_btl->cpcs[i]);
17071719
}
1708-
free(openib_btl->cpcs[i]);
1709-
}
1710-
free(openib_btl->cpcs);
1720+
free(openib_btl->cpcs);
17111721

1712-
/* Release device if there are no more users */
1713-
if(!(--openib_btl->device->btls)) {
1714-
OBJ_RELEASE(openib_btl->device);
1722+
/* Release device if there are no more users */
1723+
if(!(--openib_btl->device->allowed_btls)) {
1724+
OBJ_RELEASE(openib_btl->device);
1725+
}
17151726
}
17161727

17171728
if (NULL != openib_btl->qps) {

opal/mca/btl/openib/btl_openib.h

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -18,8 +18,8 @@
1818
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
1919
* Copyright (c) 2013-2014 NVIDIA Corporation. All rights reserved.
2020
* Copyright (c) 2014 Bull SAS. All rights reserved.
21-
* Copyright (c) 2015-2016 Research Organization for Information Science
22-
* and Technology (RIST). All rights reserved.
21+
* Copyright (c) 2015-2018 Research Organization for Information Science
22+
* and Technology (RIST). All rights reserved.
2323
* $COPYRIGHT$
2424
*
2525
* Additional copyrights may follow
@@ -164,6 +164,9 @@ struct mca_btl_openib_component_t {
164164
int ib_num_btls;
165165
/**< number of devices available to the openib component */
166166

167+
int ib_allowed_btls;
168+
/**< number of devices allowed to the openib component */
169+
167170
struct mca_btl_openib_module_t **openib_btls;
168171
/**< array of available BTLs */
169172

@@ -389,6 +392,7 @@ typedef struct mca_btl_openib_device_t {
389392
/* Whether this device supports eager RDMA */
390393
uint8_t use_eager_rdma;
391394
uint8_t btls; /** < number of btls using this device */
395+
uint8_t allowed_btls; /** < number of allowed btls using this device */
392396
opal_pointer_array_t *endpoints;
393397
opal_pointer_array_t *device_btls;
394398
uint16_t hp_cq_polls;
@@ -480,6 +484,7 @@ struct mca_btl_openib_module_t {
480484
uint8_t num_cpcs;
481485

482486
mca_btl_openib_device_t *device;
487+
char * device_name;
483488
uint8_t port_num; /**< ID of the PORT */
484489
uint16_t pkey_index;
485490
struct ibv_port_attr ib_port_attr;
@@ -501,6 +506,8 @@ struct mca_btl_openib_module_t {
501506
int local_procs; /** number of local procs */
502507

503508
bool atomic_ops_be; /** atomic result is big endian */
509+
510+
bool allowed; /** is this port allowed */
504511
};
505512
typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;
506513

0 commit comments

Comments
 (0)