Skip to content

Commit 0a2ce58

Browse files
committed
btl/openib: delay UCX warning to add_procs()
If UCX is available, then pml/ucx will be used instead of pml/ob1 + btl/openib, so there is no need to warn about btl/openib not supporting InfiniBand.

Signed-off-by: Gilles Gouaillardet <[email protected]>
1 parent c5292a0 commit 0a2ce58

File tree

4 files changed

+171
-127
lines changed

4 files changed

+171
-127
lines changed

opal/mca/btl/openib/btl_openib.c

Lines changed: 70 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
* Copyright (c) 2009 IBM Corporation. All rights reserved.
2020
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
2121
* Copyright (c) 2013-2015 NVIDIA Corporation. All rights reserved.
22-
* Copyright (c) 2014-2015 Research Organization for Information Science
23-
* and Technology (RIST). All rights reserved.
22+
* Copyright (c) 2014-2018 Research Organization for Information Science
23+
* and Technology (RIST). All rights reserved.
2424
* Copyright (c) 2014 Bull SAS. All rights reserved
2525
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
2626
* $COPYRIGHT$
@@ -1042,6 +1042,14 @@ int mca_btl_openib_add_procs(
10421042
int btl_rank = 0;
10431043
volatile mca_btl_base_endpoint_t* endpoint;
10441044

1045+
1046+
if (! openib_btl->allowed) {
1047+
opal_bitmap_clear_all_bits(reachable);
1048+
opal_show_help("help-mpi-btl-openib.txt", "ib port not selected",
1049+
true, opal_process_info.nodename,
1050+
ibv_get_device_name(openib_btl->device->ib_dev), openib_btl->port_num);
1051+
}
1052+
10451053
btl_rank = get_openib_btl_params(openib_btl, &lcl_subnet_id_port_cnt);
10461054
if( 0 > btl_rank ){
10471055
return OPAL_ERR_NOT_FOUND;
@@ -1641,75 +1649,77 @@ static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl)
16411649
return OPAL_SUCCESS;
16421650
}
16431651

1644-
/* Release all QPs */
1645-
if (NULL != openib_btl->device->endpoints) {
1646-
for (ep_index=0;
1647-
ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints);
1648-
ep_index++) {
1649-
endpoint=(mca_btl_openib_endpoint_t *)opal_pointer_array_get_item(openib_btl->device->endpoints,
1652+
if (openib_btl->allowed) {
1653+
/* Release all QPs */
1654+
if (NULL != openib_btl->device->endpoints) {
1655+
for (ep_index=0;
1656+
ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints);
1657+
ep_index++) {
1658+
endpoint=(mca_btl_openib_endpoint_t *)opal_pointer_array_get_item(openib_btl->device->endpoints,
16501659
ep_index);
1651-
if(!endpoint) {
1652-
BTL_VERBOSE(("In finalize, got another null endpoint"));
1653-
continue;
1654-
}
1655-
if(endpoint->endpoint_btl != openib_btl) {
1656-
continue;
1657-
}
1658-
for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) {
1659-
if(openib_btl->device->eager_rdma_buffers[i] == endpoint) {
1660-
openib_btl->device->eager_rdma_buffers[i] = NULL;
1661-
OBJ_RELEASE(endpoint);
1660+
if(!endpoint) {
1661+
BTL_VERBOSE(("In finalize, got another null endpoint"));
1662+
continue;
16621663
}
1664+
if(endpoint->endpoint_btl != openib_btl) {
1665+
continue;
1666+
}
1667+
for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) {
1668+
if(openib_btl->device->eager_rdma_buffers[i] == endpoint) {
1669+
openib_btl->device->eager_rdma_buffers[i] = NULL;
1670+
OBJ_RELEASE(endpoint);
1671+
}
1672+
}
1673+
opal_pointer_array_set_item(openib_btl->device->endpoints,
1674+
ep_index, NULL);
1675+
assert(((opal_object_t*)endpoint)->obj_reference_count == 1);
1676+
OBJ_RELEASE(endpoint);
16631677
}
1664-
opal_pointer_array_set_item(openib_btl->device->endpoints,
1665-
ep_index, NULL);
1666-
assert(((opal_object_t*)endpoint)->obj_reference_count == 1);
1667-
OBJ_RELEASE(endpoint);
16681678
}
1669-
}
1670-
1671-
/* Release SRQ resources */
1672-
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
1673-
if(!BTL_OPENIB_QP_TYPE_PP(qp)) {
1674-
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
1675-
&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
1676-
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
1677-
&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
1678-
if (NULL != openib_btl->qps[qp].u.srq_qp.srq) {
1679-
opal_mutex_t *lock =
1680-
&mca_btl_openib_component.srq_manager.lock;
16811679

1682-
opal_hash_table_t *srq_addr_table =
1683-
&mca_btl_openib_component.srq_manager.srq_addr_table;
1684-
1685-
opal_mutex_lock(lock);
1686-
if (OPAL_SUCCESS !=
1687-
opal_hash_table_remove_value_ptr(srq_addr_table,
1688-
&openib_btl->qps[qp].u.srq_qp.srq,
1689-
sizeof(struct ibv_srq *))) {
1690-
BTL_VERBOSE(("Failed to remove SRQ %d entry from hash table.", qp));
1691-
rc = OPAL_ERROR;
1692-
}
1693-
opal_mutex_unlock(lock);
1694-
if (0 != ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)) {
1695-
BTL_VERBOSE(("Failed to close SRQ %d", qp));
1696-
rc = OPAL_ERROR;
1680+
/* Release SRQ resources */
1681+
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
1682+
if(!BTL_OPENIB_QP_TYPE_PP(qp)) {
1683+
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
1684+
&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
1685+
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
1686+
&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
1687+
if (NULL != openib_btl->qps[qp].u.srq_qp.srq) {
1688+
opal_mutex_t *lock =
1689+
&mca_btl_openib_component.srq_manager.lock;
1690+
1691+
opal_hash_table_t *srq_addr_table =
1692+
&mca_btl_openib_component.srq_manager.srq_addr_table;
1693+
1694+
opal_mutex_lock(lock);
1695+
if (OPAL_SUCCESS !=
1696+
opal_hash_table_remove_value_ptr(srq_addr_table,
1697+
&openib_btl->qps[qp].u.srq_qp.srq,
1698+
sizeof(struct ibv_srq *))) {
1699+
BTL_VERBOSE(("Failed to remove SRQ %d entry from hash table.", qp));
1700+
rc = OPAL_ERROR;
1701+
}
1702+
opal_mutex_unlock(lock);
1703+
if (0 != ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)) {
1704+
BTL_VERBOSE(("Failed to close SRQ %d", qp));
1705+
rc = OPAL_ERROR;
1706+
}
16971707
}
1698-
}
16991708

1700-
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
1701-
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
1709+
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
1710+
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
1711+
}
17021712
}
1703-
}
17041713

1705-
/* Finalize the CPC modules on this openib module */
1706-
for (i = 0; i < openib_btl->num_cpcs; ++i) {
1707-
if (NULL != openib_btl->cpcs[i]->cbm_finalize) {
1708-
openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]);
1714+
/* Finalize the CPC modules on this openib module */
1715+
for (i = 0; i < openib_btl->num_cpcs; ++i) {
1716+
if (NULL != openib_btl->cpcs[i]->cbm_finalize) {
1717+
openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]);
1718+
}
1719+
free(openib_btl->cpcs[i]);
17091720
}
1710-
free(openib_btl->cpcs[i]);
1721+
free(openib_btl->cpcs);
17111722
}
1712-
free(openib_btl->cpcs);
17131723

17141724
/* Release device if there are no more users */
17151725
if(!(--openib_btl->device->btls)) {

opal/mca/btl/openib/btl_openib.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
1919
* Copyright (c) 2013-2014 NVIDIA Corporation. All rights reserved.
2020
* Copyright (c) 2014 Bull SAS. All rights reserved.
21-
* Copyright (c) 2015-2016 Research Organization for Information Science
22-
* and Technology (RIST). All rights reserved.
21+
* Copyright (c) 2015-2018 Research Organization for Information Science
22+
* and Technology (RIST). All rights reserved.
2323
* $COPYRIGHT$
2424
*
2525
* Additional copyrights may follow
@@ -164,6 +164,9 @@ struct mca_btl_openib_component_t {
164164
int ib_num_btls;
165165
/**< number of devices available to the openib component */
166166

167+
int ib_allowed_btls;
168+
/**< number of devices allowed to the openib component */
169+
167170
struct mca_btl_openib_module_t **openib_btls;
168171
/**< array of available BTLs */
169172

@@ -501,6 +504,8 @@ struct mca_btl_openib_module_t {
501504
int local_procs; /** number of local procs */
502505

503506
bool atomic_ops_be; /** atomic result is big endian */
507+
508+
bool allowed; /** is this port allowed */
504509
};
505510
typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;
506511

0 commit comments

Comments
 (0)