Skip to content

Commit 4079eec

Browse files
committed
pml/ob1: be more selective when using rdma capable btls
This commit updates the btl selection logic for the RDMA and RDMA pipeline protocols to use a btl iff: 1) the btl is also used for eager messages (high exclusivity), or 2) no other RDMA btl is available on an endpoint and the pml_ob1_use_all_rdma MCA variable is true. This fixes a performance regression with shared memory when an RDMA capable network is available. Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 5ced037 commit 4079eec

File tree

5 files changed

+85
-11
lines changed

5 files changed

+85
-11
lines changed

ompi/mca/pml/ob1/pml_ob1.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved
1414
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
15-
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
15+
* Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights
1616
* reserved.
1717
* Copyright (c) 2015 Research Organization for Information Science
1818
* and Technology (RIST). All rights reserved.
@@ -61,6 +61,7 @@ struct mca_pml_ob1_t {
6161
int max_rdma_per_request;
6262
int max_send_per_range;
6363
bool leave_pinned;
64+
bool use_all_rdma;
6465
int leave_pinned_pipeline;
6566

6667
/* lock queue access */

ompi/mca/pml/ob1/pml_ob1_component.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,12 @@ static int mca_pml_ob1_component_register(void)
198198

199199
mca_pml_ob1_param_register_uint("unexpected_limit", 128, &mca_pml_ob1.unexpected_limit);
200200

201+
mca_pml_ob1.use_all_rdma = false;
202+
(void) mca_base_component_var_register(&mca_pml_ob1_component.pmlm_version, "use_all_rdma",
203+
"Use all available RDMA btls for the RDMA and RDMA pipeline protocols "
204+
"(default: false)", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
205+
OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_GROUP, &mca_pml_ob1.use_all_rdma);
206+
201207
mca_pml_ob1.allocator_name = "bucket";
202208
(void) mca_base_component_var_register(&mca_pml_ob1_component.pmlm_version, "allocator",
203209
"Name of allocator component for unexpected messages",

ompi/mca/pml/ob1/pml_ob1_rdma.c

Lines changed: 70 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* University of Stuttgart. All rights reserved.
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
13-
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
13+
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
1414
* reserved.
1515
* $COPYRIGHT$
1616
*
@@ -42,6 +42,7 @@ size_t mca_pml_ob1_rdma_btls(
4242
mca_pml_ob1_com_btl_t* rdma_btls)
4343
{
4444
int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
45+
int num_eager_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager);
4546
double weight_total = 0;
4647
int num_btls_used = 0;
4748

@@ -57,6 +58,22 @@ size_t mca_pml_ob1_rdma_btls(
5758
(bml_endpoint->btl_rdma_index + n) % num_btls);
5859
mca_btl_base_registration_handle_t *reg_handle = NULL;
5960
mca_btl_base_module_t *btl = bml_btl->btl;
61+
/* NTH: go ahead and use an rdma btl if it is the only one */
62+
bool ignore = !mca_pml_ob1.use_all_rdma;
63+
64+
/* do not use rdma btls that are not in the eager list. this is necessary to avoid using
65+
* btls that exist on the endpoint only to support RMA. */
66+
for (int i = 0 ; i < num_eager_btls && ignore ; ++i) {
67+
mca_bml_base_btl_t *eager_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_eager, i);
68+
if (eager_btl->btl_endpoint == bml_btl->btl_endpoint) {
69+
ignore = false;
70+
break;
71+
}
72+
}
73+
74+
if (ignore) {
75+
continue;
76+
}
6077

6178
if (btl->btl_register_mem) {
6279
/* do not use the RDMA protocol with this btl if 1) leave pinned is disabled,
@@ -95,22 +112,66 @@ size_t mca_pml_ob1_rdma_btls(
95112
return num_btls_used;
96113
}
97114

115+
size_t mca_pml_ob1_rdma_pipeline_btls_count (mca_bml_base_endpoint_t* bml_endpoint)
116+
{
117+
int num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma);
118+
int num_eager_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager);
119+
int rdma_count = 0;
120+
121+
for(int i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; ++i) {
122+
mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
123+
/* NTH: go ahead and use an rdma btl if it is the only one */
124+
bool ignore = !mca_pml_ob1.use_all_rdma;
125+
126+
for (int i = 0 ; i < num_eager_btls && ignore ; ++i) {
127+
mca_bml_base_btl_t *eager_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_eager, i);
128+
if (eager_btl->btl_endpoint == bml_btl->btl_endpoint) {
129+
ignore = false;
130+
break;
131+
}
132+
}
133+
134+
if (!ignore) {
135+
++rdma_count;
136+
}
137+
}
138+
139+
return rdma_count;
140+
}
141+
98142
size_t mca_pml_ob1_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
99143
size_t size,
100144
mca_pml_ob1_com_btl_t* rdma_btls )
101145
{
102-
int i, num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
146+
int num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma);
147+
int num_eager_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager);
103148
double weight_total = 0;
149+
int rdma_count = 0;
150+
151+
for(int i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
152+
mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
153+
/* NTH: go ahead and use an rdma btl if it is the only one */
154+
bool ignore = !mca_pml_ob1.use_all_rdma;
155+
156+
for (int i = 0 ; i < num_eager_btls && ignore ; ++i) {
157+
mca_bml_base_btl_t *eager_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_eager, i);
158+
if (eager_btl->btl_endpoint == bml_btl->btl_endpoint) {
159+
ignore = false;
160+
break;
161+
}
162+
}
163+
164+
if (ignore) {
165+
continue;
166+
}
104167

105-
for(i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
106-
rdma_btls[i].bml_btl =
107-
mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
108-
rdma_btls[i].btl_reg = NULL;
168+
rdma_btls[rdma_count].bml_btl = bml_btl;
169+
rdma_btls[rdma_count++].btl_reg = NULL;
109170

110-
weight_total += rdma_btls[i].bml_btl->btl_weight;
171+
weight_total += bml_btl->btl_weight;
111172
}
112173

113-
mca_pml_ob1_calc_weighted_length(rdma_btls, i, size, weight_total);
174+
mca_pml_ob1_calc_weighted_length (rdma_btls, rdma_count, size, weight_total);
114175

115-
return i;
176+
return rdma_count;
116177
}

ompi/mca/pml/ob1/pml_ob1_rdma.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
12
/*
23
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
34
* University Research and Technology
@@ -9,6 +10,8 @@
910
* University of Stuttgart. All rights reserved.
1011
* Copyright (c) 2004-2005 The Regents of the University of California.
1112
* All rights reserved.
13+
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
14+
* reserved.
1215
* $COPYRIGHT$
1316
*
1417
* Additional copyrights may follow
@@ -37,5 +40,8 @@ size_t mca_pml_ob1_rdma_btls(struct mca_bml_base_endpoint_t* endpoint,
3740
* bandwidth */
3841
size_t mca_pml_ob1_rdma_pipeline_btls(struct mca_bml_base_endpoint_t* endpoint,
3942
size_t size, mca_pml_ob1_com_btl_t* rdma_btls);
43+
44+
size_t mca_pml_ob1_rdma_pipeline_btls_count (mca_bml_base_endpoint_t* bml_endpoint);
45+
4046
#endif
4147

ompi/mca/pml/ob1/pml_ob1_recvreq.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ static int mca_pml_ob1_recv_request_ack(
263263
/* by default copy everything */
264264
recvreq->req_send_offset = bytes_received;
265265
if(hdr->hdr_msg_length > bytes_received) {
266-
size_t rdma_num = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
266+
size_t rdma_num = mca_pml_ob1_rdma_pipeline_btls_count (bml_endpoint);
267267
/*
268268
* lookup request buffer to determine if memory is already
269269
* registered.

0 commit comments

Comments
 (0)