Skip to content

Commit d7f6dd0

Browse files
authored
Merge pull request #6961 from hjelmn/fix_btl_vader_fragment_issue
btl/vader: when using single-copy emulation, fragment large RDMA operations
2 parents 884d4e7 + ae91b11 commit d7f6dd0

File tree

5 files changed

+90
-157
lines changed

5 files changed

+90
-157
lines changed

opal/mca/btl/vader/btl_vader_atomic.c

Lines changed: 10 additions & 73 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
/*
33
* Copyright (c) 2010-2017 Los Alamos National Security, LLC. All rights
44
* reserved.
5+
* Copyright (c) 2019 Google, Inc. All rights reserved.
56
* $COPYRIGHT$
67
*
78
* Additional copyrights may follow
@@ -16,58 +17,14 @@
1617
#include "btl_vader_endpoint.h"
1718
#include "btl_vader_xpmem.h"
1819

19-
static void mca_btl_vader_sc_emu_aop_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
20-
mca_btl_base_descriptor_t *desc, int status)
21-
{
22-
mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) desc;
23-
void *local_address = frag->rdma.local_address;
24-
void *context = frag->rdma.context;
25-
void *cbdata = frag->rdma.cbdata;
26-
mca_btl_base_rdma_completion_fn_t cbfunc = frag->rdma.cbfunc;
27-
28-
/* return the fragment first since the callback may call put/get/amo and could use this fragment */
29-
MCA_BTL_VADER_FRAG_RETURN(frag);
30-
31-
cbfunc (btl, endpoint, local_address, NULL, context, cbdata, status);
32-
}
33-
3420
int mca_btl_vader_emu_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
3521
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
3622
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
3723
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
3824
{
39-
mca_btl_vader_frag_t *frag;
40-
41-
frag = mca_btl_vader_rdma_frag_alloc (btl, endpoint, MCA_BTL_VADER_OP_ATOMIC, operand, 0, op, 0, order, flags, NULL,
42-
remote_address, cbfunc, cbcontext, cbdata, mca_btl_vader_sc_emu_aop_complete);
43-
if (OPAL_UNLIKELY(NULL == frag)) {
44-
return OPAL_ERR_OUT_OF_RESOURCE;
45-
}
46-
47-
/* send is always successful */
48-
(void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);
49-
50-
return OPAL_SUCCESS;
51-
}
52-
53-
static void mca_btl_vader_sc_emu_afop_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
54-
mca_btl_base_descriptor_t *desc, int status)
55-
{
56-
mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) desc;
57-
mca_btl_vader_sc_emu_hdr_t *hdr;
58-
void *local_address = frag->rdma.local_address;
59-
void *context = frag->rdma.context;
60-
void *cbdata = frag->rdma.cbdata;
61-
mca_btl_base_rdma_completion_fn_t cbfunc = frag->rdma.cbfunc;
62-
63-
hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;
64-
65-
*((int64_t *) frag->rdma.local_address) = hdr->operand[0];
66-
67-
/* return the fragment first since the callback may call put/get/amo and could use this fragment */
68-
MCA_BTL_VADER_FRAG_RETURN(frag);
69-
70-
cbfunc (btl, endpoint, local_address, NULL, context, cbdata, status);
25+
size_t size = (flags & MCA_BTL_ATOMIC_FLAG_32BIT) ? 4 : 8;
26+
return mca_btl_vader_rdma_frag_start (btl, endpoint, MCA_BTL_VADER_OP_ATOMIC, operand, 0, op, order, flags,
27+
size, NULL, remote_address, cbfunc, cbcontext, cbdata);
7128
}
7229

7330
int mca_btl_vader_emu_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
@@ -76,37 +33,17 @@ int mca_btl_vader_emu_afop (struct mca_btl_base_module_t *btl, struct mca_btl_ba
7633
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
7734
void *cbcontext, void *cbdata)
7835
{
79-
mca_btl_vader_frag_t *frag;
80-
81-
frag = mca_btl_vader_rdma_frag_alloc (btl, endpoint, MCA_BTL_VADER_OP_ATOMIC, operand, 0, op, 0, order, flags,
82-
local_address, remote_address, cbfunc, cbcontext, cbdata,
83-
mca_btl_vader_sc_emu_afop_complete);
84-
if (OPAL_UNLIKELY(NULL == frag)) {
85-
return OPAL_ERR_OUT_OF_RESOURCE;
86-
}
87-
88-
/* send is always successful */
89-
(void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);
90-
91-
return OPAL_SUCCESS;
36+
size_t size = (flags & MCA_BTL_ATOMIC_FLAG_32BIT) ? 4 : 8;
37+
return mca_btl_vader_rdma_frag_start (btl, endpoint, MCA_BTL_VADER_OP_ATOMIC, operand, 0, op, order, flags,
38+
size, local_address, remote_address, cbfunc, cbcontext, cbdata);
9239
}
9340

9441
int mca_btl_vader_emu_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
9542
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
9643
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
9744
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
9845
{
99-
mca_btl_vader_frag_t *frag;
100-
101-
frag = mca_btl_vader_rdma_frag_alloc (btl, endpoint, MCA_BTL_VADER_OP_CSWAP, compare, value, 0, 0, order,
102-
flags, local_address, remote_address, cbfunc, cbcontext, cbdata,
103-
mca_btl_vader_sc_emu_afop_complete);
104-
if (OPAL_UNLIKELY(NULL == frag)) {
105-
return OPAL_ERR_OUT_OF_RESOURCE;
106-
}
107-
108-
/* send is always successful */
109-
(void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);
110-
111-
return OPAL_SUCCESS;
46+
size_t size = (flags & MCA_BTL_ATOMIC_FLAG_32BIT) ? 4 : 8;
47+
return mca_btl_vader_rdma_frag_start (btl, endpoint, MCA_BTL_VADER_OP_CSWAP, compare, value, 0, order,
48+
flags, size, local_address, remote_address, cbfunc, cbcontext, cbdata);
11249
}

opal/mca/btl/vader/btl_vader_component.c

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
2222
* Copyright (c) 2018 Triad National Security, LLC. All rights
2323
* reserved.
24+
* Copyright (c) 2019 Google, Inc. All rights reserved.
2425
* $COPYRIGHT$
2526
*
2627
* Additional copyrights may follow
@@ -478,12 +479,6 @@ static void mca_btl_vader_check_single_copy (void)
478479
mca_btl_vader.super.btl_get = NULL;
479480
mca_btl_vader.super.btl_put = NULL;
480481
}
481-
482-
if (MCA_BTL_VADER_EMUL == mca_btl_vader_component.single_copy_mechanism) {
483-
/* limit to the maximum fragment size */
484-
mca_btl_vader.super.btl_put_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t);
485-
mca_btl_vader.super.btl_get_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t);
486-
}
487482
}
488483

489484
/*

opal/mca/btl/vader/btl_vader_frag.h

Lines changed: 73 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@
1414
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
1515
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
1616
* reserved.
17+
* Copyright (c) 2019 Google, Inc. All rights reserved.
1718
* $COPYRIGHT$
1819
*
1920
* Additional copyrights may follow
@@ -91,9 +92,12 @@ struct mca_btl_vader_frag_t {
9192
/** rdma callback data */
9293
struct mca_btl_vader_rdma_cbdata_t {
9394
void *local_address;
95+
uint64_t remote_address;
9496
mca_btl_base_rdma_completion_fn_t cbfunc;
9597
void *context;
9698
void *cbdata;
99+
size_t remaining;
100+
size_t sent;
97101
} rdma;
98102
};
99103

@@ -151,28 +155,87 @@ static inline void mca_btl_vader_frag_complete (mca_btl_vader_frag_t *frag) {
151155

152156
int mca_btl_vader_frag_init (opal_free_list_item_t *item, void *ctx);
153157

154-
static inline mca_btl_vader_frag_t *
155-
mca_btl_vader_rdma_frag_alloc (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, int type,
158+
static inline void mca_btl_vader_rdma_frag_advance (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
159+
mca_btl_vader_frag_t *frag, int status)
160+
{
161+
mca_btl_vader_sc_emu_hdr_t *hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;
162+
mca_btl_base_rdma_completion_fn_t cbfunc = frag->rdma.cbfunc;
163+
size_t hdr_size = sizeof (*hdr);
164+
size_t len = frag->rdma.sent ? frag->segments[0].seg_len - hdr_size : 0;
165+
void *context = frag->rdma.context;
166+
void *cbdata = frag->rdma.cbdata;
167+
void *data = (void *) (hdr + 1);
168+
169+
if (frag->rdma.sent) {
170+
if (MCA_BTL_VADER_OP_GET == hdr->type) {
171+
memcpy (frag->rdma.local_address, data, len);
172+
} else if ((MCA_BTL_VADER_OP_ATOMIC == hdr->type || MCA_BTL_VADER_OP_CSWAP == hdr->type) &&
173+
frag->rdma.local_address) {
174+
if (8 == len) {
175+
*((int64_t *) frag->rdma.local_address) = hdr->operand[0];
176+
} else {
177+
*((int32_t *) frag->rdma.local_address) = (int32_t) hdr->operand[0];
178+
}
179+
}
180+
}
181+
182+
if (frag->rdma.remaining) {
183+
size_t packet_size = (frag->rdma.remaining + hdr_size) <= mca_btl_vader.super.btl_max_send_size ?
184+
frag->rdma.remaining : mca_btl_vader.super.btl_max_send_size - hdr_size;
185+
186+
/* advance the local and remote pointers */
187+
frag->rdma.local_address = (void *)((uintptr_t) frag->rdma.local_address + len);
188+
frag->rdma.remote_address += len;
189+
190+
if (MCA_BTL_VADER_OP_PUT == hdr->type) {
191+
/* copy the next block into the fragment buffer */
192+
memcpy ((void *) (hdr + 1), frag->rdma.local_address, packet_size);
193+
}
194+
195+
hdr->addr = frag->rdma.remote_address;
196+
/* clear out the complete flag before sending the fragment again */
197+
frag->hdr->flags &= ~MCA_BTL_VADER_FLAG_COMPLETE;
198+
frag->segments[0].seg_len = packet_size + sizeof (*hdr);
199+
frag->rdma.sent += packet_size;
200+
frag->rdma.remaining -= packet_size;
201+
202+
/* send is always successful */
203+
(void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);
204+
return;
205+
}
206+
207+
/* return the fragment before calling the callback */
208+
MCA_BTL_VADER_FRAG_RETURN(frag);
209+
cbfunc (btl, endpoint, (void *)((uintptr_t) frag->rdma.local_address - frag->rdma.sent), NULL,
210+
context, cbdata, status);
211+
}
212+
213+
static inline int
214+
mca_btl_vader_rdma_frag_start (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, int type,
156215
uint64_t operand1, uint64_t operand2, mca_btl_base_atomic_op_t op, int order,
157216
int flags, size_t size, void *local_address, int64_t remote_address,
158-
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
159-
void *cbdata, mca_btl_base_completion_fn_t des_cbfunc)
217+
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
160218
{
161219
mca_btl_vader_sc_emu_hdr_t *hdr;
162-
size_t total_size = size + sizeof (*hdr);
220+
size_t hdr_size = sizeof (*hdr);
221+
size_t packet_size = (size + hdr_size) <= mca_btl_vader.super.btl_max_send_size ? size :
222+
mca_btl_vader.super.btl_max_send_size - hdr_size;
163223
mca_btl_vader_frag_t *frag;
164224

165-
frag = (mca_btl_vader_frag_t *) mca_btl_vader_alloc (btl, endpoint, order, total_size,
225+
frag = (mca_btl_vader_frag_t *) mca_btl_vader_alloc (btl, endpoint, order, packet_size + hdr_size,
166226
MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
167227
if (OPAL_UNLIKELY(NULL == frag)) {
168-
return NULL;
228+
return OPAL_ERR_OUT_OF_RESOURCE;
169229
}
170230

171-
frag->base.des_cbfunc = des_cbfunc;
231+
frag->base.des_cbfunc = (mca_btl_base_completion_fn_t) mca_btl_vader_rdma_frag_advance;
172232
frag->rdma.local_address = local_address;
233+
frag->rdma.remote_address = remote_address;
173234
frag->rdma.cbfunc = cbfunc;
174235
frag->rdma.context = cbcontext;
175236
frag->rdma.cbdata = cbdata;
237+
frag->rdma.remaining = size;
238+
frag->rdma.sent = 0;
176239

177240
hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;
178241

@@ -183,7 +246,8 @@ mca_btl_vader_rdma_frag_alloc (mca_btl_base_module_t *btl, mca_btl_base_endpoint
183246
hdr->operand[0] = operand1;
184247
hdr->operand[1] = operand2;
185248

186-
return frag;
249+
mca_btl_vader_rdma_frag_advance (btl, endpoint, frag, OPAL_SUCCESS);
250+
return OPAL_SUCCESS;
187251
}
188252

189253
#endif /* MCA_BTL_VADER_SEND_FRAG_H */

opal/mca/btl/vader/btl_vader_get.c

Lines changed: 3 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
* reserved.
55
* Copyright (c) 2018 Research Organization for Information Science
66
* and Technology (RIST). All rights reserved.
7+
* Copyright (c) 2019 Google, Inc. All rights reserved.
78
* $COPYRIGHT$
89
*
910
* Additional copyrights may follow
@@ -156,49 +157,15 @@ int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
156157
}
157158
#endif
158159

159-
static void mca_btl_vader_sc_emu_get_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
160-
mca_btl_base_descriptor_t *desc, int status)
161-
{
162-
mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) desc;
163-
mca_btl_vader_sc_emu_hdr_t *hdr;
164-
void *local_address = frag->rdma.local_address;
165-
size_t len = frag->segments[0].seg_len - sizeof (*hdr);
166-
void *context = frag->rdma.context;
167-
void *cbdata = frag->rdma.cbdata;
168-
mca_btl_base_rdma_completion_fn_t cbfunc = frag->rdma.cbfunc;
169-
void *data;
170-
171-
hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;
172-
data = (void *) (hdr + 1);
173-
174-
memcpy (local_address, data, len);
175-
176-
/* return the fragment before calling the callback */
177-
MCA_BTL_VADER_FRAG_RETURN(frag);
178-
179-
cbfunc (btl, endpoint, local_address, NULL, context, cbdata, status);
180-
}
181-
182160
int mca_btl_vader_get_sc_emu (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
183161
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
184162
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
185163
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
186164
{
187-
mca_btl_vader_frag_t *frag;
188-
189165
if (size > mca_btl_vader.super.btl_get_limit) {
190166
return OPAL_ERR_NOT_AVAILABLE;
191167
}
192168

193-
frag = mca_btl_vader_rdma_frag_alloc (btl, endpoint, MCA_BTL_VADER_OP_GET, 0, 0, 0, order, flags, size,
194-
local_address, remote_address, cbfunc, cbcontext, cbdata,
195-
mca_btl_vader_sc_emu_get_complete);
196-
if (OPAL_UNLIKELY(NULL == frag)) {
197-
return OPAL_ERR_OUT_OF_RESOURCE;
198-
}
199-
200-
/* send is always successful */
201-
(void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);
202-
203-
return OPAL_SUCCESS;
169+
return mca_btl_vader_rdma_frag_start (btl, endpoint, MCA_BTL_VADER_OP_GET, 0, 0, 0, order, flags, size,
170+
local_address, remote_address, cbfunc, cbcontext, cbdata);
204171
}

opal/mca/btl/vader/btl_vader_put.c

Lines changed: 3 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
* reserved.
55
* Copyright (c) 2014-2018 Research Organization for Information Science
66
* and Technology (RIST). All rights reserved.
7+
* Copyright (c) 2019 Google, Inc. All rights reserved.
78
* $COPYRIGHT$
89
*
910
* Additional copyrights may follow
@@ -135,21 +136,6 @@ int mca_btl_vader_put_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
135136
}
136137
#endif
137138

138-
static void mca_btl_vader_sc_emu_put_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
139-
mca_btl_base_descriptor_t *desc, int status)
140-
{
141-
mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) desc;
142-
void *local_address = frag->rdma.local_address;
143-
void *context = frag->rdma.context;
144-
void *cbdata = frag->rdma.cbdata;
145-
mca_btl_base_rdma_completion_fn_t cbfunc = frag->rdma.cbfunc;
146-
147-
/* return the fragment first since the callback may call put/get/amo and could use this fragment */
148-
MCA_BTL_VADER_FRAG_RETURN(frag);
149-
150-
cbfunc (btl, endpoint, local_address, NULL, context, cbdata, status);
151-
}
152-
153139
/**
154140
* @brief Provides an emulated put path which uses copy-in copy-out with shared memory buffers
155141
*/
@@ -158,26 +144,10 @@ int mca_btl_vader_put_sc_emu (mca_btl_base_module_t *btl, mca_btl_base_endpoint_
158144
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
159145
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
160146
{
161-
mca_btl_vader_sc_emu_hdr_t *hdr;
162-
mca_btl_vader_frag_t *frag;
163-
164147
if (size > mca_btl_vader.super.btl_put_limit) {
165148
return OPAL_ERR_NOT_AVAILABLE;
166149
}
167150

168-
frag = mca_btl_vader_rdma_frag_alloc (btl, endpoint, MCA_BTL_VADER_OP_PUT, 0, 0, 0, order, flags, size,
169-
local_address, remote_address, cbfunc, cbcontext, cbdata,
170-
mca_btl_vader_sc_emu_put_complete);
171-
if (OPAL_UNLIKELY(NULL == frag)) {
172-
return OPAL_ERR_OUT_OF_RESOURCE;
173-
}
174-
175-
hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;
176-
177-
memcpy ((void *) (hdr + 1), local_address, size);
178-
179-
/* send is always successful */
180-
(void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);
181-
182-
return OPAL_SUCCESS;
151+
return mca_btl_vader_rdma_frag_start (btl, endpoint, MCA_BTL_VADER_OP_PUT, 0, 0, 0, order, flags, size,
152+
local_address, remote_address, cbfunc, cbcontext, cbdata);
183153
}

0 commit comments

Comments
 (0)