18
18
* Author(s): Torsten Hoefler <[email protected] >
19
19
*
20
20
*/
21
+ #include "opal/include/opal/align.h"
22
+ #include "ompi/op/op.h"
23
+
21
24
#include "nbc_internal.h"
22
25
26
+ static inline int scan_sched_linear (
27
+ int rank , int comm_size , const void * sendbuf , void * recvbuf , int count ,
28
+ MPI_Datatype datatype , MPI_Op op , char inplace , NBC_Schedule * schedule ,
29
+ void * tmpbuf );
30
+ static inline int scan_sched_recursivedoubling (
31
+ int rank , int comm_size , const void * sendbuf , void * recvbuf ,
32
+ int count , MPI_Datatype datatype , MPI_Op op , char inplace ,
33
+ NBC_Schedule * schedule , void * tmpbuf1 , void * tmpbuf2 );
34
+
23
35
#ifdef NBC_CACHE_SCHEDULE
24
36
/* tree comparison function for schedule cache */
25
37
int NBC_Scan_args_compare (NBC_Scan_args * a , NBC_Scan_args * b , void * param ) {
@@ -39,27 +51,41 @@ int NBC_Scan_args_compare(NBC_Scan_args *a, NBC_Scan_args *b, void *param) {
39
51
}
40
52
#endif
41
53
42
- /* linear iscan
43
- * working principle:
44
- * 1. each node (but node 0) receives from left neighbor
45
- * 2. performs op
46
- * 3. all ranks but p-1 send the result to their right neighbor and exit
47
- *
48
- */
49
54
static int nbc_scan_init (const void * sendbuf , void * recvbuf , int count , MPI_Datatype datatype , MPI_Op op ,
50
55
struct ompi_communicator_t * comm , ompi_request_t * * request ,
51
56
struct mca_coll_base_module_2_3_0_t * module , bool persistent ) {
52
- int rank , p , res ;
53
- ptrdiff_t gap , span ;
54
- NBC_Schedule * schedule ;
55
- void * tmpbuf = NULL ;
56
- char inplace ;
57
- ompi_coll_libnbc_module_t * libnbc_module = (ompi_coll_libnbc_module_t * ) module ;
57
+ int rank , p , res ;
58
+ ptrdiff_t gap , span ;
59
+ NBC_Schedule * schedule ;
60
+ void * tmpbuf = NULL , * tmpbuf1 = NULL , * tmpbuf2 = NULL ;
61
+ enum { NBC_SCAN_LINEAR , NBC_SCAN_RDBL } alg ;
62
+ char inplace ;
63
+ ompi_coll_libnbc_module_t * libnbc_module = (ompi_coll_libnbc_module_t * ) module ;
64
+
65
+ NBC_IN_PLACE (sendbuf , recvbuf , inplace );
58
66
59
- NBC_IN_PLACE (sendbuf , recvbuf , inplace );
67
+ rank = ompi_comm_rank (comm );
68
+ p = ompi_comm_size (comm );
60
69
61
- rank = ompi_comm_rank (comm );
62
- p = ompi_comm_size (comm );
70
+ if (count == 0 ) {
71
+ return nbc_get_noop_request (persistent , request );
72
+ }
73
+
74
+ span = opal_datatype_span (& datatype -> super , count , & gap );
75
+ if (libnbc_iscan_algorithm == 2 ) {
76
+ alg = NBC_SCAN_RDBL ;
77
+ ptrdiff_t span_align = OPAL_ALIGN (span , datatype -> super .align , ptrdiff_t );
78
+ tmpbuf = malloc (span_align + span );
79
+ if (NULL == tmpbuf ) { return OMPI_ERR_OUT_OF_RESOURCE ; }
80
+ tmpbuf1 = (void * )(- gap );
81
+ tmpbuf2 = (char * )(span_align ) - gap ;
82
+ } else {
83
+ alg = NBC_SCAN_LINEAR ;
84
+ if (rank > 0 ) {
85
+ tmpbuf = malloc (span );
86
+ if (NULL == tmpbuf ) { return OMPI_ERR_OUT_OF_RESOURCE ; }
87
+ }
88
+ }
63
89
64
90
#ifdef NBC_CACHE_SCHEDULE
65
91
NBC_Scan_args * args , * found , search ;
@@ -75,60 +101,28 @@ static int nbc_scan_init(const void* sendbuf, void* recvbuf, int count, MPI_Data
75
101
#endif
76
102
schedule = OBJ_NEW (NBC_Schedule );
77
103
if (OPAL_UNLIKELY (NULL == schedule )) {
78
- return OMPI_ERR_OUT_OF_RESOURCE ;
104
+ free (tmpbuf );
105
+ return OMPI_ERR_OUT_OF_RESOURCE ;
79
106
}
80
107
81
- if (!inplace ) {
82
- /* copy data to receivebuf */
83
- res = NBC_Sched_copy ((void * )sendbuf , false, count , datatype ,
84
- recvbuf , false, count , datatype , schedule , false);
85
- if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
86
- OBJ_RELEASE (schedule );
87
- return res ;
88
- }
108
+ if (alg == NBC_SCAN_LINEAR ) {
109
+ res = scan_sched_linear (rank , p , sendbuf , recvbuf , count , datatype ,
110
+ op , inplace , schedule , tmpbuf );
111
+ } else {
112
+ res = scan_sched_recursivedoubling (rank , p , sendbuf , recvbuf , count ,
113
+ datatype , op , inplace , schedule , tmpbuf1 , tmpbuf2 );
89
114
}
90
-
91
- if (rank != 0 ) {
92
- span = opal_datatype_span (& datatype -> super , count , & gap );
93
- tmpbuf = malloc (span );
94
- if (NULL == tmpbuf ) {
95
- OBJ_RELEASE (schedule );
96
- return OMPI_ERR_OUT_OF_RESOURCE ;
97
- }
98
-
99
- /* we have to wait until we have the data */
100
- res = NBC_Sched_recv ((void * )(- gap ), true, count , datatype , rank - 1 , schedule , true);
101
- if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
102
- OBJ_RELEASE (schedule );
103
- free (tmpbuf );
104
- return res ;
105
- }
106
-
107
- /* perform the reduce in my local buffer */
108
- /* this cannot be done until tmpbuf is unused :-( so barrier after the op */
109
- res = NBC_Sched_op ((void * )(- gap ), true, recvbuf , false, count , datatype , op , schedule ,
110
- true);
111
- if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
115
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
112
116
OBJ_RELEASE (schedule );
113
117
free (tmpbuf );
114
118
return res ;
115
- }
116
119
}
117
120
118
- if (rank != p - 1 ) {
119
- res = NBC_Sched_send (recvbuf , false, count , datatype , rank + 1 , schedule , false);
120
- if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
121
+ res = NBC_Sched_commit (schedule );
122
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
121
123
OBJ_RELEASE (schedule );
122
124
free (tmpbuf );
123
125
return res ;
124
- }
125
- }
126
-
127
- res = NBC_Sched_commit (schedule );
128
- if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
129
- OBJ_RELEASE (schedule );
130
- free (tmpbuf );
131
- return res ;
132
126
}
133
127
134
128
#ifdef NBC_CACHE_SCHEDULE
@@ -162,14 +156,160 @@ static int nbc_scan_init(const void* sendbuf, void* recvbuf, int count, MPI_Data
162
156
}
163
157
#endif
164
158
165
- res = NBC_Schedule_request (schedule , comm , libnbc_module , persistent , request , tmpbuf );
166
- if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
167
- OBJ_RELEASE (schedule );
168
- free (tmpbuf );
159
+ res = NBC_Schedule_request (schedule , comm , libnbc_module , persistent , request , tmpbuf );
160
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
161
+ OBJ_RELEASE (schedule );
162
+ free (tmpbuf );
163
+ return res ;
164
+ }
165
+
166
+ return OMPI_SUCCESS ;
167
+ }
168
+
169
+ /*
170
+ * scan_sched_linear:
171
+ *
172
+ * Function: Linear algorithm for inclusive scan.
173
+ * Accepts: Same as MPI_Iscan
174
+ * Returns: MPI_SUCCESS or error code
175
+ *
176
+ * Working principle:
177
+ * 1. Each process (but process 0) receives from left neighbor
178
+ * 2. Performs op
179
+ * 3. All ranks but p-1 send the result to their right neighbor and exit
180
+ *
181
+ * Schedule length: O(1)
182
+ */
183
+ static inline int scan_sched_linear (
184
+ int rank , int comm_size , const void * sendbuf , void * recvbuf , int count ,
185
+ MPI_Datatype datatype , MPI_Op op , char inplace , NBC_Schedule * schedule ,
186
+ void * tmpbuf )
187
+ {
188
+ int res = OMPI_SUCCESS ;
189
+
190
+ if (!inplace ) {
191
+ /* Copy data to recvbuf */
192
+ res = NBC_Sched_copy ((void * )sendbuf , false, count , datatype ,
193
+ recvbuf , false, count , datatype , schedule , false);
194
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
195
+ }
196
+
197
+ if (rank > 0 ) {
198
+ ptrdiff_t gap ;
199
+ opal_datatype_span (& datatype -> super , count , & gap );
200
+ /* We have to wait until we have the data */
201
+ res = NBC_Sched_recv ((void * )(- gap ), true, count , datatype , rank - 1 , schedule , true);
202
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
203
+
204
+ /* Perform the reduce in my local buffer */
205
+ /* this cannot be done until tmpbuf is unused :-( so barrier after the op */
206
+ res = NBC_Sched_op ((void * )(- gap ), true, recvbuf , false, count , datatype , op , schedule ,
207
+ true);
208
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
209
+ }
210
+
211
+ if (rank != comm_size - 1 ) {
212
+ res = NBC_Sched_send (recvbuf , false, count , datatype , rank + 1 , schedule , false);
213
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
214
+ }
215
+
216
+ cleanup_and_return :
169
217
return res ;
170
- }
218
+ }
171
219
172
- return OMPI_SUCCESS ;
220
+ /*
221
+ * scan_sched_recursivedoubling:
222
+ *
223
+ * Function: Recursive doubling algorithm for inclusive scan.
224
+ * Accepts: Same as MPI_Iscan
225
+ * Returns: MPI_SUCCESS or error code
226
+ *
227
+ * Description: Implements recursive doubling algorithm for MPI_Iscan.
228
+ * The algorithm preserves order of operations so it can
229
+ * be used both by commutative and non-commutative operations.
230
+ *
231
+ * Example for 5 processes and commutative operation MPI_SUM:
232
+ * Process: 0 1 2 3 4
233
+ * recvbuf: [0] [1] [2] [3] [4]
234
+ * psend: [0] [1] [2] [3] [4]
235
+ *
236
+ * Step 1:
237
+ * recvbuf: [0] [0+1] [2] [2+3] [4]
238
+ * psend: [1+0] [0+1] [3+2] [2+3] [4]
239
+ *
240
+ * Step 2:
241
+ * recvbuf: [0] [0+1] [(1+0)+2] [(1+0)+(2+3)] [4]
242
+ * psend: [(3+2)+(1+0)] [(2+3)+(0+1)] [(1+0)+(3+2)] [(1+0)+(2+3)] [4]
243
+ *
244
+ * Step 3:
245
+ * recvbuf: [0] [0+1] [(1+0)+2] [(1+0)+(2+3)] [((3+2)+(1+0))+4]
246
+ * psend: [4+((3+2)+(1+0))] [((3+2)+(1+0))+4]
247
+ *
248
+ * Time complexity (worst case): \ceil(\log_2(p))(2\alpha + 2m\beta + 2m\gamma)
249
+ * Memory requirements (per process): 2 * count * typesize = O(count)
250
+ * Limitations: intra-communicators only
251
+ * Schedule length: O(log(p))
252
+ */
253
+ static inline int scan_sched_recursivedoubling (
254
+ int rank , int comm_size , const void * sendbuf , void * recvbuf , int count ,
255
+ MPI_Datatype datatype , MPI_Op op , char inplace ,
256
+ NBC_Schedule * schedule , void * tmpbuf1 , void * tmpbuf2 )
257
+ {
258
+ int res = OMPI_SUCCESS ;
259
+
260
+ if (!inplace ) {
261
+ res = NBC_Sched_copy ((void * )sendbuf , false, count , datatype ,
262
+ recvbuf , false, count , datatype , schedule , true);
263
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
264
+ }
265
+ if (comm_size < 2 )
266
+ goto cleanup_and_return ;
267
+
268
+ char * psend = (char * )tmpbuf1 ;
269
+ char * precv = (char * )tmpbuf2 ;
270
+ res = NBC_Sched_copy (recvbuf , false, count , datatype ,
271
+ psend , true, count , datatype , schedule , true);
272
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
273
+
274
+ int is_commute = ompi_op_is_commute (op );
275
+ for (int mask = 1 ; mask < comm_size ; mask <<= 1 ) {
276
+ int remote = rank ^ mask ;
277
+ if (remote < comm_size ) {
278
+ res = NBC_Sched_send (psend , true, count , datatype , remote , schedule , false);
279
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
280
+ res = NBC_Sched_recv (precv , true, count , datatype , remote , schedule , true);
281
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
282
+
283
+ if (rank > remote ) {
284
+ /* Accumulate prefix reduction: recvbuf = precv <op> recvbuf */
285
+ res = NBC_Sched_op (precv , true, recvbuf , false, count ,
286
+ datatype , op , schedule , false);
287
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
288
+ /* Partial result: psend = precv <op> psend */
289
+ res = NBC_Sched_op (precv , true, psend , true, count ,
290
+ datatype , op , schedule , true);
291
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
292
+ } else {
293
+ if (is_commute ) {
294
+ /* psend = precv <op> psend */
295
+ res = NBC_Sched_op (precv , true, psend , true, count ,
296
+ datatype , op , schedule , true);
297
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
298
+ } else {
299
+ /* precv = psend <op> precv */
300
+ res = NBC_Sched_op (psend , true, precv , true, count ,
301
+ datatype , op , schedule , true);
302
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
303
+ char * tmp = psend ;
304
+ psend = precv ;
305
+ precv = tmp ;
306
+ }
307
+ }
308
+ }
309
+ }
310
+
311
+ cleanup_and_return :
312
+ return res ;
173
313
}
174
314
175
315
int ompi_coll_libnbc_iscan (const void * sendbuf , void * recvbuf , int count , MPI_Datatype datatype , MPI_Op op ,
0 commit comments