-
Notifications
You must be signed in to change notification settings - Fork 900
Wrong result on non-blocking collectives with user-defined op #1754
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Comments
with two ranks, this looks like a bug in Open MPI and here is a patch
I will double-check what happens with three or more tasks. If I understand the standard correctly, a user-defined reduction must be associative but need not be commutative. Am I right? |
static inline void ompi_3buff_op_user (ompi_op_t *op, void * restrict source1, void * restrict source2,
void * restrict result, int count, struct ompi_datatype_t *dtype)
{
ompi_datatype_copy_content_same_ddt (dtype, count, result, source1);
op->o_func.c_fn (source2, result, &count, &dtype);
} basically, it does also, it does not work if that being said, libnbc could/should be rewritten to use |
@yukiM-fj I made PR #1760 for master; the inline patch below is for v2.x.
diff --git a/ompi/mca/coll/libnbc/nbc.c b/ompi/mca/coll/libnbc/nbc.c
index 75c37c4..aee5418 100644
--- a/ompi/mca/coll/libnbc/nbc.c
+++ b/ompi/mca/coll/libnbc/nbc.c
@@ -10,7 +10,7 @@
* rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
- * Copyright (c) 2015 Research Organization for Information Science
+ * Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
*
* Author(s): Torsten Hoefler <[email protected]>
@@ -159,6 +159,32 @@ int NBC_Sched_recv (void* buf, char tmpbuf, int count, MPI_Datatype datatype, in
}
/* this function puts an operation into the schedule */
+int NBC_Sched_op2 (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule, bool barrier) { /* append a two-buffer reduction (type OP2) to the schedule; returns OMPI_SUCCESS or the error code from nbc_schedule_round_append */
+ NBC_Args_op op_args;
+ int ret;
+
+ /* store the passed arguments; a non-zero tmpbufN makes the OP2 executor treat bufN as an offset into handle->tmpbuf instead of a plain pointer */
+ op_args.type = OP2;
+ op_args.buf1 = buf1;
+ op_args.buf2 = buf2;
+ op_args.tmpbuf1 = tmpbuf1;
+ op_args.tmpbuf2 = tmpbuf2;
+ op_args.count = count;
+ op_args.op = op;
+ op_args.datatype = datatype;
+
+ /* append to the round-schedule; barrier is forwarded unchanged */
+ ret = nbc_schedule_round_append (schedule, &op_args, sizeof (op_args), barrier);
+ if (OMPI_SUCCESS != ret) {
+ return ret;
+ }
+
+ NBC_DEBUG(10, "added op2 - ends at byte %i\n", nbc_schedule_get_size (schedule));
+
+ return OMPI_SUCCESS;
+}
+
+/* this function puts an operation into the schedule */
int NBC_Sched_op(void *buf3, char tmpbuf3, const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule, bool barrier) {
NBC_Args_op op_args;
int ret;
@@ -477,6 +503,24 @@ static inline int NBC_Start_round(NBC_Handle *handle) {
}
ompi_3buff_op_reduce(opargs.op, buf1, buf2, buf3, opargs.count, opargs.datatype);
break;
+ case OP2:
+ NBC_DEBUG(5, " OP2 (offset %li) ", offset);
+ NBC_GET_BYTES(ptr,opargs);
+ NBC_DEBUG(5, "*buf1: %p, buf2: %p, count: %i, type: %p)\n", opargs.buf1, opargs.buf2,
+ opargs.count, opargs.datatype);
+ /* get buffers */
+ if(opargs.tmpbuf1) {
+ buf1=(char*)handle->tmpbuf+(long)opargs.buf1;
+ } else {
+ buf1=(void *)opargs.buf1;
+ }
+ if(opargs.tmpbuf2) {
+ buf2=(char*)handle->tmpbuf+(long)opargs.buf2;
+ } else {
+ buf2=opargs.buf2;
+ }
+ ompi_op_reduce(opargs.op, buf1, buf2, opargs.count, opargs.datatype);
+ break;
case COPY:
NBC_DEBUG(5, " COPY (offset %li) ", offset);
NBC_GET_BYTES(ptr,copyargs);
diff --git a/ompi/mca/coll/libnbc/nbc_iallreduce.c b/ompi/mca/coll/libnbc/nbc_iallreduce.c
index 2e1b0dd..ee99b1a 100644
--- a/ompi/mca/coll/libnbc/nbc_iallreduce.c
+++ b/ompi/mca/coll/libnbc/nbc_iallreduce.c
@@ -7,7 +7,7 @@
* rights reserved.
* Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
* reserved.
- * Copyright (c) 2014-2015 Research Organization for Information Science
+ * Copyright (c) 2014-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
*
* Author(s): Torsten Hoefler <[email protected]>
@@ -16,6 +16,7 @@
#include "nbc_internal.h"
#include "ompi/communicator/communicator.h"
#include "ompi/datatype/ompi_datatype.h"
+#include "ompi/op/op.h"
#include <assert.h>
@@ -101,7 +102,7 @@ int ompi_coll_libnbc_iallreduce(const void* sendbuf, void* recvbuf, int count, M
}
/* algorithm selection */
- if(p < 4 || size*count < 65536 || inplace) {
+ if(p < 4 || size*count < 65536 || !ompi_op_is_commute(op) || inplace) {
alg = NBC_ARED_BINOMIAL;
} else {
alg = NBC_ARED_RING;
@@ -299,10 +300,24 @@ int ompi_coll_libnbc_iallreduce_inter(const void* sendbuf, void* recvbuf, int co
static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf,
MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) {
int root, vrank, maxr, vpeer, peer, res;
+ char *rbuf, *lbuf, *buf;
+ int tmprbuf, tmplbuf;
root = 0; /* this makes the code for ireduce and iallreduce nearly identical - could be changed to improve performance */
RANK2VRANK(rank, vrank, root);
maxr = (int)ceil((log((double)p)/LOG2));
+ /* ensure the result ends up in recvbuf on vrank 0 */
+ if (0 == (maxr%2)) {
+ rbuf = 0;
+ tmprbuf = true;
+ lbuf = recvbuf;
+ tmplbuf = false;
+ } else {
+ lbuf = 0;
+ tmplbuf = true;
+ rbuf = recvbuf;
+ tmprbuf = false;
+ }
for (int r = 1, firstred = 1 ; r <= maxr ; ++r) {
if ((vrank % (1 << r)) == 0) {
@@ -311,7 +326,7 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat
VRANK2RANK(peer, vpeer, root)
if (peer < p) {
/* we have to wait until we have the data */
- res = NBC_Sched_recv (0, true, count, datatype, peer, schedule, true);
+ res = NBC_Sched_recv (rbuf, tmprbuf, count, datatype, peer, schedule, true);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res;
}
@@ -319,16 +334,18 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat
/* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */
if (firstred && MPI_IN_PLACE != sendbuf) {
/* perform the reduce with the senbuf */
- res = NBC_Sched_op (recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule, true);
+ res = NBC_Sched_op2 (sendbuf, false, rbuf, tmprbuf, count, datatype, op, schedule, true);
firstred = 0;
} else {
/* perform the reduce in my local buffer */
- res = NBC_Sched_op (recvbuf, false, recvbuf, false, 0, true, count, datatype, op, schedule, true);
+ res = NBC_Sched_op2 (lbuf, tmplbuf, rbuf, tmprbuf, count, datatype, op, schedule, true);
}
-
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res;
}
+ /* swap left and right buffers */
+ buf = rbuf; rbuf = lbuf ; lbuf = buf;
+ tmprbuf ^= 1; tmplbuf ^= 1;
}
} else {
/* we have to send this round */
@@ -338,8 +355,8 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat
/* we have to use the sendbuf in the first round .. */
res = NBC_Sched_send (sendbuf, false, count, datatype, peer, schedule, false);
} else {
- /* and the recvbuf in all remeining rounds */
- res = NBC_Sched_send (recvbuf, false, count, datatype, peer, schedule, false);
+ /* and the recvbuf in all remaining rounds */
+ res = NBC_Sched_send (lbuf, tmplbuf, count, datatype, peer, schedule, false);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
@@ -373,6 +390,7 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat
}
}
+ if (0 == vrank) assert(lbuf == recvbuf);
/* now send to the right hosts */
for (int r = 0; r < maxr; ++r) {
if (((vrank + (1 << r) < p) && (vrank < (1 << r))) || (vrank == 0)) {
diff --git a/ompi/mca/coll/libnbc/nbc_internal.h b/ompi/mca/coll/libnbc/nbc_internal.h
index fbf6833..c32dfb6 100644
--- a/ompi/mca/coll/libnbc/nbc_internal.h
+++ b/ompi/mca/coll/libnbc/nbc_internal.h
@@ -10,7 +10,7 @@
*
* Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
- * Copyright (c) 2015 Research Organization for Information Science
+ * Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
@@ -80,6 +80,7 @@ typedef enum {
SEND,
RECV,
OP,
+ OP2,
COPY,
UNPACK
} NBC_Fn_type;
@@ -147,6 +148,7 @@ int NBC_Sched_create(NBC_Schedule* schedule);
int NBC_Sched_send(const void* buf, char tmpbuf, int count, MPI_Datatype datatype, int dest, NBC_Schedule *schedule, bool barrier);
int NBC_Sched_recv(void* buf, char tmpbuf, int count, MPI_Datatype datatype, int source, NBC_Schedule *schedule, bool barrier);
int NBC_Sched_op(void* buf3, char tmpbuf3, const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype, MPI_Op op, NBC_Schedule *schedule, bool barrier);
+int NBC_Sched_op2 (const void* buf1, char tmpbuf1, void* buf2, char tmpbuf2, int count, MPI_Datatype datatype,MPI_Op op, NBC_Schedule *schedule, bool barrier);
int NBC_Sched_copy(void *src, char tmpsrc, int srccount, MPI_Datatype srctype, void *tgt, char tmptgt, int tgtcount, MPI_Datatype tgttype, NBC_Schedule *schedule, bool barrier);
int NBC_Sched_unpack(void *inbuf, char tmpinbuf, int count, MPI_Datatype datatype, void *outbuf, char tmpoutbuf, NBC_Schedule *schedule, bool barrier);
int NBC_Sched_barrier(NBC_Schedule *schedule);
@@ -314,6 +316,7 @@ static inline void nbc_get_round_size (char *p, unsigned long *size) {
offset += sizeof(NBC_Args_recv);
break;
case OP:
+ case OP2:
/*printf("found a OP at offset %li\n", (long)p-(long)schedule); */
offset += sizeof(NBC_Args_op); \
break;
@@ -390,6 +393,7 @@ static inline void nbc_schedule_inc_round (NBC_Schedule *schedule) {
printf("*buf: %lu, count: %i, type: %lu, source: %i)\n", (unsigned long)recvargs.buf, recvargs.count, (unsigned long)recvargs.datatype, recvargs.source); \
break; \
case OP: \
+ case OP2: \
printf("[%i] OP (offset %li) ", myrank, (long)p-(long)schedule); \
NBC_GET_BYTES(p,opargs); \
printf("*buf1: %lu, buf2: %lu, count: %i, type: %lu)\n", (unsigned long)opargs.buf1, (unsigned long)opargs.buf2, opargs.count, (unsigned long)opargs.datatype); \ could you please give it a try ? fwiw, if i replace i will review this and other non blocking collectives that use thanks for the report and test case, it was very helpful ! |
@ggouaillardet, thank you. I am building and testing your PR now. FWIW, I added other TPs to the gist. |
@yukiM-fj thanks ! out of curiosity, what is the difference between Open MPI and Fujitsu MPI regarding non blocking collectives ? |
@ggouaillardet it is OK to add these tests. |
Sorry! I missed operation. |
@ggouaillardet, I tested your PR with these tests in my gist. |
@yukiM-fj i just pushed some more commits into #1760 |
@ggouaillardet I will look at your PR, but maybe not until tomorrow... The binomial ireduce algorithm needs more buffer space when the op is non-commutative. |
@yukiM-fj i kind of simplified that |
@ggouaillardet, I looked at your PR. I think your modification is OK.
|
@yukiM-fj I did not check non-blocking collectives with inter-communicators, but they are likely busted as well. If you have some test cases, they are more than welcome! Note I just pushed 0e39319; it fixes blocking (all)reduce. I updated the PR with a checkpoint of my work; it fixes all the test cases I wrote based on yours. FWIW, here is a snapshot of my open-mpi/ompi-tests changes:
diff --git a/ibm/collective/Makefile.am b/ibm/collective/Makefile.am
index 0112d9e..5ce1630 100644
--- a/ibm/collective/Makefile.am
+++ b/ibm/collective/Makefile.am
@@ -17,6 +17,12 @@ noinst_PROGRAMS = \
allgatherv_in_place \
allreduce \
allreduce_in_place \
+ allreduce_nocommute \
+ allreduce_nocommute_gap \
+ allreduce_nocommute_gap_in_place \
+ allreduce_nocommute_in_place \
+ allreduce_nocommute_stride \
+ allreduce_nocommute_stride_in_place \
alltoall \
alltoall_in_place \
alltoallv_somezeros \
@@ -38,6 +44,10 @@ noinst_PROGRAMS = \
reduce_big \
reduce_in_place \
reduce_loc \
+ reduce_nocommute \
+ reduce_nocommute_gap \
+ reduce_nocommute_gap_in_place \
+ reduce_nocommute_in_place \
reduce_scatter \
reduce_scatter_in_place \
scan \
@@ -75,6 +85,12 @@ noinst_PROGRAMS += \
iallgatherv_in_place \
iallreduce \
iallreduce_in_place \
+ iallreduce_nocommute \
+ iallreduce_nocommute_gap \
+ iallreduce_nocommute_gap_in_place \
+ iallreduce_nocommute_in_place \
+ iallreduce_nocommute_stride \
+ iallreduce_nocommute_stride_stride \
ialltoall \
ibarrier \
ibcast \
@@ -90,6 +106,10 @@ noinst_PROGRAMS += \
ireduce \
ireduce_big \
ireduce_in_place \
+ ireduce_nocommute \
+ ireduce_nocommute_gap \
+ ireduce_nocommute_gap_in_place \
+ ireduce_nocommute_in_place \
ireduce_loc \
ireduce_scatter \
ireduce_scatter_block \
diff --git a/ibm/collective/allreduce_nocommute.c b/ibm/collective/allreduce_nocommute.c
new file mode 100644
index 0000000..4003927
--- /dev/null
+++ b/ibm/collective/allreduce_nocommute.c
@@ -0,0 +1,98 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* user reduction: for each of *_c elements, b = a * b, each element being a 2x2 int matrix stored as 4 consecutive ints (00,01,10,11); d is unused */
+ int *a = _a;
+ int *b = _b;
+ int c = *_c;
+ int i;
+
+ for(i = 0; i < c; i++){
+ int64_t a00 = a[i*4+0]; /* copy both operands (widened to int64_t) before b is overwritten */
+ int64_t a01 = a[i*4+1];
+ int64_t a10 = a[i*4+2];
+ int64_t a11 = a[i*4+3];
+ int64_t b00 = b[i*4+0];
+ int64_t b01 = b[i*4+1];
+ int64_t b10 = b[i*4+2];
+ int64_t b11 = b[i*4+3];
+ b[i*4+0] = a00 * b00 + a01 * b10; /* products are formed in int64_t, then stored back into int */
+ b[i*4+1] = a00 * b01 + a01 * b11;
+ b[i*4+2] = a10 * b00 + a11 * b10;
+ b[i*4+3] = a10 * b01 + a11 * b11;
+ }
+}
+
+int main(int argc, char *argv[]){ /* test: MPI_Allreduce with the user op func on a contiguous 4-int type, checked against a locally computed product */
+ MPI_Datatype type;
+ MPI_Op op;
+ int *sbuf, *rbuf, *ans;
+ int nprocs, myrank;
+ int count;
+ int i;
+
+ MPI_Init(&argc, &argv);
+
+ MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+ MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+ MPI_Type_contiguous(4, MPI_INT, &type); /* one element = 4 ints = one 2x2 matrix */
+ MPI_Type_commit(&type);
+
+ MPI_Op_create(func, 0, &op); /* second argument (commute) is 0: the op is registered as non-commutative */
+
+ count = nprocs*nprocs*nprocs*nprocs ; /* NOTE(review): the int product can overflow for large nprocs before the cap below applies */
+ if (count > MAXCOUNT) count = MAXCOUNT;
+
+ sbuf = malloc(count * 4 * sizeof(int)); /* NOTE(review): malloc results are not checked */
+ rbuf = malloc(count * 4 * sizeof(int));
+ ans = malloc(count * 4 * sizeof(int));
+
+
+ for(i = 0; i < count * 4; i++){
+ sbuf[i] = myrank + i; /* rank-dependent input */
+ }
+
+ MPI_Allreduce(sbuf, rbuf, count, type, op, MPI_COMM_WORLD);
+
+ for(i = 0; i < count * 4; i++){
+ ans[i] = ((i+3)&2)>>1; /* yields 1,0,0,1 per group of four: the 2x2 identity */
+ }
+
+ for(i = nprocs; i--; ){ /* i runs nprocs-1 down to 0, so ans ends up as M_0 * M_1 * ... * M_(nprocs-1) */
+ int c = count;
+ int j;
+ for(j = 0; j < count * 4; j++){
+ sbuf[j] = i + j; /* rebuild the input that rank i contributed */
+ }
+ func(sbuf, ans, &c, NULL);
+ }
+
+ int error_flag=0;
+ for(i = 0; i < count * 4; i++){
+ if(rbuf[i] != ans[i]){
+ if (0 == error_flag) printf("rbuf[%d] = %d, ans[%d] = %d\n", i, rbuf[i], i, ans[i]); /* only the first mismatch is printed */
+ error_flag++;
+ }
+ }
+ MPI_Barrier(MPI_COMM_WORLD);
+ if(error_flag > 0){
+ MPI_Abort(MPI_COMM_WORLD, 100); /* any mismatch aborts with error code 100 */
+ }
+
+ free(sbuf);
+ free(rbuf);
+ free(ans);
+
+ MPI_Type_free(&type);
+ MPI_Op_free(&op);
+
+ MPI_Finalize();
+
+ return 0;
+}
+
diff --git a/ibm/collective/allreduce_nocommute_gap.c b/ibm/collective/allreduce_nocommute_gap.c
new file mode 100644
index 0000000..0c39cf0
--- /dev/null
+++ b/ibm/collective/allreduce_nocommute_gap.c
@@ -0,0 +1,103 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+#define OFFSET (-2345)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* user reduction: for each of *_c elements, b = a * b as 2x2 int matrices (4 consecutive ints); every access is shifted by OFFSET ints from the passed pointers; d is unused */
+ int *a = _a;
+ int *b = _b;
+ int c = *_c;
+ int i;
+
+ for(i = 0; i < c; i++){
+ int64_t a00 = a[i*4+0+OFFSET]; /* copy both operands (widened to int64_t) before b is overwritten */
+ int64_t a01 = a[i*4+1+OFFSET];
+ int64_t a10 = a[i*4+2+OFFSET];
+ int64_t a11 = a[i*4+3+OFFSET];
+ int64_t b00 = b[i*4+0+OFFSET];
+ int64_t b01 = b[i*4+1+OFFSET];
+ int64_t b10 = b[i*4+2+OFFSET];
+ int64_t b11 = b[i*4+3+OFFSET];
+ b[i*4+0+OFFSET] = a00 * b00 + a01 * b10; /* products are formed in int64_t, then stored back into int */
+ b[i*4+1+OFFSET] = a00 * b01 + a01 * b11;
+ b[i*4+2+OFFSET] = a10 * b00 + a11 * b10;
+ b[i*4+3+OFFSET] = a10 * b01 + a11 * b11;
+ }
+}
+
+int main(int argc, char *argv[]){ /* test: MPI_Allreduce with the user op func on a struct type whose data sits OFFSET ints away from the buffer address */
+ MPI_Datatype type;
+ MPI_Op op;
+ int *sbuf, *rbuf, *ans;
+ int nprocs, myrank;
+ int count;
+ int i;
+ int ab[1] = {4}; /* block length: 4 ints per element */
+ MPI_Aint ad[1] = {OFFSET*sizeof(int)}; /* displacement of OFFSET ints in bytes; NOTE(review): the product is computed as size_t and relies on conversion back to a negative MPI_Aint */
+ MPI_Datatype at[1] = {MPI_INT};
+
+ MPI_Init(&argc, &argv);
+
+ MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+ MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+ MPI_Type_create_struct(1, ab, ad, at, &type);
+ MPI_Type_commit(&type);
+
+ MPI_Op_create(func, 0, &op); /* second argument (commute) is 0: the op is registered as non-commutative */
+
+ count = nprocs*nprocs*nprocs*nprocs ; /* NOTE(review): the int product can overflow for large nprocs before the cap below applies */
+ if (count > MAXCOUNT) count = MAXCOUNT;
+
+ sbuf = malloc(count * 4 * sizeof(int)); /* NOTE(review): malloc results are not checked */
+ rbuf = malloc(count * 4 * sizeof(int));
+ ans = malloc(count * 4 * sizeof(int));
+
+
+ for(i = 0; i < count * 4; i++){
+ sbuf[i] = myrank + i; /* rank-dependent input */
+ }
+
+ MPI_Allreduce(sbuf-OFFSET, rbuf-OFFSET, count, type, op, MPI_COMM_WORLD); /* pointers are shifted by -OFFSET so the type's displacement lands on element 0; NOTE(review): the shifted pointers themselves lie outside the allocations */
+
+ for(i = 0; i < count * 4; i++){
+ ans[i] = ((i+3)&2)>>1; /* yields 1,0,0,1 per group of four: the 2x2 identity */
+ }
+
+ for(i = nprocs; i--; ){ /* i runs nprocs-1 down to 0, so ans ends up as M_0 * M_1 * ... * M_(nprocs-1) */
+ int c = count;
+ int j;
+ for(j = 0; j < count * 4; j++){
+ sbuf[j] = i + j; /* rebuild the input that rank i contributed */
+ }
+ func(sbuf-OFFSET, ans-OFFSET, &c, NULL); /* func adds OFFSET back on every access */
+ }
+
+ int error_flag=0;
+ for(i = 0; i < count * 4; i++){
+ if(rbuf[i] != ans[i]){
+ if (0 == error_flag) printf("rbuf[%d] = %d, ans[%d] = %d\n", i, rbuf[i], i, ans[i]); /* only the first mismatch is printed */
+ error_flag++;
+ }
+ }
+ MPI_Barrier(MPI_COMM_WORLD);
+ if(error_flag > 0){
+ MPI_Abort(MPI_COMM_WORLD, 100); /* any mismatch aborts with error code 100 */
+ }
+
+ free(sbuf);
+ free(rbuf);
+ free(ans);
+
+ MPI_Type_free(&type);
+ MPI_Op_free(&op);
+
+ MPI_Finalize();
+
+ return 0;
+}
+
diff --git a/ibm/collective/allreduce_nocommute_gap_in_place.c b/ibm/collective/allreduce_nocommute_gap_in_place.c
new file mode 100644
index 0000000..f6d06ec
--- /dev/null
+++ b/ibm/collective/allreduce_nocommute_gap_in_place.c
@@ -0,0 +1,103 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+#define OFFSET (-2345)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* user reduction: for each of *_c elements, b = a * b as 2x2 int matrices (4 consecutive ints); every access is shifted by OFFSET ints from the passed pointers; d is unused */
+ int *a = _a;
+ int *b = _b;
+ int c = *_c;
+ int i;
+
+ for(i = 0; i < c; i++){
+ int64_t a00 = a[i*4+0+OFFSET]; /* copy both operands (widened to int64_t) before b is overwritten */
+ int64_t a01 = a[i*4+1+OFFSET];
+ int64_t a10 = a[i*4+2+OFFSET];
+ int64_t a11 = a[i*4+3+OFFSET];
+ int64_t b00 = b[i*4+0+OFFSET];
+ int64_t b01 = b[i*4+1+OFFSET];
+ int64_t b10 = b[i*4+2+OFFSET];
+ int64_t b11 = b[i*4+3+OFFSET];
+ b[i*4+0+OFFSET] = a00 * b00 + a01 * b10; /* products are formed in int64_t, then stored back into int */
+ b[i*4+1+OFFSET] = a00 * b01 + a01 * b11;
+ b[i*4+2+OFFSET] = a10 * b00 + a11 * b10;
+ b[i*4+3+OFFSET] = a10 * b01 + a11 * b11;
+ }
+}
+
+int main(int argc, char *argv[]){ /* test: MPI_Allreduce with MPI_IN_PLACE and the user op func on a struct type whose data sits OFFSET ints away from the buffer address */
+ MPI_Datatype type;
+ MPI_Op op;
+ int *sbuf, *rbuf, *ans;
+ int nprocs, myrank;
+ int count;
+ int i;
+ int ab[1] = {4}; /* block length: 4 ints per element */
+ MPI_Aint ad[1] = {OFFSET*sizeof(int)}; /* displacement of OFFSET ints in bytes; NOTE(review): the product is computed as size_t and relies on conversion back to a negative MPI_Aint */
+ MPI_Datatype at[1] = {MPI_INT};
+
+ MPI_Init(&argc, &argv);
+
+ MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+ MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+ MPI_Type_create_struct(1, ab, ad, at, &type);
+ MPI_Type_commit(&type);
+
+ MPI_Op_create(func, 0, &op); /* second argument (commute) is 0: the op is registered as non-commutative */
+
+ count = nprocs*nprocs*nprocs*nprocs ; /* NOTE(review): the int product can overflow for large nprocs before the cap below applies */
+ if (count > MAXCOUNT) count = MAXCOUNT;
+
+ sbuf = malloc(count * 4 * sizeof(int)); /* NOTE(review): malloc results are not checked */
+ rbuf = malloc(count * 4 * sizeof(int));
+ ans = malloc(count * 4 * sizeof(int));
+
+
+ for(i = 0; i < count * 4; i++){
+ rbuf[i] = myrank + i; /* in-place: the receive buffer carries this rank's input */
+ }
+
+ MPI_Allreduce(MPI_IN_PLACE, rbuf-OFFSET, count, type, op, MPI_COMM_WORLD); /* pointer is shifted by -OFFSET so the type's displacement lands on element 0; NOTE(review): the shifted pointer itself lies outside the allocation */
+
+ for(i = 0; i < count * 4; i++){
+ ans[i] = ((i+3)&2)>>1; /* yields 1,0,0,1 per group of four: the 2x2 identity */
+ }
+
+ for(i = nprocs; i--; ){ /* i runs nprocs-1 down to 0, so ans ends up as M_0 * M_1 * ... * M_(nprocs-1) */
+ int c = count;
+ int j;
+ for(j = 0; j < count * 4; j++){
+ sbuf[j] = i + j; /* rebuild the input that rank i contributed */
+ }
+ func(sbuf-OFFSET, ans-OFFSET, &c, NULL); /* func adds OFFSET back on every access */
+ }
+
+ int error_flag=0;
+ for(i = 0; i < count * 4; i++){
+ if(rbuf[i] != ans[i]){
+ if (0 == error_flag) printf("rbuf[%d] = %d, ans[%d] = %d\n", i, rbuf[i], i, ans[i]); /* only the first mismatch is printed */
+ error_flag++;
+ }
+ }
+ MPI_Barrier(MPI_COMM_WORLD);
+ if(error_flag > 0){
+ MPI_Abort(MPI_COMM_WORLD, 100); /* any mismatch aborts with error code 100 */
+ }
+
+ free(sbuf);
+ free(rbuf);
+ free(ans);
+
+ MPI_Type_free(&type);
+ MPI_Op_free(&op);
+
+ MPI_Finalize();
+
+ return 0;
+}
+
diff --git a/ibm/collective/allreduce_nocommute_in_place.c b/ibm/collective/allreduce_nocommute_in_place.c
new file mode 100644
index 0000000..29b15d5
--- /dev/null
+++ b/ibm/collective/allreduce_nocommute_in_place.c
@@ -0,0 +1,98 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* user reduction: for each of *_c elements, b = a * b, each element being a 2x2 int matrix stored as 4 consecutive ints (00,01,10,11); d is unused */
+ int *a = _a;
+ int *b = _b;
+ int c = *_c;
+ int i;
+
+ for(i = 0; i < c; i++){
+ int64_t a00 = a[i*4+0]; /* copy both operands (widened to int64_t) before b is overwritten */
+ int64_t a01 = a[i*4+1];
+ int64_t a10 = a[i*4+2];
+ int64_t a11 = a[i*4+3];
+ int64_t b00 = b[i*4+0];
+ int64_t b01 = b[i*4+1];
+ int64_t b10 = b[i*4+2];
+ int64_t b11 = b[i*4+3];
+ b[i*4+0] = a00 * b00 + a01 * b10; /* products are formed in int64_t, then stored back into int */
+ b[i*4+1] = a00 * b01 + a01 * b11;
+ b[i*4+2] = a10 * b00 + a11 * b10;
+ b[i*4+3] = a10 * b01 + a11 * b11;
+ }
+}
+
+int main(int argc, char *argv[]){ /* test: MPI_Allreduce with MPI_IN_PLACE and the user op func on a contiguous 4-int type, checked against a locally computed product */
+ MPI_Datatype type;
+ MPI_Op op;
+ int *sbuf, *rbuf, *ans;
+ int nprocs, myrank;
+ int count;
+ int i;
+
+ MPI_Init(&argc, &argv);
+
+ MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+ MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+ MPI_Type_contiguous(4, MPI_INT, &type); /* one element = 4 ints = one 2x2 matrix */
+ MPI_Type_commit(&type);
+
+ MPI_Op_create(func, 0, &op); /* second argument (commute) is 0: the op is registered as non-commutative */
+
+ count = nprocs*nprocs*nprocs*nprocs ; /* NOTE(review): the int product can overflow for large nprocs before the cap below applies */
+ if (count > MAXCOUNT) count = MAXCOUNT;
+
+ sbuf = malloc(count * 4 * sizeof(int)); /* NOTE(review): malloc results are not checked */
+ rbuf = malloc(count * 4 * sizeof(int));
+ ans = malloc(count * 4 * sizeof(int));
+
+
+ for(i = 0; i < count * 4; i++){
+ rbuf[i] = myrank + i; /* in-place: the receive buffer carries this rank's input */
+ }
+
+ MPI_Allreduce(MPI_IN_PLACE, rbuf, count, type, op, MPI_COMM_WORLD);
+
+ for(i = 0; i < count * 4; i++){
+ ans[i] = ((i+3)&2)>>1; /* yields 1,0,0,1 per group of four: the 2x2 identity */
+ }
+
+ for(i = nprocs; i--; ){ /* i runs nprocs-1 down to 0, so ans ends up as M_0 * M_1 * ... * M_(nprocs-1) */
+ int c = count;
+ int j;
+ for(j = 0; j < count * 4; j++){
+ sbuf[j] = i + j; /* rebuild the input that rank i contributed */
+ }
+ func(sbuf, ans, &c, NULL);
+ }
+
+ int error_flag=0;
+ for(i = 0; i < count * 4; i++){
+ if(rbuf[i] != ans[i]){
+ if (0 == error_flag) printf("rbuf[%d] = %d, ans[%d] = %d\n", i, rbuf[i], i, ans[i]); /* only the first mismatch is printed */
+ error_flag++;
+ }
+ }
+ MPI_Barrier(MPI_COMM_WORLD);
+ if(error_flag > 0){
+ MPI_Abort(MPI_COMM_WORLD, 100); /* any mismatch aborts with error code 100 */
+ }
+
+ free(sbuf);
+ free(rbuf);
+ free(ans);
+
+ MPI_Type_free(&type);
+ MPI_Op_free(&op);
+
+ MPI_Finalize();
+
+ return 0;
+}
+
diff --git a/ibm/collective/allreduce_nocommute_stride.c b/ibm/collective/allreduce_nocommute_stride.c
new file mode 100644
index 0000000..51915bf
--- /dev/null
+++ b/ibm/collective/allreduce_nocommute_stride.c
@@ -0,0 +1,146 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* reference reduction on the contiguous layout: for each of *_c elements, b = a * b as 2x2 int matrices stored as 4 consecutive ints; d is unused */
+ int *a = _a;
+ int *b = _b;
+ int c = *_c;
+ int i;
+
+ for(i = 0; i < c; i++){
+ int64_t a00 = a[i*4+0]; /* copy both operands (widened to int64_t) before b is overwritten */
+ int64_t a01 = a[i*4+1];
+ int64_t a10 = a[i*4+2];
+ int64_t a11 = a[i*4+3];
+ int64_t b00 = b[i*4+0];
+ int64_t b01 = b[i*4+1];
+ int64_t b10 = b[i*4+2];
+ int64_t b11 = b[i*4+3];
+ b[i*4+0] = a00 * b00 + a01 * b10; /* products are formed in int64_t, then stored back into int */
+ b[i*4+1] = a00 * b01 + a01 * b11;
+ b[i*4+2] = a10 * b00 + a11 * b10;
+ b[i*4+3] = a10 * b01 + a11 * b11;
+ }
+}
+
+static void func2(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* same 2x2 product b = a * b, but on a strided layout: the four entries of element i sit at i, i+c, i+2c, i+3c with c = *_c; d is unused */
+ int *a = _a;
+ int *b = _b;
+ int c = *_c;
+ int i;
+
+ for(i = 0; i < c; i++){
+ int64_t a00 = a[i+0*c]; /* copy both operands (widened to int64_t) before b is overwritten */
+ int64_t a01 = a[i+1*c];
+ int64_t a10 = a[i+2*c];
+ int64_t a11 = a[i+3*c];
+ int64_t b00 = b[i+0*c];
+ int64_t b01 = b[i+1*c];
+ int64_t b10 = b[i+2*c];
+ int64_t b11 = b[i+3*c];
+ b[i+0*c] = a00 * b00 + a01 * b10; /* products are formed in int64_t, then stored back into int */
+ b[i+1*c] = a00 * b01 + a01 * b11;
+ b[i+2*c] = a10 * b00 + a11 * b10;
+ b[i+3*c] = a10 * b01 + a11 * b11;
+ }
+}
+
+int main(int argc, char *argv[]){ /* test: MPI_Allreduce with op2 (func2) on a strided vector type, cross-checked against func on the contiguous layout */
+ MPI_Datatype type, type2;
+ MPI_Op op;
+ MPI_Op op2;
+ int *sbuf , *ans ;
+ int *sbuf2, *rbuf2, *ans2;
+ int *rbuf3, *ans3;
+ int nprocs, myrank;
+ int count;
+ int i;
+
+ MPI_Init(&argc, &argv);
+
+ MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+ MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+ MPI_Op_create(func, 0, &op); /* commute = 0; op is created and freed here, but only func itself is called directly below */
+ MPI_Op_create(func2, 0, &op2); /* commute = 0: non-commutative op used by the collective */
+
+ count = nprocs*nprocs*nprocs*nprocs ; /* NOTE(review): the int product can overflow for large nprocs before the cap below applies */
+ if (count > MAXCOUNT) count = MAXCOUNT;
+
+ MPI_Type_vector(4, 1, count, MPI_INT, &type); /* 4 single ints, count ints apart */
+ MPI_Type_create_resized(type, 0, 4, &type2); /* lower bound 0, extent 4 bytes - presumably sizeof(int), so consecutive elements interleave */
+ MPI_Type_commit(&type2);
+ MPI_Type_free(&type);
+ MPI_Type_contiguous(4, MPI_INT, &type); /* type is reused for the contiguous 4-int layout */
+ MPI_Type_commit(&type);
+
+ sbuf = malloc(count * 4 * sizeof(int)); /* NOTE(review): malloc results are not checked */
+ ans = malloc(count * 4 * sizeof(int));
+ sbuf2 = malloc(count * 4 * sizeof(int));
+ rbuf2 = malloc(count * 4 * sizeof(int));
+ ans2 = malloc(count * 4 * sizeof(int));
+ rbuf3 = malloc(count * 4 * sizeof(int));
+ ans3 = malloc(count * 4 * sizeof(int));
+
+
+ for(i = 0; i < count * 4; i++){
+ sbuf[i] = myrank + i; /* rank-dependent input */
+ ans[i] = ((i+3)&2)>>1; /* yields 1,0,0,1 per group of four: the 2x2 identity */
+ }
+
+ MPI_Sendrecv(sbuf, count, type, 0, 0, sbuf2, count, type2, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE); /* self send/recv converts contiguous -> strided layout */
+ MPI_Sendrecv(ans , count, type, 0, 0, ans2 , count, type2, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE);
+
+ MPI_Allreduce(sbuf2, rbuf2, count, type2, op2, MPI_COMM_WORLD);
+ MPI_Sendrecv(rbuf2 , count, type2, 0, 0, rbuf3 , count, type, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE); /* convert the result back to the contiguous layout */
+
+ for(i = nprocs; i--; ){ /* i runs nprocs-1 down to 0, so ans and ans2 end up as M_0 * M_1 * ... * M_(nprocs-1) */
+ int c = count;
+ int j;
+ for(j = 0; j < count * 4; j++){
+ sbuf[j] = i + j; /* rebuild the input that rank i contributed */
+ }
+ MPI_Sendrecv(sbuf, count, type, 0, 0, sbuf2, count, type2, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE);
+ func (sbuf , ans , &c, NULL);
+ func2(sbuf2, ans2, &c, NULL);
+ }
+ MPI_Sendrecv(ans2, count, type2, 0, 0, ans3, count, type, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE); /* strided reference converted back to contiguous */
+
+ int error_flag=0;
+
+ for(i = 0; i < count * 4; i++){
+ if(ans3[i] != ans[i]){ /* sanity check: both local reference computations must agree */
+ if (0 == error_flag) printf("!!! ans3[%d] = %d, ans[%d] = %d\n", i, ans3[i], i, ans[i]);
+ error_flag++;
+ }
+ if(rbuf3[i] != ans3[i]){ /* actual check of the collective's result */
+ if (0 == error_flag) printf("rbuf3[%d] = %d, ans3[%d] = %d\n", i, rbuf3[i], i, ans3[i]);
+ error_flag++;
+ }
+ }
+ MPI_Barrier(MPI_COMM_WORLD);
+ if(error_flag > 0){
+ MPI_Abort(MPI_COMM_WORLD, 100); /* any mismatch aborts with error code 100 */
+ }
+ free(sbuf);
+ free(sbuf2);
+ free(rbuf2);
+ free(rbuf3);
+ free(ans);
+ free(ans2);
+ free(ans3);
+
+ MPI_Type_free(&type);
+ MPI_Type_free(&type2);
+ MPI_Op_free(&op);
+ MPI_Op_free(&op2);
+
+ MPI_Finalize();
+
+ return 0;
+}
+
diff --git a/ibm/collective/allreduce_nocommute_stride_in_place.c b/ibm/collective/allreduce_nocommute_stride_in_place.c
new file mode 100644
index 0000000..18ceb93
--- /dev/null
+++ b/ibm/collective/allreduce_nocommute_stride_in_place.c
@@ -0,0 +1,146 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* reference reduction on the contiguous layout: for each of *_c elements, b = a * b as 2x2 int matrices stored as 4 consecutive ints; d is unused */
+ int *a = _a;
+ int *b = _b;
+ int c = *_c;
+ int i;
+
+ for(i = 0; i < c; i++){
+ int64_t a00 = a[i*4+0]; /* copy both operands (widened to int64_t) before b is overwritten */
+ int64_t a01 = a[i*4+1];
+ int64_t a10 = a[i*4+2];
+ int64_t a11 = a[i*4+3];
+ int64_t b00 = b[i*4+0];
+ int64_t b01 = b[i*4+1];
+ int64_t b10 = b[i*4+2];
+ int64_t b11 = b[i*4+3];
+ b[i*4+0] = a00 * b00 + a01 * b10; /* products are formed in int64_t, then stored back into int */
+ b[i*4+1] = a00 * b01 + a01 * b11;
+ b[i*4+2] = a10 * b00 + a11 * b10;
+ b[i*4+3] = a10 * b01 + a11 * b11;
+ }
+}
+
+static void func2(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* same 2x2 product b = a * b, but on a strided layout: the four entries of element i sit at i, i+c, i+2c, i+3c with c = *_c; d is unused */
+ int *a = _a;
+ int *b = _b;
+ int c = *_c;
+ int i;
+
+ for(i = 0; i < c; i++){
+ int64_t a00 = a[i+0*c]; /* copy both operands (widened to int64_t) before b is overwritten */
+ int64_t a01 = a[i+1*c];
+ int64_t a10 = a[i+2*c];
+ int64_t a11 = a[i+3*c];
+ int64_t b00 = b[i+0*c];
+ int64_t b01 = b[i+1*c];
+ int64_t b10 = b[i+2*c];
+ int64_t b11 = b[i+3*c];
+ b[i+0*c] = a00 * b00 + a01 * b10; /* products are formed in int64_t, then stored back into int */
+ b[i+1*c] = a00 * b01 + a01 * b11;
+ b[i+2*c] = a10 * b00 + a11 * b10;
+ b[i+3*c] = a10 * b01 + a11 * b11;
+ }
+}
+
+int main(int argc, char *argv[]){ /* test: MPI_Allreduce with MPI_IN_PLACE and op2 (func2) on a strided vector type, cross-checked against func on the contiguous layout */
+ MPI_Datatype type, type2;
+ MPI_Op op;
+ MPI_Op op2;
+ int *sbuf , *ans ;
+ int *sbuf2, *rbuf2, *ans2;
+ int *rbuf3, *ans3;
+ int nprocs, myrank;
+ int count;
+ int i;
+
+ MPI_Init(&argc, &argv);
+
+ MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+ MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+ MPI_Op_create(func, 0, &op); /* commute = 0; op is created and freed here, but only func itself is called directly below */
+ MPI_Op_create(func2, 0, &op2); /* commute = 0: non-commutative op used by the collective */
+
+ count = nprocs*nprocs*nprocs*nprocs ; /* NOTE(review): the int product can overflow for large nprocs before the cap below applies */
+ if (count > MAXCOUNT) count = MAXCOUNT;
+
+ MPI_Type_vector(4, 1, count, MPI_INT, &type); /* 4 single ints, count ints apart */
+ MPI_Type_create_resized(type, 0, 4, &type2); /* lower bound 0, extent 4 bytes - presumably sizeof(int), so consecutive elements interleave */
+ MPI_Type_commit(&type2);
+ MPI_Type_free(&type);
+ MPI_Type_contiguous(4, MPI_INT, &type); /* type is reused for the contiguous 4-int layout */
+ MPI_Type_commit(&type);
+
+ sbuf = malloc(count * 4 * sizeof(int)); /* NOTE(review): malloc results are not checked */
+ ans = malloc(count * 4 * sizeof(int));
+ sbuf2 = malloc(count * 4 * sizeof(int));
+ rbuf2 = malloc(count * 4 * sizeof(int));
+ ans2 = malloc(count * 4 * sizeof(int));
+ rbuf3 = malloc(count * 4 * sizeof(int));
+ ans3 = malloc(count * 4 * sizeof(int));
+
+
+ for(i = 0; i < count * 4; i++){
+ sbuf[i] = myrank + i; /* rank-dependent input */
+ ans[i] = ((i+3)&2)>>1; /* yields 1,0,0,1 per group of four: the 2x2 identity */
+ }
+
+ MPI_Sendrecv(sbuf, count, type, 0, 0, rbuf2, count, type2, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE); /* in-place: the strided input is placed directly in the receive buffer */
+ MPI_Sendrecv(ans , count, type, 0, 0, ans2 , count, type2, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE);
+
+ MPI_Allreduce(MPI_IN_PLACE, rbuf2, count, type2, op2, MPI_COMM_WORLD);
+ MPI_Sendrecv(rbuf2 , count, type2, 0, 0, rbuf3 , count, type, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE); /* convert the result back to the contiguous layout */
+
+ for(i = nprocs; i--; ){ /* i runs nprocs-1 down to 0, so ans and ans2 end up as M_0 * M_1 * ... * M_(nprocs-1) */
+ int c = count;
+ int j;
+ for(j = 0; j < count * 4; j++){
+ sbuf[j] = i + j; /* rebuild the input that rank i contributed */
+ }
+ MPI_Sendrecv(sbuf, count, type, 0, 0, sbuf2, count, type2, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE);
+ func (sbuf , ans , &c, NULL);
+ func2(sbuf2, ans2, &c, NULL);
+ }
+ MPI_Sendrecv(ans2, count, type2, 0, 0, ans3, count, type, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE); /* strided reference converted back to contiguous */
+
+ int error_flag=0;
+
+ for(i = 0; i < count * 4; i++){
+ if(ans3[i] != ans[i]){ /* sanity check: both local reference computations must agree */
+ if (0 == error_flag) printf("!!! ans3[%d] = %d, ans[%d] = %d\n", i, ans3[i], i, ans[i]);
+ error_flag++;
+ }
+ if(rbuf3[i] != ans3[i]){ /* actual check of the collective's result */
+ if (0 == error_flag) printf("rbuf3[%d] = %d, ans3[%d] = %d\n", i, rbuf3[i], i, ans3[i]);
+ error_flag++;
+ }
+ }
+ MPI_Barrier(MPI_COMM_WORLD);
+ if(error_flag > 0){
+ MPI_Abort(MPI_COMM_WORLD, 100); /* any mismatch aborts with error code 100 */
+ }
+ free(sbuf);
+ free(sbuf2);
+ free(rbuf2);
+ free(rbuf3);
+ free(ans);
+ free(ans2);
+ free(ans3);
+
+ MPI_Type_free(&type);
+ MPI_Type_free(&type2);
+ MPI_Op_free(&op);
+ MPI_Op_free(&op2);
+
+ MPI_Finalize();
+
+ return 0;
+}
+
diff --git a/ibm/collective/iallreduce_nocommute.c b/ibm/collective/iallreduce_nocommute.c
new file mode 100644
index 0000000..0ec5513
--- /dev/null
+++ b/ibm/collective/iallreduce_nocommute.c
@@ -0,0 +1,101 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* user reduction: for each of *_c elements, b = a * b, each element being a 2x2 int matrix stored as 4 consecutive ints (00,01,10,11); d is unused */
+ int *a = _a;
+ int *b = _b;
+ int c = *_c;
+ int i;
+
+ for(i = 0; i < c; i++){
+ int64_t a00 = a[i*4+0]; /* copy both operands (widened to int64_t) before b is overwritten */
+ int64_t a01 = a[i*4+1];
+ int64_t a10 = a[i*4+2];
+ int64_t a11 = a[i*4+3];
+ int64_t b00 = b[i*4+0];
+ int64_t b01 = b[i*4+1];
+ int64_t b10 = b[i*4+2];
+ int64_t b11 = b[i*4+3];
+ b[i*4+0] = a00 * b00 + a01 * b10; /* products are formed in int64_t, then stored back into int */
+ b[i*4+1] = a00 * b01 + a01 * b11;
+ b[i*4+2] = a10 * b00 + a11 * b10;
+ b[i*4+3] = a10 * b01 + a11 * b11;
+ }
+}
+
+int main(int argc, char *argv[]){ /* test: MPI_Iallreduce + MPI_Wait with the user op func on a contiguous 4-int type, checked against a locally computed product */
+ MPI_Datatype type;
+ MPI_Op op;
+ int *sbuf, *rbuf, *ans;
+ int nprocs, myrank;
+ int count;
+ int i;
+ MPI_Request rq;
+ MPI_Status st;
+
+ MPI_Init(&argc, &argv);
+
+ MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+ MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+ MPI_Type_contiguous(4, MPI_INT, &type); /* one element = 4 ints = one 2x2 matrix */
+ MPI_Type_commit(&type);
+
+ MPI_Op_create(func, 0, &op); /* second argument (commute) is 0: the op is registered as non-commutative */
+
+ count = nprocs*nprocs*nprocs*nprocs ; /* NOTE(review): the int product can overflow for large nprocs before the cap below applies */
+ if (count > MAXCOUNT) count = MAXCOUNT;
+
+ sbuf = malloc(count * 4 * sizeof(int)); /* NOTE(review): malloc results are not checked */
+ rbuf = malloc(count * 4 * sizeof(int));
+ ans = malloc(count * 4 * sizeof(int));
+
+
+ for(i = 0; i < count * 4; i++){
+ sbuf[i] = myrank + i; /* rank-dependent input */
+ }
+
+ MPI_Iallreduce(sbuf, rbuf, count, type, op, MPI_COMM_WORLD,&rq);
+ MPI_Wait(&rq,&st); /* completes the non-blocking collective before sbuf is reused and rbuf is read */
+
+ for(i = 0; i < count * 4; i++){
+ ans[i] = ((i+3)&2)>>1; /* yields 1,0,0,1 per group of four: the 2x2 identity */
+ }
+
+ for(i = nprocs; i--; ){ /* i runs nprocs-1 down to 0, so ans ends up as M_0 * M_1 * ... * M_(nprocs-1) */
+ int c = count;
+ int j;
+ for(j = 0; j < count * 4; j++){
+ sbuf[j] = i + j; /* rebuild the input that rank i contributed */
+ }
+ func(sbuf, ans, &c, NULL);
+ }
+
+ int error_flag=0;
+ for(i = 0; i < count * 4; i++){
+ if(rbuf[i] != ans[i]){
+ if (0 == error_flag) printf("rbuf[%d] = %d, ans[%d] = %d\n", i, rbuf[i], i, ans[i]); /* only the first mismatch is printed */
+ error_flag++;
+ }
+ }
+ MPI_Barrier(MPI_COMM_WORLD);
+ if(error_flag > 0){
+ MPI_Abort(MPI_COMM_WORLD, 100); /* any mismatch aborts with error code 100 */
+ }
+
+ free(sbuf);
+ free(rbuf);
+ free(ans);
+
+ MPI_Type_free(&type);
+ MPI_Op_free(&op);
+
+ MPI_Finalize();
+
+ return 0;
+}
+
diff --git a/ibm/collective/iallreduce_nocommute_gap.c b/ibm/collective/iallreduce_nocommute_gap.c
new file mode 100644
index 0000000..47ea41b
--- /dev/null
+++ b/ibm/collective/iallreduce_nocommute_gap.c
@@ -0,0 +1,106 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+#define OFFSET (-2345)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* For each of *_c elements: b = a * b as 2x2 int matrices stored a00 a01 a10 a11, located OFFSET ints from the pointer; d is unused. */
+    int *a = _a;
+    int *b = _b;
+    int c = *_c;
+    int i;
+
+    for(i = 0; i < c; i++){
+        int64_t a00 = a[i*4+0+OFFSET]; /* operands are widened to 64 bits before multiplying */
+        int64_t a01 = a[i*4+1+OFFSET];
+        int64_t a10 = a[i*4+2+OFFSET];
+        int64_t a11 = a[i*4+3+OFFSET];
+        int64_t b00 = b[i*4+0+OFFSET];
+        int64_t b01 = b[i*4+1+OFFSET];
+        int64_t b10 = b[i*4+2+OFFSET];
+        int64_t b11 = b[i*4+3+OFFSET];
+        b[i*4+0+OFFSET] = a00 * b00 + a01 * b10; /* 64-bit result is narrowed back to int on store */
+        b[i*4+1+OFFSET] = a00 * b01 + a01 * b11;
+        b[i*4+2+OFFSET] = a10 * b00 + a11 * b10;
+        b[i*4+3+OFFSET] = a10 * b01 + a11 * b11;
+    }
+}
+
+int main(int argc, char *argv[]){ /* Runs MPI_Iallreduce with a non-commutative user op on a datatype displaced by OFFSET ints and checks the result. */
+    MPI_Datatype type;
+    MPI_Op op;
+    int *sbuf, *rbuf, *ans;
+    int nprocs, myrank;
+    int count;
+    int i;
+    MPI_Request rq;
+    MPI_Status st;
+    int ab[1] = {4};
+    MPI_Aint ad[1] = {OFFSET*(MPI_Aint)sizeof(int)}; /* signed arithmetic: OFFSET is negative, size_t math would wrap */
+    MPI_Datatype at[1] = {MPI_INT};
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+    MPI_Type_create_struct(1, ab, ad, at, &type); /* 4 ints located OFFSET ints away from the buffer address */
+    MPI_Type_commit(&type);
+
+    MPI_Op_create(func, 0, &op); /* second argument 0: the op is declared non-commutative */
+
+    count = nprocs*nprocs*nprocs*nprocs ;
+    if (count > MAXCOUNT) count = MAXCOUNT;
+
+    sbuf = malloc(count * 4 * sizeof(int));
+    rbuf = malloc(count * 4 * sizeof(int));
+    ans  = malloc(count * 4 * sizeof(int));
+
+
+    for(i = 0; i < count * 4; i++){
+        sbuf[i] = myrank + i;
+    }
+
+    MPI_Iallreduce(sbuf-OFFSET, rbuf-OFFSET, count, type, op, MPI_COMM_WORLD,&rq); /* pointers biased by -OFFSET so the displaced type lands on element 0 */
+    MPI_Wait(&rq,&st);
+
+    for(i = 0; i < count * 4; i++){
+        ans[i] = ((i+3)&2)>>1; /* 1,0,0,1 repeating: a 2x2 identity per element */
+    }
+
+    for(i = nprocs; i--; ){ /* ranks nprocs-1 down to 0, so ans = M(0) * M(1) * ... * M(nprocs-1) */
+        int c = count;
+        int j;
+        for(j = 0; j < count * 4; j++){
+            sbuf[j] = i + j; /* rebuild the send buffer rank i used */
+        }
+        func(sbuf-OFFSET, ans-OFFSET, &c, NULL);
+    }
+
+    int error_flag=0;
+    for(i = 0; i < count * 4; i++){
+        if(rbuf[i] != ans[i]){
+            if (0 == error_flag) printf("rbuf[%d] = %d, ans[%d] = %d\n", i, rbuf[i], i, ans[i]); /* report only the first mismatch */
+            error_flag++;
+        }
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+    if(error_flag > 0){
+        MPI_Abort(MPI_COMM_WORLD, 100);
+    }
+
+    free(sbuf);
+    free(rbuf);
+    free(ans);
+
+    MPI_Type_free(&type);
+    MPI_Op_free(&op);
+
+    MPI_Finalize();
+
+    return 0;
+}
+
diff --git a/ibm/collective/iallreduce_nocommute_gap_in_place.c b/ibm/collective/iallreduce_nocommute_gap_in_place.c
new file mode 100644
index 0000000..dff0b63
--- /dev/null
+++ b/ibm/collective/iallreduce_nocommute_gap_in_place.c
@@ -0,0 +1,106 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+#define OFFSET (-2345)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* For each of *_c elements: b = a * b as 2x2 int matrices stored a00 a01 a10 a11, located OFFSET ints from the pointer; d is unused. */
+    int *a = _a;
+    int *b = _b;
+    int c = *_c;
+    int i;
+
+    for(i = 0; i < c; i++){
+        int64_t a00 = a[i*4+0+OFFSET]; /* operands are widened to 64 bits before multiplying */
+        int64_t a01 = a[i*4+1+OFFSET];
+        int64_t a10 = a[i*4+2+OFFSET];
+        int64_t a11 = a[i*4+3+OFFSET];
+        int64_t b00 = b[i*4+0+OFFSET];
+        int64_t b01 = b[i*4+1+OFFSET];
+        int64_t b10 = b[i*4+2+OFFSET];
+        int64_t b11 = b[i*4+3+OFFSET];
+        b[i*4+0+OFFSET] = a00 * b00 + a01 * b10; /* 64-bit result is narrowed back to int on store */
+        b[i*4+1+OFFSET] = a00 * b01 + a01 * b11;
+        b[i*4+2+OFFSET] = a10 * b00 + a11 * b10;
+        b[i*4+3+OFFSET] = a10 * b01 + a11 * b11;
+    }
+}
+
+int main(int argc, char *argv[]){ /* Runs MPI_Iallreduce with MPI_IN_PLACE and a non-commutative user op on a datatype displaced by OFFSET ints, then checks the result. */
+    MPI_Datatype type;
+    MPI_Op op;
+    int *sbuf, *rbuf, *ans;
+    int nprocs, myrank;
+    int count;
+    int i;
+    MPI_Request rq;
+    MPI_Status st;
+    int ab[1] = {4};
+    MPI_Aint ad[1] = {OFFSET*(MPI_Aint)sizeof(int)}; /* signed arithmetic: OFFSET is negative, size_t math would wrap */
+    MPI_Datatype at[1] = {MPI_INT};
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+    MPI_Type_create_struct(1, ab, ad, at, &type); /* 4 ints located OFFSET ints away from the buffer address */
+    MPI_Type_commit(&type);
+
+    MPI_Op_create(func, 0, &op); /* second argument 0: the op is declared non-commutative */
+
+    count = nprocs*nprocs*nprocs*nprocs ;
+    if (count > MAXCOUNT) count = MAXCOUNT;
+
+    sbuf = malloc(count * 4 * sizeof(int));
+    rbuf = malloc(count * 4 * sizeof(int));
+    ans  = malloc(count * 4 * sizeof(int));
+
+
+    for(i = 0; i < count * 4; i++){
+        rbuf[i] = myrank + i; /* in-place: the receive buffer carries this rank's contribution */
+    }
+
+    MPI_Iallreduce(MPI_IN_PLACE, rbuf-OFFSET, count, type, op, MPI_COMM_WORLD,&rq); /* pointer biased by -OFFSET so the displaced type lands on element 0 */
+    MPI_Wait(&rq,&st);
+
+    for(i = 0; i < count * 4; i++){
+        ans[i] = ((i+3)&2)>>1; /* 1,0,0,1 repeating: a 2x2 identity per element */
+    }
+
+    for(i = nprocs; i--; ){ /* ranks nprocs-1 down to 0, so ans = M(0) * M(1) * ... * M(nprocs-1) */
+        int c = count;
+        int j;
+        for(j = 0; j < count * 4; j++){
+            sbuf[j] = i + j; /* rebuild the contribution of rank i */
+        }
+        func(sbuf-OFFSET, ans-OFFSET, &c, NULL);
+    }
+
+    int error_flag=0;
+    for(i = 0; i < count * 4; i++){
+        if(rbuf[i] != ans[i]){
+            if (0 == error_flag) printf("rbuf[%d] = %d, ans[%d] = %d\n", i, rbuf[i], i, ans[i]); /* report only the first mismatch */
+            error_flag++;
+        }
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+    if(error_flag > 0){
+        MPI_Abort(MPI_COMM_WORLD, 100);
+    }
+
+    free(sbuf);
+    free(rbuf);
+    free(ans);
+
+    MPI_Type_free(&type);
+    MPI_Op_free(&op);
+
+    MPI_Finalize();
+
+    return 0;
+}
+
diff --git a/ibm/collective/iallreduce_nocommute_in_place.c b/ibm/collective/iallreduce_nocommute_in_place.c
new file mode 100644
index 0000000..d1a571c
--- /dev/null
+++ b/ibm/collective/iallreduce_nocommute_in_place.c
@@ -0,0 +1,101 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* For each of *_c elements: b = a * b as 2x2 int matrices stored a00 a01 a10 a11; d is unused. */
+    int *a = _a;
+    int *b = _b;
+    int c = *_c;
+    int i;
+
+    for(i = 0; i < c; i++){
+        int64_t a00 = a[i*4+0]; /* operands are widened to 64 bits before multiplying */
+        int64_t a01 = a[i*4+1];
+        int64_t a10 = a[i*4+2];
+        int64_t a11 = a[i*4+3];
+        int64_t b00 = b[i*4+0];
+        int64_t b01 = b[i*4+1];
+        int64_t b10 = b[i*4+2];
+        int64_t b11 = b[i*4+3];
+        b[i*4+0] = a00 * b00 + a01 * b10; /* 64-bit result is narrowed back to int on store */
+        b[i*4+1] = a00 * b01 + a01 * b11;
+        b[i*4+2] = a10 * b00 + a11 * b10;
+        b[i*4+3] = a10 * b01 + a11 * b11;
+    }
+}
+
+int main(int argc, char *argv[]){ /* Runs MPI_Iallreduce with MPI_IN_PLACE and a user op created with commute=0, then checks the result against a locally computed product. */
+    MPI_Datatype type;
+    MPI_Op op;
+    int *sbuf, *rbuf, *ans;
+    int nprocs, myrank;
+    int count;
+    int i;
+    MPI_Request rq;
+    MPI_Status st;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+    MPI_Type_contiguous(4, MPI_INT, &type); /* one element = 4 ints, read by func as a 2x2 matrix */
+    MPI_Type_commit(&type);
+
+    MPI_Op_create(func, 0, &op); /* second argument 0: the op is declared non-commutative */
+
+    count = nprocs*nprocs*nprocs*nprocs ;
+    if (count > MAXCOUNT) count = MAXCOUNT;
+
+    sbuf = malloc(count * 4 * sizeof(int));
+    rbuf = malloc(count * 4 * sizeof(int));
+    ans  = malloc(count * 4 * sizeof(int));
+
+
+    for(i = 0; i < count * 4; i++){
+        rbuf[i] = myrank + i; /* in-place: the receive buffer carries this rank's contribution */
+    }
+
+    MPI_Iallreduce(MPI_IN_PLACE, rbuf, count, type, op, MPI_COMM_WORLD,&rq);
+    MPI_Wait(&rq,&st);
+
+    for(i = 0; i < count * 4; i++){
+        ans[i] = ((i+3)&2)>>1; /* 1,0,0,1 repeating: a 2x2 identity per element */
+    }
+
+    for(i = nprocs; i--; ){ /* ranks nprocs-1 down to 0, so ans = M(0) * M(1) * ... * M(nprocs-1) */
+        int c = count;
+        int j;
+        for(j = 0; j < count * 4; j++){
+            sbuf[j] = i + j; /* rebuild the contribution of rank i */
+        }
+        func(sbuf, ans, &c, NULL);
+    }
+
+    int error_flag=0;
+    for(i = 0; i < count * 4; i++){
+        if(rbuf[i] != ans[i]){
+            if (0 == error_flag) printf("rbuf[%d] = %d, ans[%d] = %d\n", i, rbuf[i], i, ans[i]); /* report only the first mismatch */
+            error_flag++;
+        }
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+    if(error_flag > 0){
+        MPI_Abort(MPI_COMM_WORLD, 100);
+    }
+
+    free(sbuf);
+    free(rbuf);
+    free(ans);
+
+    MPI_Type_free(&type);
+    MPI_Op_free(&op);
+
+    MPI_Finalize();
+
+    return 0;
+}
+
diff --git a/ibm/collective/iallreduce_nocommute_stride.c b/ibm/collective/iallreduce_nocommute_stride.c
new file mode 100644
index 0000000..64db179
--- /dev/null
+++ b/ibm/collective/iallreduce_nocommute_stride.c
@@ -0,0 +1,149 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* For each of *_c elements: b = a * b as 2x2 int matrices stored contiguously as a00 a01 a10 a11; d is unused. */
+    int *a = _a;
+    int *b = _b;
+    int c = *_c;
+    int i;
+
+    for(i = 0; i < c; i++){
+        int64_t a00 = a[i*4+0]; /* operands are widened to 64 bits before multiplying */
+        int64_t a01 = a[i*4+1];
+        int64_t a10 = a[i*4+2];
+        int64_t a11 = a[i*4+3];
+        int64_t b00 = b[i*4+0];
+        int64_t b01 = b[i*4+1];
+        int64_t b10 = b[i*4+2];
+        int64_t b11 = b[i*4+3];
+        b[i*4+0] = a00 * b00 + a01 * b10; /* 64-bit result is narrowed back to int on store */
+        b[i*4+1] = a00 * b01 + a01 * b11;
+        b[i*4+2] = a10 * b00 + a11 * b10;
+        b[i*4+3] = a10 * b01 + a11 * b11;
+    }
+}
+
+static void func2(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* Same 2x2 product b = a * b as func, but component k of element i lives at index i + k*c, with c = *_c used as the stride; d is unused. */
+    int *a = _a;
+    int *b = _b;
+    int c = *_c;
+    int i;
+
+    for(i = 0; i < c; i++){
+        int64_t a00 = a[i+0*c]; /* operands are widened to 64 bits before multiplying */
+        int64_t a01 = a[i+1*c];
+        int64_t a10 = a[i+2*c];
+        int64_t a11 = a[i+3*c];
+        int64_t b00 = b[i+0*c];
+        int64_t b01 = b[i+1*c];
+        int64_t b10 = b[i+2*c];
+        int64_t b11 = b[i+3*c];
+        b[i+0*c] = a00 * b00 + a01 * b10; /* 64-bit result is narrowed back to int on store */
+        b[i+1*c] = a00 * b01 + a01 * b11;
+        b[i+2*c] = a10 * b00 + a11 * b10;
+        b[i+3*c] = a10 * b01 + a11 * b11;
+    }
+}
+
+int main(int argc, char *argv[]){ /* Runs MPI_Iallreduce on a strided (vector) datatype with a non-commutative user op and checks it against contiguous and strided local references. */
+    MPI_Datatype type, type2;
+    MPI_Op op;
+    MPI_Op op2;
+    int *sbuf ,        *ans ;
+    int *sbuf2, *rbuf2, *ans2;
+    int         *rbuf3, *ans3;
+    int nprocs, myrank;
+    int count;
+    int i;
+    MPI_Request rq;
+    MPI_Status st;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+    MPI_Op_create(func, 0, &op); /* second argument 0: the op is declared non-commutative */
+    MPI_Op_create(func2, 0, &op2);
+
+    count = nprocs*nprocs*nprocs*nprocs ;
+    if (count > MAXCOUNT) count = MAXCOUNT;
+
+    MPI_Type_vector(4, 1, count, MPI_INT, &type); /* 4 single ints, each count ints apart */
+    MPI_Type_create_resized(type, 0, (MPI_Aint)sizeof(int), &type2); /* extent of one int (was a hard-coded 4) so consecutive elements interleave */
+    MPI_Type_commit(&type2);
+    MPI_Type_free(&type);
+    MPI_Type_contiguous(4, MPI_INT, &type); /* type is reused for the contiguous 4-int layout */
+    MPI_Type_commit(&type);
+
+    sbuf  = malloc(count * 4 * sizeof(int));
+    ans   = malloc(count * 4 * sizeof(int));
+    sbuf2 = malloc(count * 4 * sizeof(int));
+    rbuf2 = malloc(count * 4 * sizeof(int));
+    ans2  = malloc(count * 4 * sizeof(int));
+    rbuf3 = malloc(count * 4 * sizeof(int));
+    ans3  = malloc(count * 4 * sizeof(int));
+
+
+    for(i = 0; i < count * 4; i++){
+        sbuf[i] = myrank + i;
+        ans[i] = ((i+3)&2)>>1; /* 1,0,0,1 repeating: a 2x2 identity per element */
+    }
+
+    MPI_Sendrecv(sbuf, count, type, 0, 0, sbuf2, count, type2, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE); /* self send/recv converts contiguous layout to strided */
+    MPI_Sendrecv(ans , count, type, 0, 0, ans2 , count, type2, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE);
+
+    MPI_Iallreduce(sbuf2, rbuf2, count, type2, op2, MPI_COMM_WORLD,&rq);
+    MPI_Wait(&rq,&st);
+    MPI_Sendrecv(rbuf2 , count, type2, 0, 0, rbuf3 , count, type, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE); /* strided result back to contiguous for comparison */
+
+    for(i = nprocs; i--; ){ /* ranks nprocs-1 down to 0, so ans = M(0) * M(1) * ... * M(nprocs-1) */
+        int c = count;
+        int j;
+        for(j = 0; j < count * 4; j++){
+            sbuf[j] = i + j; /* rebuild the send buffer rank i used */
+        }
+        MPI_Sendrecv(sbuf, count, type, 0, 0, sbuf2, count, type2, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE);
+        func (sbuf , ans , &c, NULL);
+        func2(sbuf2, ans2, &c, NULL);
+    }
+    MPI_Sendrecv(ans2, count, type2, 0, 0, ans3, count, type, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE);
+
+    int error_flag=0;
+
+    for(i = 0; i < count * 4; i++){
+        if(ans3[i] != ans[i]){ /* sanity check: strided and contiguous references must agree */
+            if (0 == error_flag) printf("!!! ans3[%d] = %d, ans[%d] = %d\n", i, ans3[i], i, ans[i]);
+            error_flag++;
+        }
+        if(rbuf3[i] != ans3[i]){
+            if (0 == error_flag) printf("rbuf3[%d] = %d, ans3[%d] = %d\n", i, rbuf3[i], i, ans3[i]);
+            error_flag++;
+        }
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+    if(error_flag > 0){
+        MPI_Abort(MPI_COMM_WORLD, 100);
+    }
+    free(sbuf);
+    free(sbuf2);
+    free(rbuf2);
+    free(rbuf3);
+    free(ans);
+    free(ans2);
+    free(ans3);
+
+    MPI_Type_free(&type);
+    MPI_Type_free(&type2);
+    MPI_Op_free(&op);
+    MPI_Op_free(&op2);
+
+    MPI_Finalize();
+
+    return 0;
+}
+
diff --git a/ibm/collective/iallreduce_nocommute_stride_in_place.c b/ibm/collective/iallreduce_nocommute_stride_in_place.c
new file mode 100644
index 0000000..fbffe98
--- /dev/null
+++ b/ibm/collective/iallreduce_nocommute_stride_in_place.c
@@ -0,0 +1,149 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* For each of *_c elements: b = a * b as 2x2 int matrices stored contiguously as a00 a01 a10 a11; d is unused. */
+    int *a = _a;
+    int *b = _b;
+    int c = *_c;
+    int i;
+
+    for(i = 0; i < c; i++){
+        int64_t a00 = a[i*4+0]; /* operands are widened to 64 bits before multiplying */
+        int64_t a01 = a[i*4+1];
+        int64_t a10 = a[i*4+2];
+        int64_t a11 = a[i*4+3];
+        int64_t b00 = b[i*4+0];
+        int64_t b01 = b[i*4+1];
+        int64_t b10 = b[i*4+2];
+        int64_t b11 = b[i*4+3];
+        b[i*4+0] = a00 * b00 + a01 * b10; /* 64-bit result is narrowed back to int on store */
+        b[i*4+1] = a00 * b01 + a01 * b11;
+        b[i*4+2] = a10 * b00 + a11 * b10;
+        b[i*4+3] = a10 * b01 + a11 * b11;
+    }
+}
+
+static void func2(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* Same 2x2 product b = a * b as func, but component k of element i lives at index i + k*c, with c = *_c used as the stride; d is unused. */
+    int *a = _a;
+    int *b = _b;
+    int c = *_c;
+    int i;
+
+    for(i = 0; i < c; i++){
+        int64_t a00 = a[i+0*c]; /* operands are widened to 64 bits before multiplying */
+        int64_t a01 = a[i+1*c];
+        int64_t a10 = a[i+2*c];
+        int64_t a11 = a[i+3*c];
+        int64_t b00 = b[i+0*c];
+        int64_t b01 = b[i+1*c];
+        int64_t b10 = b[i+2*c];
+        int64_t b11 = b[i+3*c];
+        b[i+0*c] = a00 * b00 + a01 * b10; /* 64-bit result is narrowed back to int on store */
+        b[i+1*c] = a00 * b01 + a01 * b11;
+        b[i+2*c] = a10 * b00 + a11 * b10;
+        b[i+3*c] = a10 * b01 + a11 * b11;
+    }
+}
+
+int main(int argc, char *argv[]){ /* Runs MPI_Iallreduce with MPI_IN_PLACE on a strided (vector) datatype with a non-commutative user op and checks it against local references. */
+    MPI_Datatype type, type2;
+    MPI_Op op;
+    MPI_Op op2;
+    int *sbuf ,        *ans ;
+    int *sbuf2, *rbuf2, *ans2;
+    int         *rbuf3, *ans3;
+    int nprocs, myrank;
+    int count;
+    int i;
+    MPI_Request rq;
+    MPI_Status st;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+    MPI_Op_create(func, 0, &op); /* second argument 0: the op is declared non-commutative */
+    MPI_Op_create(func2, 0, &op2);
+
+    count = nprocs*nprocs*nprocs*nprocs ;
+    if (count > MAXCOUNT) count = MAXCOUNT;
+
+    MPI_Type_vector(4, 1, count, MPI_INT, &type); /* 4 single ints, each count ints apart */
+    MPI_Type_create_resized(type, 0, (MPI_Aint)sizeof(int), &type2); /* extent of one int (was a hard-coded 4) so consecutive elements interleave */
+    MPI_Type_commit(&type2);
+    MPI_Type_free(&type);
+    MPI_Type_contiguous(4, MPI_INT, &type); /* type is reused for the contiguous 4-int layout */
+    MPI_Type_commit(&type);
+
+    sbuf  = malloc(count * 4 * sizeof(int));
+    ans   = malloc(count * 4 * sizeof(int));
+    sbuf2 = malloc(count * 4 * sizeof(int));
+    rbuf2 = malloc(count * 4 * sizeof(int));
+    ans2  = malloc(count * 4 * sizeof(int));
+    rbuf3 = malloc(count * 4 * sizeof(int));
+    ans3  = malloc(count * 4 * sizeof(int));
+
+
+    for(i = 0; i < count * 4; i++){
+        sbuf[i] = myrank + i;
+        ans[i] = ((i+3)&2)>>1; /* 1,0,0,1 repeating: a 2x2 identity per element */
+    }
+
+    MPI_Sendrecv(sbuf, count, type, 0, 0, rbuf2, count, type2, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE); /* in-place: this rank's contribution goes into rbuf2 in strided layout */
+    MPI_Sendrecv(ans , count, type, 0, 0, ans2 , count, type2, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE);
+
+    MPI_Iallreduce(MPI_IN_PLACE, rbuf2, count, type2, op2, MPI_COMM_WORLD,&rq);
+    MPI_Wait(&rq,&st);
+    MPI_Sendrecv(rbuf2 , count, type2, 0, 0, rbuf3 , count, type, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE); /* strided result back to contiguous for comparison */
+
+    for(i = nprocs; i--; ){ /* ranks nprocs-1 down to 0, so ans = M(0) * M(1) * ... * M(nprocs-1) */
+        int c = count;
+        int j;
+        for(j = 0; j < count * 4; j++){
+            sbuf[j] = i + j; /* rebuild the contribution of rank i */
+        }
+        MPI_Sendrecv(sbuf, count, type, 0, 0, sbuf2, count, type2, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE);
+        func (sbuf , ans , &c, NULL);
+        func2(sbuf2, ans2, &c, NULL);
+    }
+    MPI_Sendrecv(ans2, count, type2, 0, 0, ans3, count, type, 0, 0, MPI_COMM_SELF, MPI_STATUS_IGNORE);
+
+    int error_flag=0;
+
+    for(i = 0; i < count * 4; i++){
+        if(ans3[i] != ans[i]){ /* sanity check: strided and contiguous references must agree */
+            if (0 == error_flag) printf("!!! ans3[%d] = %d, ans[%d] = %d\n", i, ans3[i], i, ans[i]);
+            error_flag++;
+        }
+        if(rbuf3[i] != ans3[i]){
+            if (0 == error_flag) printf("rbuf3[%d] = %d, ans3[%d] = %d\n", i, rbuf3[i], i, ans3[i]);
+            error_flag++;
+        }
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+    if(error_flag > 0){
+        MPI_Abort(MPI_COMM_WORLD, 100);
+    }
+    free(sbuf);
+    free(sbuf2);
+    free(rbuf2);
+    free(rbuf3);
+    free(ans);
+    free(ans2);
+    free(ans3);
+
+    MPI_Type_free(&type);
+    MPI_Type_free(&type2);
+    MPI_Op_free(&op);
+    MPI_Op_free(&op2);
+
+    MPI_Finalize();
+
+    return 0;
+}
+
diff --git a/ibm/collective/ireduce_nocommute.c b/ibm/collective/ireduce_nocommute.c
new file mode 100644
index 0000000..7342ed2
--- /dev/null
+++ b/ibm/collective/ireduce_nocommute.c
@@ -0,0 +1,107 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* For each of *_c elements: b = a * b as 2x2 int matrices stored a00 a01 a10 a11; d is unused. */
+    int *a = _a;
+    int *b = _b;
+    int c = *_c;
+    int i;
+
+    for(i = 0; i < c; i++){
+        int64_t a00 = a[i*4+0]; /* operands are widened to 64 bits before multiplying */
+        int64_t a01 = a[i*4+1];
+        int64_t a10 = a[i*4+2];
+        int64_t a11 = a[i*4+3];
+        int64_t b00 = b[i*4+0];
+        int64_t b01 = b[i*4+1];
+        int64_t b10 = b[i*4+2];
+        int64_t b11 = b[i*4+3];
+        b[i*4+0] = a00 * b00 + a01 * b10; /* 64-bit result is narrowed back to int on store */
+        b[i*4+1] = a00 * b01 + a01 * b11;
+        b[i*4+2] = a10 * b00 + a11 * b10;
+        b[i*4+3] = a10 * b01 + a11 * b11;
+    }
+}
+
+int main(int argc, char *argv[]){ /* Runs MPI_Ireduce with a user op created with commute=0, once per root rank, and checks the root's result against a locally computed product. */
+    MPI_Datatype type;
+    MPI_Op op;
+    int *sbuf, *rbuf, *ans;
+    int nprocs, myrank;
+    int count;
+    int i;
+    int root;
+    MPI_Request rq;
+    MPI_Status st;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+    MPI_Type_contiguous(4, MPI_INT, &type); /* one element = 4 ints, read by func as a 2x2 matrix */
+    MPI_Type_commit(&type);
+
+    MPI_Op_create(func, 0, &op); /* second argument 0: the op is declared non-commutative */
+
+    count = nprocs*nprocs*nprocs*nprocs ;
+    if (count > MAXCOUNT) count = MAXCOUNT;
+
+    sbuf = malloc(count * 4 * sizeof(int));
+    rbuf = malloc(count * 4 * sizeof(int));
+    ans  = malloc(count * 4 * sizeof(int));
+
+
+    for(i = 0; i < count * 4; i++){
+        ans[i] = ((i+3)&2)>>1; /* 1,0,0,1 repeating: a 2x2 identity per element */
+    }
+
+    for(i = nprocs; i--; ){ /* ranks nprocs-1 down to 0, so ans = M(0) * M(1) * ... * M(nprocs-1) */
+        int c = count;
+        int j;
+        for(j = 0; j < count * 4; j++){
+            sbuf[j] = i + j; /* the send buffer rank i will use */
+        }
+        func(sbuf, ans, &c, NULL);
+    }
+
+    for (root=0; root<nprocs; root++) {
+        for(i = 0; i < count * 4; i++){
+            sbuf[i] = myrank + i;
+            rbuf[i] = -1; /* reset the receive buffer before each root */
+        }
+
+        MPI_Ireduce(sbuf, rbuf, count, type, op, root, MPI_COMM_WORLD,&rq);
+        MPI_Wait(&rq,&st);
+
+        int error_flag=0;
+        if (root == myrank) { /* only the root's rbuf is compared */
+            for(i = 0; i < count * 4; i++){
+                if(rbuf[i] != ans[i]){
+                    if (0 == error_flag) printf("rbuf[%d] = %d, ans[%d] = %d root = %d\n", i, rbuf[i], i, ans[i], root);
+                    error_flag++;
+                }
+            }
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+        if(error_flag > 0){
+            MPI_Abort(MPI_COMM_WORLD, 100);
+        }
+    }
+
+    free(sbuf);
+    free(rbuf);
+    free(ans);
+
+    MPI_Type_free(&type);
+    MPI_Op_free(&op);
+
+    MPI_Finalize();
+
+    return 0;
+}
+
diff --git a/ibm/collective/ireduce_nocommute_gap.c b/ibm/collective/ireduce_nocommute_gap.c
new file mode 100644
index 0000000..1615922
--- /dev/null
+++ b/ibm/collective/ireduce_nocommute_gap.c
@@ -0,0 +1,112 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+#define OFFSET (-2345)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* For each of *_c elements: b = a * b as 2x2 int matrices stored a00 a01 a10 a11, located OFFSET ints from the pointer; d is unused. */
+    int *a = _a;
+    int *b = _b;
+    int c = *_c;
+    int i;
+
+    for(i = 0; i < c; i++){
+        int64_t a00 = a[i*4+0+OFFSET]; /* operands are widened to 64 bits before multiplying */
+        int64_t a01 = a[i*4+1+OFFSET];
+        int64_t a10 = a[i*4+2+OFFSET];
+        int64_t a11 = a[i*4+3+OFFSET];
+        int64_t b00 = b[i*4+0+OFFSET];
+        int64_t b01 = b[i*4+1+OFFSET];
+        int64_t b10 = b[i*4+2+OFFSET];
+        int64_t b11 = b[i*4+3+OFFSET];
+        b[i*4+0+OFFSET] = a00 * b00 + a01 * b10; /* 64-bit result is narrowed back to int on store */
+        b[i*4+1+OFFSET] = a00 * b01 + a01 * b11;
+        b[i*4+2+OFFSET] = a10 * b00 + a11 * b10;
+        b[i*4+3+OFFSET] = a10 * b01 + a11 * b11;
+    }
+}
+
+int main(int argc, char *argv[]){ /* Runs MPI_Ireduce with a non-commutative user op on a datatype displaced by OFFSET ints, once per root rank, and checks the root's result. */
+    MPI_Datatype type;
+    MPI_Op op;
+    int *sbuf, *rbuf, *ans;
+    int nprocs, myrank;
+    int count;
+    int i;
+    int root;
+    int ab[1] = {4};
+    MPI_Aint ad[1] = {OFFSET*(MPI_Aint)sizeof(int)}; /* signed arithmetic: OFFSET is negative, size_t math would wrap */
+    MPI_Datatype at[1] = {MPI_INT};
+    MPI_Request rq;
+    MPI_Status st;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+    MPI_Type_create_struct(1, ab, ad, at, &type); /* 4 ints located OFFSET ints away from the buffer address */
+    MPI_Type_commit(&type);
+
+    MPI_Op_create(func, 0, &op); /* second argument 0: the op is declared non-commutative */
+
+    count = nprocs*nprocs*nprocs*nprocs ;
+    if (count > MAXCOUNT) count = MAXCOUNT;
+
+    sbuf = malloc(count * 4 * sizeof(int));
+    rbuf = malloc(count * 4 * sizeof(int));
+    ans  = malloc(count * 4 * sizeof(int));
+
+
+    for(i = 0; i < count * 4; i++){
+        ans[i] = ((i+3)&2)>>1; /* 1,0,0,1 repeating: a 2x2 identity per element */
+    }
+
+    for(i = nprocs; i--; ){ /* ranks nprocs-1 down to 0, so ans = M(0) * M(1) * ... * M(nprocs-1) */
+        int c = count;
+        int j;
+        for(j = 0; j < count * 4; j++){
+            sbuf[j] = i + j; /* the send buffer rank i will use */
+        }
+        func(sbuf-OFFSET, ans-OFFSET, &c, NULL); /* func indexes at +OFFSET, so bias both pointers by -OFFSET */
+    }
+
+    for (root=0; root<nprocs; root++) {
+        for(i = 0; i < count * 4; i++){
+            sbuf[i] = myrank + i;
+            rbuf[i] = -1; /* reset the receive buffer before each root */
+        }
+
+        MPI_Ireduce(sbuf-OFFSET, rbuf-OFFSET, count, type, op, root, MPI_COMM_WORLD,&rq);
+        MPI_Wait(&rq,&st);
+
+        int error_flag=0;
+        if (root == myrank) { /* only the root's rbuf is compared */
+            for(i = 0; i < count * 4; i++){
+                if(rbuf[i] != ans[i]){
+                    if (0 == error_flag) printf("rbuf[%d] = %d, ans[%d] = %d root = %d\n", i, rbuf[i], i, ans[i], root);
+                    error_flag++;
+                }
+            }
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+        if(error_flag > 0){
+            MPI_Abort(MPI_COMM_WORLD, 100);
+        }
+    }
+
+    free(sbuf);
+    free(rbuf);
+    free(ans);
+
+    MPI_Type_free(&type);
+    MPI_Op_free(&op);
+
+    MPI_Finalize();
+
+    return 0;
+}
+
diff --git a/ibm/collective/ireduce_nocommute_gap_in_place.c b/ibm/collective/ireduce_nocommute_gap_in_place.c
new file mode 100644
index 0000000..232773f
--- /dev/null
+++ b/ibm/collective/ireduce_nocommute_gap_in_place.c
@@ -0,0 +1,120 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+#define OFFSET (-2345)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* For each of *_c elements: b = a * b as 2x2 int matrices stored a00 a01 a10 a11, located OFFSET ints from the pointer; d is unused. */
+    int *a = _a;
+    int *b = _b;
+    int c = *_c;
+    int i;
+
+    for(i = 0; i < c; i++){
+        int64_t a00 = a[i*4+0+OFFSET]; /* operands are widened to 64 bits before multiplying */
+        int64_t a01 = a[i*4+1+OFFSET];
+        int64_t a10 = a[i*4+2+OFFSET];
+        int64_t a11 = a[i*4+3+OFFSET];
+        int64_t b00 = b[i*4+0+OFFSET];
+        int64_t b01 = b[i*4+1+OFFSET];
+        int64_t b10 = b[i*4+2+OFFSET];
+        int64_t b11 = b[i*4+3+OFFSET];
+        b[i*4+0+OFFSET] = a00 * b00 + a01 * b10; /* 64-bit result is narrowed back to int on store */
+        b[i*4+1+OFFSET] = a00 * b01 + a01 * b11;
+        b[i*4+2+OFFSET] = a10 * b00 + a11 * b10;
+        b[i*4+3+OFFSET] = a10 * b01 + a11 * b11;
+    }
+}
+
+int main(int argc, char *argv[]){ /* Runs MPI_Ireduce (MPI_IN_PLACE at the root) with a non-commutative user op on a datatype displaced by OFFSET ints, once per root rank. */
+    MPI_Datatype type;
+    MPI_Op op;
+    int *sbuf, *rbuf, *ans;
+    int nprocs, myrank;
+    int count;
+    int i;
+    int root;
+    MPI_Request rq;
+    MPI_Status st;
+    int ab[1] = {4};
+    MPI_Aint ad[1] = {OFFSET*(MPI_Aint)sizeof(int)}; /* signed arithmetic: OFFSET is negative, size_t math would wrap */
+    MPI_Datatype at[1] = {MPI_INT};
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+    MPI_Type_create_struct(1, ab, ad, at, &type); /* 4 ints located OFFSET ints away from the buffer address */
+    MPI_Type_commit(&type);
+
+    MPI_Op_create(func, 0, &op); /* second argument 0: the op is declared non-commutative */
+
+    count = nprocs*nprocs*nprocs*nprocs ;
+    if (count > MAXCOUNT) count = MAXCOUNT;
+
+    sbuf = malloc(count * 4 * sizeof(int));
+    rbuf = malloc(count * 4 * sizeof(int));
+    ans  = malloc(count * 4 * sizeof(int));
+
+
+    for(i = 0; i < count * 4; i++){
+        ans[i] = ((i+3)&2)>>1; /* 1,0,0,1 repeating: a 2x2 identity per element */
+    }
+
+    for(i = nprocs; i--; ){ /* ranks nprocs-1 down to 0, so ans = M(0) * M(1) * ... * M(nprocs-1) */
+        int c = count;
+        int j;
+        for(j = 0; j < count * 4; j++){
+            sbuf[j] = i + j; /* the contribution rank i will make */
+        }
+        func(sbuf-OFFSET, ans-OFFSET, &c, NULL); /* fix: func indexes at +OFFSET; unbiased pointers made it access memory before both buffers */
+    }
+
+    for (root=0; root<nprocs; root++) {
+        for(i = 0; i < count * 4; i++){
+            if(root == myrank) {
+                rbuf[i] = myrank + i; /* in-place at the root: rbuf carries the root's contribution */
+            } else {
+                sbuf[i] = myrank + i;
+                rbuf[i] = -1;
+            }
+        }
+
+        if (root == myrank) {
+            MPI_Ireduce(MPI_IN_PLACE, rbuf-OFFSET, count, type, op, root, MPI_COMM_WORLD,&rq);
+        } else {
+            MPI_Ireduce(sbuf-OFFSET, rbuf-OFFSET, count, type, op, root, MPI_COMM_WORLD,&rq);
+        }
+        MPI_Wait(&rq,&st);
+
+        int error_flag=0;
+        if (root == myrank) { /* only the root's rbuf is compared */
+            for(i = 0; i < count * 4; i++){
+                if(rbuf[i] != ans[i]){
+                    if (0 == error_flag) printf("rbuf[%d] = %d, ans[%d] = %d root = %d\n", i, rbuf[i], i, ans[i], root);
+                    error_flag++;
+                }
+            }
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+        if(error_flag > 0){
+            MPI_Abort(MPI_COMM_WORLD, 100);
+        }
+    }
+
+    free(sbuf);
+    free(rbuf);
+    free(ans);
+
+    MPI_Type_free(&type);
+    MPI_Op_free(&op);
+
+    MPI_Finalize();
+
+    return 0;
+}
+
diff --git a/ibm/collective/ireduce_nocommute_in_place.c b/ibm/collective/ireduce_nocommute_in_place.c
new file mode 100644
index 0000000..4d463ed
--- /dev/null
+++ b/ibm/collective/ireduce_nocommute_in_place.c
@@ -0,0 +1,115 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* For each of *_c elements: b = a * b as 2x2 int matrices stored a00 a01 a10 a11; d is unused. */
+    int *a = _a;
+    int *b = _b;
+    int c = *_c;
+    int i;
+
+    for(i = 0; i < c; i++){
+        int64_t a00 = a[i*4+0]; /* operands are widened to 64 bits before multiplying */
+        int64_t a01 = a[i*4+1];
+        int64_t a10 = a[i*4+2];
+        int64_t a11 = a[i*4+3];
+        int64_t b00 = b[i*4+0];
+        int64_t b01 = b[i*4+1];
+        int64_t b10 = b[i*4+2];
+        int64_t b11 = b[i*4+3];
+        b[i*4+0] = a00 * b00 + a01 * b10; /* 64-bit result is narrowed back to int on store */
+        b[i*4+1] = a00 * b01 + a01 * b11;
+        b[i*4+2] = a10 * b00 + a11 * b10;
+        b[i*4+3] = a10 * b01 + a11 * b11;
+    }
+}
+
+int main(int argc, char *argv[]){ /* Runs MPI_Ireduce (MPI_IN_PLACE at the root) with a user op created with commute=0, once per root rank, and checks the root's result. */
+    MPI_Datatype type;
+    MPI_Op op;
+    int *sbuf, *rbuf, *ans;
+    int nprocs, myrank;
+    int count;
+    int i;
+    int root;
+    MPI_Request rq;
+    MPI_Status st;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+    MPI_Type_contiguous(4, MPI_INT, &type); /* one element = 4 ints, read by func as a 2x2 matrix */
+    MPI_Type_commit(&type);
+
+    MPI_Op_create(func, 0, &op); /* second argument 0: the op is declared non-commutative */
+
+    count = nprocs*nprocs*nprocs*nprocs ;
+    if (count > MAXCOUNT) count = MAXCOUNT;
+
+    sbuf = malloc(count * 4 * sizeof(int));
+    rbuf = malloc(count * 4 * sizeof(int));
+    ans  = malloc(count * 4 * sizeof(int));
+
+
+    for(i = 0; i < count * 4; i++){
+        ans[i] = ((i+3)&2)>>1; /* 1,0,0,1 repeating: a 2x2 identity per element */
+    }
+
+    for(i = nprocs; i--; ){ /* ranks nprocs-1 down to 0, so ans = M(0) * M(1) * ... * M(nprocs-1) */
+        int c = count;
+        int j;
+        for(j = 0; j < count * 4; j++){
+            sbuf[j] = i + j; /* the contribution rank i will make */
+        }
+        func(sbuf, ans, &c, NULL);
+    }
+
+    for (root=0; root<nprocs; root++) {
+        for(i = 0; i < count * 4; i++){
+            if(root == myrank) {
+                rbuf[i] = myrank + i; /* in-place at the root: rbuf carries the root's contribution */
+            } else {
+                sbuf[i] = myrank + i;
+                rbuf[i] = -1;
+            }
+        }
+
+        if (root == myrank) {
+            MPI_Ireduce(MPI_IN_PLACE, rbuf, count, type, op, root, MPI_COMM_WORLD,&rq);
+        } else {
+            MPI_Ireduce(sbuf, rbuf, count, type, op, root, MPI_COMM_WORLD,&rq);
+        }
+        MPI_Wait(&rq,&st);
+
+        int error_flag=0;
+        if (root == myrank) { /* only the root's rbuf is compared */
+            for(i = 0; i < count * 4; i++){
+                if(rbuf[i] != ans[i]){
+                    if (0 == error_flag) printf("rbuf[%d] = %d, ans[%d] = %d root = %d\n", i, rbuf[i], i, ans[i], root);
+                    error_flag++;
+                }
+            }
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+        if(error_flag > 0){
+            MPI_Abort(MPI_COMM_WORLD, 100);
+        }
+    }
+
+    free(sbuf);
+    free(rbuf);
+    free(ans);
+
+    MPI_Type_free(&type);
+    MPI_Op_free(&op);
+
+    MPI_Finalize();
+
+    return 0;
+}
+
diff --git a/ibm/collective/reduce_nocommute.c b/ibm/collective/reduce_nocommute.c
new file mode 100644
index 0000000..f994568
--- /dev/null
+++ b/ibm/collective/reduce_nocommute.c
@@ -0,0 +1,104 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* For each of *_c elements: b = a * b as 2x2 int matrices stored a00 a01 a10 a11; d is unused. */
+    int *a = _a;
+    int *b = _b;
+    int c = *_c;
+    int i;
+
+    for(i = 0; i < c; i++){
+        int64_t a00 = a[i*4+0]; /* operands are widened to 64 bits before multiplying */
+        int64_t a01 = a[i*4+1];
+        int64_t a10 = a[i*4+2];
+        int64_t a11 = a[i*4+3];
+        int64_t b00 = b[i*4+0];
+        int64_t b01 = b[i*4+1];
+        int64_t b10 = b[i*4+2];
+        int64_t b11 = b[i*4+3];
+        b[i*4+0] = a00 * b00 + a01 * b10; /* 64-bit result is narrowed back to int on store */
+        b[i*4+1] = a00 * b01 + a01 * b11;
+        b[i*4+2] = a10 * b00 + a11 * b10;
+        b[i*4+3] = a10 * b01 + a11 * b11;
+    }
+}
+
+int main(int argc, char *argv[]){ /* Runs blocking MPI_Reduce with a user op created with commute=0, once per root rank, and checks the root's result against a locally computed product. */
+    MPI_Datatype type;
+    MPI_Op op;
+    int *sbuf, *rbuf, *ans;
+    int nprocs, myrank;
+    int count;
+    int i;
+    int root;
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+    MPI_Type_contiguous(4, MPI_INT, &type); /* one element = 4 ints, read by func as a 2x2 matrix */
+    MPI_Type_commit(&type);
+
+    MPI_Op_create(func, 0, &op); /* second argument 0: the op is declared non-commutative */
+
+    count = nprocs*nprocs*nprocs*nprocs ;
+    if (count > MAXCOUNT) count = MAXCOUNT;
+
+    sbuf = malloc(count * 4 * sizeof(int));
+    rbuf = malloc(count * 4 * sizeof(int));
+    ans  = malloc(count * 4 * sizeof(int));
+
+
+    for(i = 0; i < count * 4; i++){
+        ans[i] = ((i+3)&2)>>1; /* 1,0,0,1 repeating: a 2x2 identity per element */
+    }
+
+    for(i = nprocs; i--; ){ /* ranks nprocs-1 down to 0, so ans = M(0) * M(1) * ... * M(nprocs-1) */
+        int c = count;
+        int j;
+        for(j = 0; j < count * 4; j++){
+            sbuf[j] = i + j; /* the send buffer rank i will use */
+        }
+        func(sbuf, ans, &c, NULL);
+    }
+
+    for (root=0; root<nprocs; root++) {
+        for(i = 0; i < count * 4; i++){
+            sbuf[i] = myrank + i;
+            rbuf[i] = -1; /* reset the receive buffer before each root */
+        }
+
+        MPI_Reduce(sbuf, rbuf, count, type, op, root, MPI_COMM_WORLD);
+
+        int error_flag=0;
+        if (root == myrank) { /* only the root's rbuf is compared */
+            for(i = 0; i < count * 4; i++){
+                if(rbuf[i] != ans[i]){
+                    if (0 == error_flag) printf("rbuf[%d] = %d, ans[%d] = %d root = %d\n", i, rbuf[i], i, ans[i], root);
+                    error_flag++;
+                }
+            }
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+        if(error_flag > 0){
+            MPI_Abort(MPI_COMM_WORLD, 100);
+        }
+    }
+
+    free(sbuf);
+    free(rbuf);
+    free(ans);
+
+    MPI_Type_free(&type);
+    MPI_Op_free(&op);
+
+    MPI_Finalize();
+
+    return 0;
+}
+
diff --git a/ibm/collective/reduce_nocommute_gap.c b/ibm/collective/reduce_nocommute_gap.c
new file mode 100644
index 0000000..eee36b3
--- /dev/null
+++ b/ibm/collective/reduce_nocommute_gap.c
@@ -0,0 +1,109 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+#define OFFSET (-2345)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* For each of *_c elements: b = a * b as 2x2 int matrices stored a00 a01 a10 a11, located OFFSET ints from the pointer; d is unused. */
+    int *a = _a;
+    int *b = _b;
+    int c = *_c;
+    int i;
+
+    for(i = 0; i < c; i++){
+        int64_t a00 = a[i*4+0+OFFSET]; /* operands are widened to 64 bits before multiplying */
+        int64_t a01 = a[i*4+1+OFFSET];
+        int64_t a10 = a[i*4+2+OFFSET];
+        int64_t a11 = a[i*4+3+OFFSET];
+        int64_t b00 = b[i*4+0+OFFSET];
+        int64_t b01 = b[i*4+1+OFFSET];
+        int64_t b10 = b[i*4+2+OFFSET];
+        int64_t b11 = b[i*4+3+OFFSET];
+        b[i*4+0+OFFSET] = a00 * b00 + a01 * b10; /* 64-bit result is narrowed back to int on store */
+        b[i*4+1+OFFSET] = a00 * b01 + a01 * b11;
+        b[i*4+2+OFFSET] = a10 * b00 + a11 * b10;
+        b[i*4+3+OFFSET] = a10 * b01 + a11 * b11;
+    }
+}
+
+int main(int argc, char *argv[]){ /* Runs blocking MPI_Reduce with a non-commutative user op on a datatype displaced by OFFSET ints, once per root rank, and checks the root's result. */
+    MPI_Datatype type;
+    MPI_Op op;
+    int *sbuf, *rbuf, *ans;
+    int nprocs, myrank;
+    int count;
+    int i;
+    int root;
+    int ab[1] = {4};
+    MPI_Aint ad[1] = {OFFSET*(MPI_Aint)sizeof(int)}; /* signed arithmetic: OFFSET is negative, size_t math would wrap */
+    MPI_Datatype at[1] = {MPI_INT};
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+    MPI_Type_create_struct(1, ab, ad, at, &type); /* 4 ints located OFFSET ints away from the buffer address */
+    MPI_Type_commit(&type);
+
+    MPI_Op_create(func, 0, &op); /* second argument 0: the op is declared non-commutative */
+
+    count = nprocs*nprocs*nprocs*nprocs ;
+    if (count > MAXCOUNT) count = MAXCOUNT;
+
+    sbuf = malloc(count * 4 * sizeof(int));
+    rbuf = malloc(count * 4 * sizeof(int));
+    ans  = malloc(count * 4 * sizeof(int));
+
+
+    for(i = 0; i < count * 4; i++){
+        ans[i] = ((i+3)&2)>>1; /* 1,0,0,1 repeating: a 2x2 identity per element */
+    }
+
+    for(i = nprocs; i--; ){ /* ranks nprocs-1 down to 0, so ans = M(0) * M(1) * ... * M(nprocs-1) */
+        int c = count;
+        int j;
+        for(j = 0; j < count * 4; j++){
+            sbuf[j] = i + j; /* the send buffer rank i will use */
+        }
+        func(sbuf-OFFSET, ans-OFFSET, &c, NULL); /* fix: func indexes at +OFFSET; unbiased pointers made it access memory before both buffers */
+    }
+
+    for (root=0; root<nprocs; root++) {
+        for(i = 0; i < count * 4; i++){
+            sbuf[i] = myrank + i;
+            rbuf[i] = -1; /* reset the receive buffer before each root */
+        }
+
+        MPI_Reduce(sbuf-OFFSET, rbuf-OFFSET, count, type, op, root, MPI_COMM_WORLD);
+
+        int error_flag=0;
+        if (root == myrank) { /* only the root's rbuf is compared */
+            for(i = 0; i < count * 4; i++){
+                if(rbuf[i] != ans[i]){
+                    if (0 == error_flag) printf("rbuf[%d] = %d, ans[%d] = %d root = %d\n", i, rbuf[i], i, ans[i], root);
+                    error_flag++;
+                }
+            }
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+        if(error_flag > 0){
+            MPI_Abort(MPI_COMM_WORLD, 100);
+        }
+    }
+
+    free(sbuf);
+    free(rbuf);
+    free(ans);
+
+    MPI_Type_free(&type);
+    MPI_Op_free(&op);
+
+    MPI_Finalize();
+
+    return 0;
+}
+
diff --git a/ibm/collective/reduce_nocommute_gap_in_place.c b/ibm/collective/reduce_nocommute_gap_in_place.c
new file mode 100644
index 0000000..bfb51e3
--- /dev/null
+++ b/ibm/collective/reduce_nocommute_gap_in_place.c
@@ -0,0 +1,118 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+#define OFFSET (-2345)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* For each of *_c elements: b = a * b as 2x2 int matrices stored a00 a01 a10 a11, located OFFSET ints from the pointer; d is unused. */
+    int *a = _a;
+    int *b = _b;
+    int c = *_c;
+    int i;
+
+    for(i = 0; i < c; i++){
+        int64_t a00 = a[i*4+0+OFFSET]; /* operands are widened to 64 bits before multiplying */
+        int64_t a01 = a[i*4+1+OFFSET];
+        int64_t a10 = a[i*4+2+OFFSET];
+        int64_t a11 = a[i*4+3+OFFSET];
+        int64_t b00 = b[i*4+0+OFFSET];
+        int64_t b01 = b[i*4+1+OFFSET];
+        int64_t b10 = b[i*4+2+OFFSET];
+        int64_t b11 = b[i*4+3+OFFSET];
+        b[i*4+0+OFFSET] = a00 * b00 + a01 * b10; /* 64-bit result is narrowed back to int on store */
+        b[i*4+1+OFFSET] = a00 * b01 + a01 * b11;
+        b[i*4+2+OFFSET] = a10 * b00 + a11 * b10;
+        b[i*4+3+OFFSET] = a10 * b01 + a11 * b11;
+    }
+}
+
+int main(int argc, char *argv[]){ /* Runs blocking MPI_Reduce (MPI_IN_PLACE at the root) with a non-commutative user op on a datatype displaced by OFFSET ints, once per root rank. */
+    MPI_Datatype type;
+    MPI_Op op;
+    int *sbuf, *rbuf, *ans;
+    int nprocs, myrank;
+    int count;
+    int i;
+    int root;
+    int ab[1] = {4};
+    MPI_Aint ad[1] = {OFFSET*(MPI_Aint)sizeof(int)}; /* signed arithmetic: OFFSET is negative, size_t math would wrap */
+    MPI_Datatype at[1] = {MPI_INT};
+
+    MPI_Init(&argc, &argv);
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+    MPI_Type_create_struct(1, ab, ad, at, &type); /* 4 ints located OFFSET ints away from the buffer address */
+    MPI_Type_commit(&type);
+
+    MPI_Op_create(func, 0, &op); /* second argument 0: the op is declared non-commutative */
+
+    count = nprocs*nprocs*nprocs*nprocs ;
+    if (count > MAXCOUNT) count = MAXCOUNT;
+
+    sbuf = malloc(count * 4 * sizeof(int));
+    rbuf = malloc(count * 4 * sizeof(int));
+    ans  = malloc(count * 4 * sizeof(int));
+
+
+    for(i = 0; i < count * 4; i++){
+        ans[i] = ((i+3)&2)>>1; /* 1,0,0,1 repeating: a 2x2 identity per element */
+    }
+
+    for(i = nprocs; i--; ){ /* ranks nprocs-1 down to 0, so ans = M(0) * M(1) * ... * M(nprocs-1) */
+        int c = count;
+        int j;
+        for(j = 0; j < count * 4; j++){
+            sbuf[j] = i + j; /* the contribution rank i will make */
+        }
+        func(sbuf-OFFSET, ans-OFFSET, &c, NULL); /* fix: func indexes at +OFFSET; unbiased pointers made it access memory before both buffers */
+    }
+
+    for (root=0; root<nprocs; root++) {
+        for(i = 0; i < count * 4; i++){
+            if(root == myrank) {
+                rbuf[i] = myrank + i; /* in-place at the root: rbuf carries the root's contribution */
+            } else {
+                sbuf[i] = myrank + i;
+                rbuf[i] = -1;
+            }
+        }
+
+        if (root == myrank) {
+            MPI_Reduce(MPI_IN_PLACE, rbuf-OFFSET, count, type, op, root, MPI_COMM_WORLD);
+        } else {
+            MPI_Reduce(sbuf-OFFSET, rbuf-OFFSET, count, type, op, root, MPI_COMM_WORLD);
+        }
+
+
+        int error_flag=0;
+        if (root == myrank) { /* only the root's rbuf is compared */
+            for(i = 0; i < count * 4; i++){
+                if(rbuf[i] != ans[i]){
+                    if (0 == error_flag) printf("rbuf[%d] = %d, ans[%d] = %d root = %d\n", i, rbuf[i], i, ans[i], root);
+                    error_flag++;
+                }
+            }
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+        if(error_flag > 0){
+            MPI_Abort(MPI_COMM_WORLD, 100);
+        }
+    }
+
+    free(sbuf);
+    free(rbuf);
+    free(ans);
+
+    MPI_Type_free(&type);
+    MPI_Op_free(&op);
+
+    MPI_Finalize();
+
+    return 0;
+}
+
diff --git a/ibm/collective/reduce_nocommute_in_place.c b/ibm/collective/reduce_nocommute_in_place.c
new file mode 100644
index 0000000..9322e5a
--- /dev/null
+++ b/ibm/collective/reduce_nocommute_in_place.c
@@ -0,0 +1,112 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mpi.h>
+
+#define MAXCOUNT (1024*1024)
+
+static void func(void *_a, void *_b, int *_c, MPI_Datatype *d){ /* Reduction callback: for each of *_c elements, overwrite b with the 2x2 integer matrix product a*b; d is not used. */
+ int *a = _a; /* first operand, only read */
+ int *b = _b; /* second operand, overwritten with the product */
+ int c = *_c; /* number of 4-int elements to process */
+ int i;
+ /* Each element is 4 consecutive ints read as a row-major 2x2 matrix. */
+ for(i = 0; i < c; i++){
+ int64_t a00 = a[i*4+0]; /* all eight inputs are copied to int64_t locals before b is written */
+ int64_t a01 = a[i*4+1];
+ int64_t a10 = a[i*4+2];
+ int64_t a11 = a[i*4+3];
+ int64_t b00 = b[i*4+0];
+ int64_t b01 = b[i*4+1];
+ int64_t b10 = b[i*4+2];
+ int64_t b11 = b[i*4+3];
+ b[i*4+0] = a00 * b00 + a01 * b10; /* products are formed in int64_t, then converted to int on store */
+ b[i*4+1] = a00 * b01 + a01 * b11;
+ b[i*4+2] = a10 * b00 + a11 * b10;
+ b[i*4+3] = a10 * b01 + a11 * b11;
+ }
+}
+
+int main(int argc, char *argv[]){ /* Test driver: for every root, run MPI_Reduce with the user-defined op (MPI_IN_PLACE at the root) and compare with a locally computed answer. */
+ MPI_Datatype type;
+ MPI_Op op;
+ int *sbuf, *rbuf, *ans; /* send buffer, receive buffer, expected result */
+ int nprocs, myrank;
+ int count;
+ int i;
+ int root;
+
+ MPI_Init(&argc, &argv);
+
+ MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+ MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+ MPI_Type_contiguous(4, MPI_INT, &type); /* element type: 4 contiguous ints, matching the 4-int stride func uses */
+ MPI_Type_commit(&type);
+
+ MPI_Op_create(func, 0, &op); /* second argument (commute) is 0: the op is registered as non-commutative */
+
+ count = nprocs*nprocs*nprocs*nprocs ; /* NOTE(review): int product; can overflow before the cap once nprocs exceeds roughly 215 */
+ if (count > MAXCOUNT) count = MAXCOUNT; /* cap the element count at MAXCOUNT */
+
+ sbuf = malloc(count * 4 * sizeof(int)); /* each buffer holds count elements of 4 ints */
+ rbuf = malloc(count * 4 * sizeof(int)); /* NOTE(review): none of the three malloc results is checked for NULL */
+ ans = malloc(count * 4 * sizeof(int));
+
+ /* Build the expected result locally: seed ans, then apply func once per rank. */
+ for(i = 0; i < count * 4; i++){
+ ans[i] = ((i+3)&2)>>1; /* repeating 1,0,0,1: every element starts as the 2x2 identity in func's layout */
+ }
+
+ for(i = nprocs; i--; ){ /* body sees i = nprocs-1 down to 0 */
+ int c = count;
+ int j;
+ for(j = 0; j < count * 4; j++){
+ sbuf[j] = i + j; /* same fill rule each rank uses below (rank + index) */
+ }
+ func(sbuf, ans, &c, NULL); /* ans = (rank i data) * ans, so the final ans is the product in rank order 0..nprocs-1 */
+ }
+
+ for (root=0; root<nprocs; root++) { /* repeat the reduction with every rank as root */
+ for(i = 0; i < count * 4; i++){
+ if(root == myrank) {
+ rbuf[i] = myrank + i; /* root's contribution lives in rbuf because it reduces in place */
+ } else {
+ sbuf[i] = myrank + i;
+ rbuf[i] = -1; /* sentinel on non-root ranks */
+ }
+ }
+
+ if (root == myrank) {
+ MPI_Reduce(MPI_IN_PLACE, rbuf, count, type, op, root, MPI_COMM_WORLD); /* root: input taken from and result written to rbuf */
+ } else {
+ MPI_Reduce(sbuf, rbuf, count, type, op, root, MPI_COMM_WORLD);
+ }
+ /* Only the root verifies; error_flag stays 0 on every other rank. */
+ int error_flag=0;
+ if (root == myrank) {
+ for(i = 0; i < count * 4; i++){
+ if(rbuf[i] != ans[i]){
+ if (0 == error_flag) printf("rbuf[%d] = %d, ans[%d] = %d root = %d\n", i, rbuf[i], i, ans[i], root);
+ error_flag++; /* counts every mismatch; only the first one is printed */
+ }
+ }
+ }
+ MPI_Barrier(MPI_COMM_WORLD);
+ if(error_flag > 0){
+ MPI_Abort(MPI_COMM_WORLD, 100); /* abort the whole job with error code 100 on any mismatch */
+ }
+ }
+
+ free(sbuf);
+ free(rbuf);
+ free(ans);
+
+ MPI_Type_free(&type);
+ MPI_Op_free(&op);
+
+ MPI_Finalize();
+
+ return 0;
+}
+ |
@ggouaillardet, thank you for your PR (blocking allreduce fix) and test. I wrote a test using iallreduce with an intercommunicator and a non-commutative op in my gist |
Thanks for the test case, I pushed another commit into #1760 |
Thanks @ggouaillardet. She is my colleague. I can access ompi-tests and can show her the pull request and the repository. |
Thanks @ggouaillardet. I looked at your ompi-tests (thanks to @kawashima-fj). |
Thanks, i will take care of that tomorrow |
@yukiM-fj I updated both my #1760 and the ompi-tests repo. Note that your tests only work if the local and remote groups have the same size, and a necessary condition is to run on an even number of tasks. |
@ggouaillardet I looked at your PR and test. |
@yukiM-fj master, v2.x and ompi-tests repo have been updated |
@ggouaillardet Sorry for the delay in closing this issue. I am closing it because the PR has already been merged. |
Our team (Fujitsu MPI team) found a problem in libnbc in Open MPI 2.0.0rc1.
Results may be wrong when using non-blocking collectives with user-defined op.
The problem occurs for the following two reasons:
I wrote a program to reproduce this problem in a gist.
We fixed libnbc in the following ways:
I wrote two files to show pseudo-code to fix it at another gist.
These files are based on Open MPI v1.8.4. In my gist, "psedo-alg-selection.c" is for the algorithm selection and behavior and "pseudo-wrapper_libnbcop.c" is for the wrapper.
The text was updated successfully, but these errors were encountered: