Description
While implementing the cdesc
stuff for the use mpi_f08
bindings, I noted some surprising performance numbers.
This can be evidenced with use mpi
(e.g. no need for a modern Fortran 2008 compiler) with the code below
TL;DR do a pingpong with the buf(1:1048576:2)
subarray, and compare standard Fortran (it will automagically perform a copy under the hood) and the (manual) use of ddt.
$ mpirun -np 2 --mca pml ob1 --mca btl tcp,self ./pp
BW fortran = 0.49293196172865800 GW/s
BW ddt = 0.24797224656726258 GW/s
to my surprise, copying boosts the performance by a factor 2 !
I digged this, and found that the bottleneck comes from opal_generic_simple_[un]pack_function()
, when [UN]PACK_PREDEFINED_DATATYPE()
calls MEMCPY_CSUM()
with size=4
If I replace memcpy(dest, src, 4)
with *(int *)dest = *(int *)src
, then I get much better performances (more than a 4x improvement).
BW fortran = 0.47470623052957683 GW/s
BW ddt = 1.1068177049734438 GW/s
The inline patch below was used. Note that even if it is big and only support OPAL_INT4
, macros could be used to support the other predefined datatypes. it could also be improved to support more datatypes (for example MPI_Type_vector(..., 2, 4, MPI_INT, ...)
)
Please let me know if and how I should move forward.
Incidentally, I noted CONVERTOR_WITH_CHECKSUM
is tested but never set, so it looks like dead code to me.
Shall I do something about it ? If so, what do you recommend
- simply remove the dead code
#ifdef
out the dead code (we might need it later)- add some
OPAL_UNLIKELY()
around theflags & CONVERTOR_WITH_CHECKSUM
tests
subroutine pingpong_usempi(buf, rank)
use mpi
implicit none
integer :: buf(0:1048577)
integer, intent(in) :: rank
double precision :: t0, t1
integer i, ierr
if (rank.eq.0) then
do i=0,1048577
buf(i) = i
enddo
call mpi_send(buf(1:1048576:2), 524288, MPI_INTEGER, 1, 0, MPI_COMM_WORLD, ierr)
call mpi_recv(buf(1:1048576:2), 524288, MPI_INTEGER, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
t0 = MPI_Wtime()
do i=1,1024
call mpi_send(buf(1:1048576:2), 524288, MPI_INTEGER, 1, 0, MPI_COMM_WORLD, ierr)
call mpi_recv(buf(1:1048576:2), 524288, MPI_INTEGER, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
end do
t1 = MPI_Wtime()
write (*,*) 'BW fortran = ', 1/(t1-t0), ' GW/s'
elseif (rank.eq.1) then
buf = -1
call mpi_recv(buf(1:1048576:2), 524288, MPI_INTEGER, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
call mpi_send(buf(1:1048576:2), 524288, MPI_INTEGER, 0, 0, MPI_COMM_WORLD, ierr)
do i=1,1024
call mpi_recv(buf(1:1048576:2), 524288, MPI_INTEGER, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
call mpi_send(buf(1:1048576:2), 524288, MPI_INTEGER, 0, 0, MPI_COMM_WORLD, ierr)
end do
do i=0,1048577,2
if (buf(i).ne.-1) write (*,*) 'buf(', i, ') = ', i, ' != -1'
enddo
do i=1,1048576,2
if (buf(i).ne.i) write (*,*) 'buf(', i, ') = ', i, ' != ', i
enddo
endif
end subroutine
subroutine pingpong_ddt(buf, rank)
use mpi
implicit none
integer :: buf(0:1048577)
integer, intent(in) :: rank
double precision :: t0, t1
integer :: datatype
integer i, ierr
call mpi_type_vector(524288, 1, 2, MPI_INT, datatype, ierr)
call mpi_type_commit(datatype, ierr)
if (rank.eq.0) then
do i=0,1048577
buf(i) = i
enddo
call mpi_send(buf(1), 1, datatype, 1, 0, MPI_COMM_WORLD, ierr)
call mpi_recv(buf(1), 1, datatype, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
t0 = MPI_Wtime()
do i=1,1024
call mpi_send(buf(1), 1, datatype, 1, 0, MPI_COMM_WORLD, ierr)
call mpi_recv(buf(1), 1, datatype, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
end do
t1 = MPI_Wtime()
write (*,*) 'BW ddt = ', 1/(t1-t0), ' GW/s'
elseif (rank.eq.1) then
buf = -1
call mpi_recv(buf(1), 1, datatype, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
call mpi_send(buf(1), 1, datatype, 0, 0, MPI_COMM_WORLD, ierr)
do i=1,1024
call mpi_recv(buf(1), 1, datatype, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
call mpi_send(buf(1), 1, datatype, 0, 0, MPI_COMM_WORLD, ierr)
end do
do i=0,1048577,2
if (buf(i).ne.-1) write (*,*) 'buf(', i, ') = ', i, ' != -1'
enddo
do i=1,1048576,2
if (buf(i).ne.i) write (*,*) 'buf(', i, ') = ', i, ' != ', i
enddo
endif
end subroutine
program pingpong
use mpi
implicit none
integer :: buf(0:1048577)
integer :: datatype, ierr
integer type_size
integer rank
call mpi_init(ierr)
call mpi_comm_rank(mpi_comm_world, rank, ierr)
call pingpong_usempi(buf, rank)
call mpi_barrier(MPI_COMM_WORLD, ierr)
call pingpong_ddt(buf, rank)
call mpi_barrier(MPI_COMM_WORLD, ierr)
call mpi_finalize(ierr)
end program
diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c
index ce889f7..22bd0c9 100644
--- a/opal/datatype/opal_convertor.c
+++ b/opal/datatype/opal_convertor.c
@@ -595,7 +595,11 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor,
if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
convertor->fAdvance = opal_unpack_homogeneous_contig;
} else {
- convertor->fAdvance = opal_generic_simple_unpack;
+ if(1 == convertor->use_desc->used && OPAL_DATATYPE_INT4 == convertor->use_desc->desc[0].elem.common.type) {
+ convertor->fAdvance = opal_generic_INT4_unpack;
+ } else {
+ convertor->fAdvance = opal_generic_simple_unpack;
+ }
}
}
}
@@ -642,7 +646,11 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor,
else
convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps;
} else {
- convertor->fAdvance = opal_generic_simple_pack;
+ if(1 == convertor->use_desc->used && OPAL_DATATYPE_INT4 == convertor->use_desc->desc[0].elem.common.type) {
+ convertor->fAdvance = opal_generic_INT4_pack;
+ } else {
+ convertor->fAdvance = opal_generic_simple_pack;
+ }
}
}
}
diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c
index 55889fc..4ad07de 100644
--- a/opal/datatype/opal_datatype_pack.c
+++ b/opal/datatype/opal_datatype_pack.c
@@ -396,6 +396,113 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor,
return 0;
}
+
+#if !defined(CHECKSUM)
+int32_t
+opal_generic_INT4_pack( opal_convertor_t* pConvertor,
+ struct iovec* iov, uint32_t* out_size,
+ size_t* max_data )
+{
+ dt_stack_t* pStack; /* pointer to the position on the stack */
+ uint32_t pos_desc; /* actual position in the description of the derived datatype */
+ size_t count_desc; /* the number of items already done in the actual pos_desc */
+ size_t total_packed = 0; /* total amount packed this time */
+ dt_elem_desc_t* description;
+ dt_elem_desc_t* pElem;
+ const opal_datatype_t *pData = pConvertor->pDesc;
+ unsigned char *conv_ptr, *iov_ptr;
+ size_t iov_len_local;
+ uint32_t iov_count;
+
+ DO_DEBUG( opal_output( 0, "opal_convertor_generic_simple_pack( %p:%p, {%p, %lu}, %d )\n",
+ (void*)pConvertor, (void*)pConvertor->pBaseBuf,
+ (void*)iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); );
+
+ description = pConvertor->use_desc->desc;
+
+ /* For the first step we have to add both displacement to the source. After in the
+ * main while loop we will set back the conv_ptr to the correct value. This is
+ * due to the fact that the convertor can stop in the middle of a data with a count
+ */
+ pStack = pConvertor->pStack + pConvertor->stack_pos;
+ pos_desc = pStack->index;
+ conv_ptr = pConvertor->pBaseBuf + pStack->disp;
+ count_desc = pStack->count;
+ pStack--;
+ pConvertor->stack_pos--;
+ pElem = &(description[pos_desc]);
+
+ DO_DEBUG( opal_output( 0, "pack start pos_desc %d count_desc %" PRIsize_t " disp %ld\n"
+ "stack_pos %d pos_desc %d count_desc %" PRIsize_t " disp %ld\n",
+ pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf),
+ pConvertor->stack_pos, pStack->index, pStack->count, pStack->disp ); );
+
+ for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
+ iov_ptr = (unsigned char *) iov[iov_count].iov_base;
+ iov_len_local = iov[iov_count].iov_len;
+ while( 1 ) {
+ while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
+ /* now here we have a basic datatype */
+ PACK_INT4 ( pConvertor, pElem, count_desc,
+ conv_ptr, iov_ptr, iov_len_local );
+ if( 0 == count_desc ) { /* completed */
+ conv_ptr = pConvertor->pBaseBuf + pStack->disp;
+ pos_desc++; /* advance to the next data */
+ UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
+ continue;
+ }
+ goto complete_loop;
+ }
+ assert(OPAL_DATATYPE_END_LOOP == pElem->elem.common.type);
+ DO_DEBUG( opal_output( 0, "pack end_loop count %" PRIsize_t " stack_pos %d"
+ " pos_desc %d disp %ld space %lu\n",
+ pStack->count, pConvertor->stack_pos,
+ pos_desc, pStack->disp, (unsigned long)iov_len_local ); );
+ if( --(pStack->count) == 0 ) { /* end of loop */
+ if( 0 == pConvertor->stack_pos ) {
+ /* we're done. Force the exit of the main for loop (around iovec) */
+ *out_size = iov_count;
+ goto complete_loop;
+ }
+ pConvertor->stack_pos--; /* go one position up on the stack */
+ pStack--;
+ pos_desc++; /* and move to the next element */
+ } else {
+ pos_desc = pStack->index + 1; /* jump back to the begining of the loop */
+ if( pStack->index == -1 ) { /* If it's the datatype count loop */
+ pStack->disp += (pData->ub - pData->lb); /* jump by the datatype extent */
+ } else {
+ assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type );
+ pStack->disp += description[pStack->index].loop.extent; /* jump by the loop extent */
+ }
+ }
+ conv_ptr = pConvertor->pBaseBuf + pStack->disp;
+ UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
+ DO_DEBUG( opal_output( 0, "pack new_loop count %" PRIsize_t " stack_pos %d pos_desc %d count_desc %" PRIsize_t " disp %ld space %lu\n",
+ pStack->count, pConvertor->stack_pos, pos_desc,
+ count_desc, pStack->disp, (unsigned long)iov_len_local ); );
+ assert(OPAL_DATATYPE_LOOP != pElem->elem.common.type);
+ }
+ complete_loop:
+ iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */
+ total_packed += iov[iov_count].iov_len;
+ }
+ *max_data = total_packed;
+ pConvertor->bConverted += total_packed; /* update the already converted bytes */
+ *out_size = iov_count;
+ if( pConvertor->bConverted == pConvertor->local_size ) {
+ pConvertor->flags |= CONVERTOR_COMPLETED;
+ return 1;
+ }
+ /* Save the global position for the next round */
+ PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc,
+ conv_ptr - pConvertor->pBaseBuf );
+ DO_DEBUG( opal_output( 0, "pack save stack stack_pos %d pos_desc %d count_desc %" PRIsize_t " disp %ld\n",
+ pConvertor->stack_pos, pStack->index, pStack->count, pStack->disp ); );
+ return 0;
+}
+#endif
+
/*
* Remember that the first item in the stack (ie. position 0) is the number
* of times the datatype is involved in the operation (ie. the count argument
diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h
index f952cab..27744ab 100644
--- a/opal/datatype/opal_datatype_pack.h
+++ b/opal/datatype/opal_datatype_pack.h
@@ -73,6 +73,40 @@ static inline void pack_predefined_data( opal_convertor_t* CONVERTOR,
*(COUNT) -= _copy_count;
}
+static inline void pack_INT4( opal_convertor_t* CONVERTOR,
+ const dt_elem_desc_t* ELEM,
+ size_t* COUNT,
+ unsigned char** SOURCE,
+ unsigned char** DESTINATION,
+ size_t* SPACE )
+{
+ size_t _copy_count = *(COUNT);
+ size_t _copy_blength;
+ const ddt_elem_desc_t* _elem = &((ELEM)->elem);
+ unsigned char* _source = (*SOURCE) + _elem->disp;
+
+ _copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size;
+ assert((ptrdiff_t) _copy_blength != _elem->extent);
+ assert(_copy_blength == sizeof(int));
+ if( (_copy_count * _copy_blength) > *(SPACE) ) {
+ _copy_count = (*(SPACE) / _copy_blength);
+ if( 0 == _copy_count ) return; /* nothing to do */
+ }
+
+ int *dest = (int *)(*DESTINATION);
+ int* _s = (int *)_source;
+ for(size_t _i = 0; _i < _copy_count; _i++) {
+ *dest++ = *_s;
+ _s += _elem->extent / sizeof(int);
+ }
+ _source = (unsigned char*)_s;
+ *(DESTINATION) = (unsigned char *)dest;
+ _copy_blength *= _copy_count;
+ *(SOURCE) = _source - _elem->disp;
+ *(SPACE) -= _copy_blength;
+ *(COUNT) -= _copy_count;
+}
+
static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR,
const dt_elem_desc_t* ELEM,
size_t* COUNT,
@@ -109,6 +143,14 @@ static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR,
SPACE ) /* the space in the destination buffer */ \
pack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) )
+#define PACK_INT4( CONVERTOR, /* the convertor */ \
+ ELEM, /* the basic element to be packed */ \
+ COUNT, /* the number of elements */ \
+ SOURCE, /* the source pointer (char*) */ \
+ DESTINATION, /* the destination pointer (char*) */ \
+ SPACE ) /* the space in the destination buffer */ \
+pack_INT4( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) )
+
#define PACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, SOURCE, DESTINATION, SPACE ) \
pack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) )
diff --git a/opal/datatype/opal_datatype_prototypes.h b/opal/datatype/opal_datatype_prototypes.h
index 6683971..cd3ce67 100644
--- a/opal/datatype/opal_datatype_prototypes.h
+++ b/opal/datatype/opal_datatype_prototypes.h
@@ -68,6 +68,10 @@ opal_generic_simple_pack_checksum( opal_convertor_t* pConvertor,
struct iovec* iov, uint32_t* out_size,
size_t* max_data );
int32_t
+opal_generic_INT4_pack( opal_convertor_t* pConvertor,
+ struct iovec* iov, uint32_t* out_size,
+ size_t* max_data );
+int32_t
opal_unpack_homogeneous_contig( opal_convertor_t* pConv,
struct iovec* iov, uint32_t* out_size,
size_t* max_data );
@@ -83,6 +87,10 @@ int32_t
opal_generic_simple_unpack_checksum( opal_convertor_t* pConvertor,
struct iovec* iov, uint32_t* out_size,
size_t* max_data );
+int32_t
+opal_generic_INT4_unpack( opal_convertor_t* pConvertor,
+ struct iovec* iov, uint32_t* out_size,
+ size_t* max_data );
END_C_DECLS
diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c
index 3edb916..02b6bb3 100644
--- a/opal/datatype/opal_datatype_unpack.c
+++ b/opal/datatype/opal_datatype_unpack.c
@@ -419,6 +419,172 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
return 0;
}
+#if !defined(CHECKSUM)
+int32_t
+opal_generic_INT4_unpack( opal_convertor_t* pConvertor,
+ struct iovec* iov, uint32_t* out_size,
+ size_t* max_data )
+{
+ dt_stack_t* pStack; /* pointer to the position on the stack */
+ uint32_t pos_desc; /* actual position in the description of the derived datatype */
+ size_t count_desc; /* the number of items already done in the actual pos_desc */
+ size_t total_unpacked = 0; /* total size unpacked this time */
+ dt_elem_desc_t* description;
+ dt_elem_desc_t* pElem;
+ const opal_datatype_t *pData = pConvertor->pDesc;
+ unsigned char *conv_ptr, *iov_ptr;
+ size_t iov_len_local;
+ uint32_t iov_count;
+
+ DO_DEBUG( opal_output( 0, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u )\n",
+ (void*)pConvertor, (void*)iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); );
+
+ description = pConvertor->use_desc->desc;
+
+ /* For the first step we have to add both displacement to the source. After in the
+ * main while loop we will set back the source_base to the correct value. This is
+ * due to the fact that the convertor can stop in the middle of a data with a count
+ */
+ pStack = pConvertor->pStack + pConvertor->stack_pos;
+ pos_desc = pStack->index;
+ conv_ptr = pConvertor->pBaseBuf + pStack->disp;
+ count_desc = pStack->count;
+ pStack--;
+ pConvertor->stack_pos--;
+ pElem = &(description[pos_desc]);
+
+ DO_DEBUG( opal_output( 0, "unpack start pos_desc %d count_desc %" PRIsize_t " disp %ld\n"
+ "stack_pos %d pos_desc %d count_desc %" PRIsize_t " disp %ld\n",
+ pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf),
+ pConvertor->stack_pos, pStack->index, pStack->count, (long)(pStack->disp) ); );
+
+ for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
+ iov_ptr = (unsigned char *) iov[iov_count].iov_base;
+ iov_len_local = iov[iov_count].iov_len;
+ if( 0 != pConvertor->partial_length ) {
+ size_t element_length = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;
+ size_t missing_length = element_length - pConvertor->partial_length;
+
+ assert( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA );
+ COMPUTE_CSUM( iov_ptr, missing_length, pConvertor );
+ opal_unpack_partial_datatype( pConvertor, pElem,
+ iov_ptr,
+ pConvertor->partial_length, element_length - pConvertor->partial_length,
+ &conv_ptr );
+ --count_desc;
+ if( 0 == count_desc ) {
+ conv_ptr = pConvertor->pBaseBuf + pStack->disp;
+ pos_desc++; /* advance to the next data */
+ UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
+ }
+ iov_ptr += missing_length;
+ iov_len_local -= missing_length;
+ pConvertor->partial_length = 0; /* nothing more inside */
+ }
+ while( 1 ) {
+ while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
+ /* now here we have a basic datatype */
+ UNPACK_INT4( pConvertor, pElem, count_desc,
+ iov_ptr, conv_ptr, iov_len_local );
+ if( 0 == count_desc ) { /* completed */
+ conv_ptr = pConvertor->pBaseBuf + pStack->disp;
+ pos_desc++; /* advance to the next data */
+ UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
+ continue;
+ }
+ assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED );
+ if( 0 != iov_len_local ) {
+ unsigned char* temp = conv_ptr;
+ /* We have some partial data here. Let's copy it into the convertor
+ * and keep it hot until the next round.
+ */
+ assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size );
+ COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor );
+
+ opal_unpack_partial_datatype( pConvertor, pElem,
+ iov_ptr, 0, iov_len_local,
+ &temp );
+
+ pConvertor->partial_length = iov_len_local;
+ iov_len_local = 0;
+ }
+ goto complete_loop;
+ }
+ assert( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type );
+ if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */
+ DO_DEBUG( opal_output( 0, "unpack end_loop count %" PRIsize_t " stack_pos %d pos_desc %d disp %ld space %lu\n",
+ pStack->count, pConvertor->stack_pos, pos_desc,
+ pStack->disp, (unsigned long)iov_len_local ); );
+ if( --(pStack->count) == 0 ) { /* end of loop */
+ if( 0 == pConvertor->stack_pos ) {
+ /* Do the same thing as when the loop is completed */
+ iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */
+ total_unpacked += iov[iov_count].iov_len;
+ iov_count++; /* go to the next */
+ goto complete_conversion;
+ }
+ pConvertor->stack_pos--;
+ pStack--;
+ pos_desc++;
+ } else {
+ pos_desc = pStack->index + 1;
+ if( pStack->index == -1 ) {
+ pStack->disp += (pData->ub - pData->lb);
+ } else {
+ assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type );
+ pStack->disp += description[pStack->index].loop.extent;
+ }
+ }
+ conv_ptr = pConvertor->pBaseBuf + pStack->disp;
+ UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
+ DO_DEBUG( opal_output( 0, "unpack new_loop count %" PRIsize_t " stack_pos %d pos_desc %d disp %ld space %lu\n",
+ pStack->count, pConvertor->stack_pos, pos_desc,
+ pStack->disp, (unsigned long)iov_len_local ); );
+ }
+ assert( OPAL_DATATYPE_LOOP != pElem->elem.common.type );
+ if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) {
+ ptrdiff_t local_disp = (ptrdiff_t)conv_ptr;
+ if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
+ UNPACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc,
+ iov_ptr, conv_ptr, iov_len_local );
+ if( 0 == count_desc ) { /* completed */
+ pos_desc += pElem->loop.items + 1;
+ goto update_loop_description;
+ }
+ /* Save the stack with the correct last_count value. */
+ }
+ local_disp = (ptrdiff_t)conv_ptr - local_disp;
+ PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc,
+ pStack->disp + local_disp);
+ pos_desc++;
+ update_loop_description: /* update the current state */
+ conv_ptr = pConvertor->pBaseBuf + pStack->disp;
+ UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
+ DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" );
+ continue;
+ }
+ }
+ complete_loop:
+ iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */
+ total_unpacked += iov[iov_count].iov_len;
+ }
+ complete_conversion:
+ *max_data = total_unpacked;
+ pConvertor->bConverted += total_unpacked; /* update the already converted bytes */
+ *out_size = iov_count;
+ if( pConvertor->bConverted == pConvertor->remote_size ) {
+ pConvertor->flags |= CONVERTOR_COMPLETED;
+ return 1;
+ }
+ /* Save the global position for the next round */
+ PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc,
+ conv_ptr - pConvertor->pBaseBuf );
+ DO_DEBUG( opal_output( 0, "unpack save stack stack_pos %d pos_desc %d count_desc %" PRIsize_t " disp %ld\n",
+ pConvertor->stack_pos, pStack->index, pStack->count, (long)pStack->disp ); );
+ return 0;
+}
+#endif
+
/*
* Remember that the first item in the stack (ie. position 0) is the number
* of times the datatype is involved in the operation (ie. the count argument
diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h
index d837aad..501edc5 100644
--- a/opal/datatype/opal_datatype_unpack.h
+++ b/opal/datatype/opal_datatype_unpack.h
@@ -72,6 +72,40 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR, /* the convertor */
*(COUNT) -= _copy_count;
}
+static inline void
+unpack_INT4( opal_convertor_t* CONVERTOR, /* the convertor */
+ const dt_elem_desc_t* ELEM, /* the element description */
+ size_t* COUNT, /* the number of elements */
+ unsigned char** SOURCE, /* the source pointer */
+ unsigned char** DESTINATION, /* the destination pointer */
+ size_t* SPACE ) /* the space in the destination buffer */
+{
+ size_t _copy_count = *(COUNT);
+ size_t _copy_blength;
+ const ddt_elem_desc_t* _elem = &((ELEM)->elem);
+ unsigned char* _destination = (*DESTINATION) + _elem->disp;
+
+ _copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size;
+ assert((ptrdiff_t) _copy_blength != _elem->extent);
+ if( (_copy_count * _copy_blength) > *(SPACE) ) {
+ _copy_count = (*(SPACE) / _copy_blength);
+ if( 0 == _copy_count ) return; /* nothing to do */
+ }
+
+ int *_d = (int *)_destination;
+ int *_s = (int *)(*(SOURCE));
+ for(size_t _i = 0; _i < _copy_count; _i++ ) {
+ *_d = *_s++;
+ _d += _elem->extent / sizeof(int);
+ }
+ _destination = (unsigned char *)_d;
+ *(SOURCE) = (unsigned char *)_s;
+_copy_blength *= _copy_count;
+ (*DESTINATION) = _destination - _elem->disp;
+ *(SPACE) -= _copy_blength;
+ *(COUNT) -= _copy_count;
+}
+
static inline void unpack_contiguous_loop( opal_convertor_t* CONVERTOR,
const dt_elem_desc_t* ELEM,
size_t* COUNT,
@@ -103,6 +137,9 @@ static inline void unpack_contiguous_loop( opal_convertor_t* CONVERTOR,
#define UNPACK_PREDEFINED_DATATYPE( CONVERTOR, ELEM, COUNT, SOURCE, DESTINATION, SPACE ) \
unpack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) )
+#define UNPACK_INT4( CONVERTOR, ELEM, COUNT, SOURCE, DESTINATION, SPACE ) \
+ unpack_INT4( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) )
+
#define UNPACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, SOURCE, DESTINATION, SPACE ) \
unpack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) )