convertor pack/unpack performance with strided datatypes

While implementing the `cdesc` stuff for the `use mpi_f08` bindings, I noted some surprising performance numbers.
This can be evidenced with `use mpi` (e.g. no need for a modern Fortran 2008 compiler) with the code below

TL;DR do a pingpong with the `buf(1:1048576:2)` subarray, and compare standard Fortran (it will automagically perform a copy under the hood) and the (manual) use of ddt.

```
$ mpirun -np 2 --mca pml ob1 --mca btl tcp,self ./pp
 BW fortran =   0.49293196172865800       GW/s
 BW ddt =   0.24797224656726258       GW/s
```

to my surprise, copying boosts the performance by a factor 2 !

I digged this, and found that the bottleneck comes from `opal_generic_simple_[un]pack_function()`, when `[UN]PACK_PREDEFINED_DATATYPE()` calls `MEMCPY_CSUM()` with `size=4`
If I replace `memcpy(dest, src, 4)` with `*(int *)dest = *(int *)src`, then I get much better performances (more than a 4x improvement).

```
 BW fortran =   0.47470623052957683       GW/s
 BW ddt =    1.1068177049734438       GW/s
```

The inline patch below was used. Note that even if it is big and only support `OPAL_INT4`, macros could be used to support the other predefined datatypes. it could also be improved to support more datatypes (for example `MPI_Type_vector(..., 2, 4, MPI_INT, ...)`)

Please let me know if and how I should move forward.

Incidentally, I noted `CONVERTOR_WITH_CHECKSUM` is tested but never set, so it looks like dead code to me.
Shall I do something about it ? If so, what do you recommend 
 - simply remove the dead code
 - `#ifdef` out the dead code (we might need it later)
 - add some `OPAL_UNLIKELY()` around the `flags & CONVERTOR_WITH_CHECKSUM` tests 

```fortran
subroutine pingpong_usempi(buf, rank)
use mpi
implicit none
integer :: buf(0:1048577)
integer, intent(in) :: rank
double precision :: t0, t1
integer i, ierr

if (rank.eq.0) then
  do i=0,1048577
    buf(i) = i
  enddo
  call mpi_send(buf(1:1048576:2), 524288, MPI_INTEGER, 1, 0, MPI_COMM_WORLD, ierr)
  call mpi_recv(buf(1:1048576:2), 524288, MPI_INTEGER, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
  t0 = MPI_Wtime()
  do i=1,1024
    call mpi_send(buf(1:1048576:2), 524288, MPI_INTEGER, 1, 0, MPI_COMM_WORLD, ierr)
    call mpi_recv(buf(1:1048576:2), 524288, MPI_INTEGER, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
  end do
  t1 = MPI_Wtime()
  write (*,*) 'BW fortran = ', 1/(t1-t0), ' GW/s'
elseif (rank.eq.1) then
  buf = -1
  call mpi_recv(buf(1:1048576:2), 524288, MPI_INTEGER, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
  call mpi_send(buf(1:1048576:2), 524288, MPI_INTEGER, 0, 0, MPI_COMM_WORLD, ierr)
  do i=1,1024
    call mpi_recv(buf(1:1048576:2), 524288, MPI_INTEGER, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
    call mpi_send(buf(1:1048576:2), 524288, MPI_INTEGER, 0, 0, MPI_COMM_WORLD, ierr)
  end do
  do i=0,1048577,2
    if (buf(i).ne.-1) write (*,*) 'buf(', i, ') = ', i, ' != -1'
  enddo
  do i=1,1048576,2
    if (buf(i).ne.i) write (*,*) 'buf(', i, ') = ', i, ' != ', i
  enddo
endif
end subroutine

subroutine pingpong_ddt(buf, rank)
use mpi
implicit none
integer :: buf(0:1048577)
integer, intent(in) :: rank
double precision :: t0, t1
integer :: datatype
integer i, ierr

call mpi_type_vector(524288, 1, 2, MPI_INT, datatype, ierr)
call mpi_type_commit(datatype, ierr)

if (rank.eq.0) then
  do i=0,1048577
    buf(i) = i
  enddo
  call mpi_send(buf(1), 1, datatype, 1, 0, MPI_COMM_WORLD, ierr)
  call mpi_recv(buf(1), 1, datatype, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
  t0 = MPI_Wtime()
  do i=1,1024
    call mpi_send(buf(1), 1, datatype, 1, 0, MPI_COMM_WORLD, ierr)
    call mpi_recv(buf(1), 1, datatype, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
  end do
  t1 = MPI_Wtime()
  write (*,*) 'BW ddt = ', 1/(t1-t0), ' GW/s'
elseif (rank.eq.1) then
  buf = -1
  call mpi_recv(buf(1), 1, datatype, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
  call mpi_send(buf(1), 1, datatype, 0, 0, MPI_COMM_WORLD, ierr)
  do i=1,1024
    call mpi_recv(buf(1), 1, datatype, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
    call mpi_send(buf(1), 1, datatype, 0, 0, MPI_COMM_WORLD, ierr)
  end do
  do i=0,1048577,2
    if (buf(i).ne.-1) write (*,*) 'buf(', i, ') = ', i, ' != -1'
  enddo
  do i=1,1048576,2
    if (buf(i).ne.i) write (*,*) 'buf(', i, ') = ', i, ' != ', i
  enddo
endif
end subroutine

program pingpong
use mpi
implicit none

integer :: buf(0:1048577)
integer :: datatype, ierr
integer type_size
integer rank

call mpi_init(ierr)
call mpi_comm_rank(mpi_comm_world, rank, ierr)

call pingpong_usempi(buf, rank)
call mpi_barrier(MPI_COMM_WORLD, ierr)

call pingpong_ddt(buf, rank)
call mpi_barrier(MPI_COMM_WORLD, ierr)

call mpi_finalize(ierr)
end program
```

```patch
diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c
index ce889f7..22bd0c9 100644
--- a/opal/datatype/opal_convertor.c
+++ b/opal/datatype/opal_convertor.c
@@ -595,7 +595,11 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor,
             if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
                 convertor->fAdvance = opal_unpack_homogeneous_contig;
             } else {
-                convertor->fAdvance = opal_generic_simple_unpack;
+                if(1 == convertor->use_desc->used && OPAL_DATATYPE_INT4 == convertor->use_desc->desc[0].elem.common.type) {
+                    convertor->fAdvance = opal_generic_INT4_unpack;
+                } else {
+                    convertor->fAdvance = opal_generic_simple_unpack;
+                }
             }
         }
     }
@@ -642,7 +646,11 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor,
                 else
                     convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps;
             } else {
-                convertor->fAdvance = opal_generic_simple_pack;
+                if(1 == convertor->use_desc->used && OPAL_DATATYPE_INT4 == convertor->use_desc->desc[0].elem.common.type) {
+                    convertor->fAdvance = opal_generic_INT4_pack;
+                } else {
+                    convertor->fAdvance = opal_generic_simple_pack;
+                }
             }
         }
     }
diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c
index 55889fc..4ad07de 100644
--- a/opal/datatype/opal_datatype_pack.c
+++ b/opal/datatype/opal_datatype_pack.c
@@ -396,6 +396,113 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor,
     return 0;
 }
 
+
+#if !defined(CHECKSUM)
+int32_t
+opal_generic_INT4_pack( opal_convertor_t* pConvertor,
+                        struct iovec* iov, uint32_t* out_size,
+                        size_t* max_data )
+{
+    dt_stack_t* pStack;       /* pointer to the position on the stack */
+    uint32_t pos_desc;        /* actual position in the description of the derived datatype */
+    size_t count_desc;        /* the number of items already done in the actual pos_desc */
+    size_t total_packed = 0;  /* total amount packed this time */
+    dt_elem_desc_t* description;
+    dt_elem_desc_t* pElem;
+    const opal_datatype_t *pData = pConvertor->pDesc;
+    unsigned char *conv_ptr, *iov_ptr;
+    size_t iov_len_local;
+    uint32_t iov_count;
+
+    DO_DEBUG( opal_output( 0, "opal_convertor_generic_simple_pack( %p:%p, {%p, %lu}, %d )\n",
+                           (void*)pConvertor, (void*)pConvertor->pBaseBuf,
+                           (void*)iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); );
+
+    description = pConvertor->use_desc->desc;
+
+    /* For the first step we have to add both displacement to the source. After in the
+     * main while loop we will set back the conv_ptr to the correct value. This is
+     * due to the fact that the convertor can stop in the middle of a data with a count
+     */
+    pStack = pConvertor->pStack + pConvertor->stack_pos;
+    pos_desc   = pStack->index;
+    conv_ptr   = pConvertor->pBaseBuf + pStack->disp;
+    count_desc = pStack->count;
+    pStack--;
+    pConvertor->stack_pos--;
+    pElem = &(description[pos_desc]);
+
+    DO_DEBUG( opal_output( 0, "pack start pos_desc %d count_desc %" PRIsize_t " disp %ld\n"
+                           "stack_pos %d pos_desc %d count_desc %" PRIsize_t " disp %ld\n",
+                           pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf),
+                           pConvertor->stack_pos, pStack->index, pStack->count, pStack->disp ); );
+
+    for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
+        iov_ptr = (unsigned char *) iov[iov_count].iov_base;
+        iov_len_local = iov[iov_count].iov_len;
+        while( 1 ) {
+            while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
+                /* now here we have a basic datatype */
+                PACK_INT4 ( pConvertor, pElem, count_desc,
+                            conv_ptr, iov_ptr, iov_len_local );
+                if( 0 == count_desc ) {  /* completed */
+                    conv_ptr = pConvertor->pBaseBuf + pStack->disp;
+                    pos_desc++;  /* advance to the next data */
+                    UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
+                    continue;
+                }
+                goto complete_loop;
+            }
+            assert(OPAL_DATATYPE_END_LOOP == pElem->elem.common.type);
+            DO_DEBUG( opal_output( 0, "pack end_loop count %" PRIsize_t " stack_pos %d"
+                                   " pos_desc %d disp %ld space %lu\n",
+                                   pStack->count, pConvertor->stack_pos,
+                                   pos_desc, pStack->disp, (unsigned long)iov_len_local ); );
+            if( --(pStack->count) == 0 ) { /* end of loop */
+                if( 0 == pConvertor->stack_pos ) {
+                    /* we're done. Force the exit of the main for loop (around iovec) */
+                    *out_size = iov_count;
+                    goto complete_loop;
+                }
+                pConvertor->stack_pos--;  /* go one position up on the stack */
+                pStack--;
+                pos_desc++;  /* and move to the next element */
+            } else {
+                pos_desc = pStack->index + 1;  /* jump back to the begining of the loop */
+                if( pStack->index == -1 ) {  /* If it's the datatype count loop */
+                    pStack->disp += (pData->ub - pData->lb);  /* jump by the datatype extent */
+                } else {
+                    assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type );
+                    pStack->disp += description[pStack->index].loop.extent;  /* jump by the loop extent */
+                }
+            }
+            conv_ptr = pConvertor->pBaseBuf + pStack->disp;
+            UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
+            DO_DEBUG( opal_output( 0, "pack new_loop count %" PRIsize_t " stack_pos %d pos_desc %d count_desc %" PRIsize_t " disp %ld space %lu\n",
+                                   pStack->count, pConvertor->stack_pos, pos_desc,
+                                   count_desc, pStack->disp, (unsigned long)iov_len_local ); );
+            assert(OPAL_DATATYPE_LOOP != pElem->elem.common.type);
+        }
+    complete_loop:
+        iov[iov_count].iov_len -= iov_len_local;  /* update the amount of valid data */
+        total_packed += iov[iov_count].iov_len;
+    }
+    *max_data = total_packed;
+    pConvertor->bConverted += total_packed;  /* update the already converted bytes */
+    *out_size = iov_count;
+    if( pConvertor->bConverted == pConvertor->local_size ) {
+        pConvertor->flags |= CONVERTOR_COMPLETED;
+        return 1;
+    }
+    /* Save the global position for the next round */
+    PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc,
+                conv_ptr - pConvertor->pBaseBuf );
+    DO_DEBUG( opal_output( 0, "pack save stack stack_pos %d pos_desc %d count_desc %" PRIsize_t " disp %ld\n",
+                           pConvertor->stack_pos, pStack->index, pStack->count, pStack->disp ); );
+    return 0;
+}
+#endif
+
 /*
  * Remember that the first item in the stack (ie. position 0) is the number
  * of times the datatype is involved in the operation (ie. the count argument
diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h
index f952cab..27744ab 100644
--- a/opal/datatype/opal_datatype_pack.h
+++ b/opal/datatype/opal_datatype_pack.h
@@ -73,6 +73,40 @@ static inline void pack_predefined_data( opal_convertor_t* CONVERTOR,
     *(COUNT)  -= _copy_count;
 }
 
+static inline void pack_INT4( opal_convertor_t* CONVERTOR,
+                                   const dt_elem_desc_t* ELEM,
+                                   size_t* COUNT,
+                                   unsigned char** SOURCE,
+                                   unsigned char** DESTINATION,
+                                   size_t* SPACE )
+{
+    size_t _copy_count = *(COUNT);
+    size_t _copy_blength;
+    const ddt_elem_desc_t* _elem = &((ELEM)->elem);
+    unsigned char* _source = (*SOURCE) + _elem->disp;
+
+    _copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size;
+    assert((ptrdiff_t) _copy_blength != _elem->extent);
+    assert(_copy_blength == sizeof(int));
+    if( (_copy_count * _copy_blength) > *(SPACE) ) {
+        _copy_count = (*(SPACE) / _copy_blength);
+        if( 0 == _copy_count ) return;  /* nothing to do */
+    }
+
+    int *dest = (int *)(*DESTINATION);
+    int* _s = (int *)_source;
+    for(size_t _i = 0; _i < _copy_count; _i++) {
+        *dest++ = *_s;
+        _s += _elem->extent / sizeof(int);
+    }
+    _source = (unsigned char*)_s;
+    *(DESTINATION) = (unsigned char *)dest;
+    _copy_blength *= _copy_count;
+    *(SOURCE)  = _source - _elem->disp;
+    *(SPACE)  -= _copy_blength;
+    *(COUNT)  -= _copy_count;
+}
+
 static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR,
                                          const dt_elem_desc_t* ELEM,
                                          size_t* COUNT,
@@ -109,6 +143,14 @@ static inline void pack_contiguous_loop( opal_convertor_t* CONVERTOR,
                                   SPACE )       /* the space in the destination buffer */ \
 pack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) )
 
+#define PACK_INT4( CONVERTOR,    /* the convertor */                       \
+                   ELEM,         /* the basic element to be packed */      \
+                   COUNT,        /* the number of elements */              \
+                   SOURCE,       /* the source pointer (char*) */          \
+                   DESTINATION,  /* the destination pointer (char*) */     \
+                   SPACE )       /* the space in the destination buffer */ \
+pack_INT4( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) )
+
 #define PACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, SOURCE, DESTINATION, SPACE ) \
     pack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) )
 
diff --git a/opal/datatype/opal_datatype_prototypes.h b/opal/datatype/opal_datatype_prototypes.h
index 6683971..cd3ce67 100644
--- a/opal/datatype/opal_datatype_prototypes.h
+++ b/opal/datatype/opal_datatype_prototypes.h
@@ -68,6 +68,10 @@ opal_generic_simple_pack_checksum( opal_convertor_t* pConvertor,
                                    struct iovec* iov, uint32_t* out_size,
                                    size_t* max_data );
 int32_t
+opal_generic_INT4_pack( opal_convertor_t* pConvertor,
+                        struct iovec* iov, uint32_t* out_size,
+                        size_t* max_data );
+int32_t
 opal_unpack_homogeneous_contig( opal_convertor_t* pConv,
                                 struct iovec* iov, uint32_t* out_size,
                                 size_t* max_data );
@@ -83,6 +87,10 @@ int32_t
 opal_generic_simple_unpack_checksum( opal_convertor_t* pConvertor,
                                      struct iovec* iov, uint32_t* out_size,
                                      size_t* max_data );
+int32_t
+opal_generic_INT4_unpack( opal_convertor_t* pConvertor,
+                          struct iovec* iov, uint32_t* out_size,
+                          size_t* max_data );
 
 END_C_DECLS
 
diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c
index 3edb916..02b6bb3 100644
--- a/opal/datatype/opal_datatype_unpack.c
+++ b/opal/datatype/opal_datatype_unpack.c
@@ -419,6 +419,172 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
     return 0;
 }
 
+#if !defined(CHECKSUM)
+int32_t
+opal_generic_INT4_unpack( opal_convertor_t* pConvertor,
+                          struct iovec* iov, uint32_t* out_size,
+                          size_t* max_data )
+{
+    dt_stack_t* pStack;                /* pointer to the position on the stack */
+    uint32_t pos_desc;                 /* actual position in the description of the derived datatype */
+    size_t count_desc;                 /* the number of items already done in the actual pos_desc */
+    size_t total_unpacked = 0;         /* total size unpacked this time */
+    dt_elem_desc_t* description;
+    dt_elem_desc_t* pElem;
+    const opal_datatype_t *pData = pConvertor->pDesc;
+    unsigned char *conv_ptr, *iov_ptr;
+    size_t iov_len_local;
+    uint32_t iov_count;
+
+    DO_DEBUG( opal_output( 0, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u )\n",
+                           (void*)pConvertor, (void*)iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); );
+
+    description = pConvertor->use_desc->desc;
+
+    /* For the first step we have to add both displacement to the source. After in the
+     * main while loop we will set back the source_base to the correct value. This is
+     * due to the fact that the convertor can stop in the middle of a data with a count
+     */
+    pStack     = pConvertor->pStack + pConvertor->stack_pos;
+    pos_desc   = pStack->index;
+    conv_ptr   = pConvertor->pBaseBuf + pStack->disp;
+    count_desc = pStack->count;
+    pStack--;
+    pConvertor->stack_pos--;
+    pElem = &(description[pos_desc]);
+
+    DO_DEBUG( opal_output( 0, "unpack start pos_desc %d count_desc %" PRIsize_t " disp %ld\n"
+                           "stack_pos %d pos_desc %d count_desc %" PRIsize_t " disp %ld\n",
+                           pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf),
+                           pConvertor->stack_pos, pStack->index, pStack->count, (long)(pStack->disp) ); );
+
+    for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
+        iov_ptr = (unsigned char *) iov[iov_count].iov_base;
+        iov_len_local = iov[iov_count].iov_len;
+        if( 0 != pConvertor->partial_length ) {
+            size_t element_length = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;
+            size_t missing_length = element_length - pConvertor->partial_length;
+
+            assert( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA );
+            COMPUTE_CSUM( iov_ptr, missing_length, pConvertor );
+            opal_unpack_partial_datatype( pConvertor, pElem,
+                                          iov_ptr,
+                                          pConvertor->partial_length, element_length - pConvertor->partial_length,
+                                          &conv_ptr );
+            --count_desc;
+            if( 0 == count_desc ) {
+                conv_ptr = pConvertor->pBaseBuf + pStack->disp;
+                pos_desc++;  /* advance to the next data */
+                UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
+            }
+            iov_ptr       += missing_length;
+            iov_len_local -= missing_length;
+            pConvertor->partial_length = 0;  /* nothing more inside */
+        }
+        while( 1 ) {
+            while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) {
+                /* now here we have a basic datatype */
+                UNPACK_INT4( pConvertor, pElem, count_desc,
+                             iov_ptr, conv_ptr, iov_len_local );
+                if( 0 == count_desc ) {  /* completed */
+                    conv_ptr = pConvertor->pBaseBuf + pStack->disp;
+                    pos_desc++;  /* advance to the next data */
+                    UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
+                    continue;
+                }
+                assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED );
+                if( 0 != iov_len_local ) {
+                    unsigned char* temp = conv_ptr;
+                    /* We have some partial data here. Let's copy it into the convertor
+                     * and keep it hot until the next round.
+                     */
+                    assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size );
+                    COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor );
+
+                    opal_unpack_partial_datatype( pConvertor, pElem,
+                                                  iov_ptr, 0, iov_len_local,
+                                                  &temp );
+
+                    pConvertor->partial_length = iov_len_local;
+                    iov_len_local = 0;
+                }
+                goto complete_loop;
+            }
+            assert( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type );
+            if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */
+                DO_DEBUG( opal_output( 0, "unpack end_loop count %" PRIsize_t " stack_pos %d pos_desc %d disp %ld space %lu\n",
+                                       pStack->count, pConvertor->stack_pos, pos_desc,
+                                       pStack->disp, (unsigned long)iov_len_local ); );
+                if( --(pStack->count) == 0 ) { /* end of loop */
+                    if( 0 == pConvertor->stack_pos ) {
+                        /* Do the same thing as when the loop is completed */
+                        iov[iov_count].iov_len -= iov_len_local;  /* update the amount of valid data */
+                        total_unpacked += iov[iov_count].iov_len;
+                        iov_count++;  /* go to the next */
+                        goto complete_conversion;
+                    }
+                    pConvertor->stack_pos--;
+                    pStack--;
+                    pos_desc++;
+                } else {
+                    pos_desc = pStack->index + 1;
+                    if( pStack->index == -1 ) {
+                        pStack->disp += (pData->ub - pData->lb);
+                    } else {
+                        assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type );
+                        pStack->disp += description[pStack->index].loop.extent;
+                    }
+                }
+                conv_ptr = pConvertor->pBaseBuf + pStack->disp;
+                UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
+                DO_DEBUG( opal_output( 0, "unpack new_loop count %" PRIsize_t " stack_pos %d pos_desc %d disp %ld space %lu\n",
+                                       pStack->count, pConvertor->stack_pos, pos_desc,
+                                       pStack->disp, (unsigned long)iov_len_local ); );
+            }
+            assert( OPAL_DATATYPE_LOOP != pElem->elem.common.type );
+            if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) {
+                ptrdiff_t local_disp = (ptrdiff_t)conv_ptr;
+                if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
+                    UNPACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc,
+                                            iov_ptr, conv_ptr, iov_len_local );
+                    if( 0 == count_desc ) {  /* completed */
+                        pos_desc += pElem->loop.items + 1;
+                        goto update_loop_description;
+                    }
+                    /* Save the stack with the correct last_count value. */
+                }
+                local_disp = (ptrdiff_t)conv_ptr - local_disp;
+                PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc,
+                            pStack->disp + local_disp);
+                pos_desc++;
+            update_loop_description:  /* update the current state */
+                conv_ptr = pConvertor->pBaseBuf + pStack->disp;
+                UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
+                DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" );
+                continue;
+            }
+        }
+    complete_loop:
+        iov[iov_count].iov_len -= iov_len_local;  /* update the amount of valid data */
+        total_unpacked += iov[iov_count].iov_len;
+    }
+ complete_conversion:
+    *max_data = total_unpacked;
+    pConvertor->bConverted += total_unpacked;  /* update the already converted bytes */
+    *out_size = iov_count;
+    if( pConvertor->bConverted == pConvertor->remote_size ) {
+        pConvertor->flags |= CONVERTOR_COMPLETED;
+        return 1;
+    }
+    /* Save the global position for the next round */
+    PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc,
+                conv_ptr - pConvertor->pBaseBuf );
+    DO_DEBUG( opal_output( 0, "unpack save stack stack_pos %d pos_desc %d count_desc %" PRIsize_t " disp %ld\n",
+                           pConvertor->stack_pos, pStack->index, pStack->count, (long)pStack->disp ); );
+    return 0;
+}
+#endif
+
 /*
  *  Remember that the first item in the stack (ie. position 0) is the number
  * of times the datatype is involved in the operation (ie. the count argument
diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h
index d837aad..501edc5 100644
--- a/opal/datatype/opal_datatype_unpack.h
+++ b/opal/datatype/opal_datatype_unpack.h
@@ -72,6 +72,40 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,  /* the convertor */
     *(COUNT)       -= _copy_count;
 }
 
+static inline void
+unpack_INT4( opal_convertor_t* CONVERTOR,  /* the convertor */
+             const dt_elem_desc_t* ELEM,   /* the element description */
+             size_t* COUNT,                /* the number of elements */
+             unsigned char** SOURCE,       /* the source pointer */
+             unsigned char** DESTINATION,  /* the destination pointer */
+             size_t* SPACE )               /* the space in the destination buffer */
+{
+    size_t _copy_count = *(COUNT);
+    size_t _copy_blength;
+    const ddt_elem_desc_t* _elem = &((ELEM)->elem);
+    unsigned char* _destination = (*DESTINATION) + _elem->disp;
+
+    _copy_blength = opal_datatype_basicDatatypes[_elem->common.type]->size;
+    assert((ptrdiff_t) _copy_blength != _elem->extent);
+    if( (_copy_count * _copy_blength) > *(SPACE) ) {
+        _copy_count = (*(SPACE) / _copy_blength);
+        if( 0 == _copy_count ) return;  /* nothing to do */
+    }
+
+    int *_d = (int *)_destination;
+    int *_s = (int *)(*(SOURCE));
+    for(size_t _i = 0; _i < _copy_count; _i++ ) {
+        *_d = *_s++;
+        _d += _elem->extent / sizeof(int);
+    }
+    _destination = (unsigned char *)_d;
+    *(SOURCE) = (unsigned char *)_s;
+_copy_blength *= _copy_count;
+    (*DESTINATION)  = _destination - _elem->disp;
+    *(SPACE)       -= _copy_blength;
+    *(COUNT)       -= _copy_count;
+}
+
 static inline void unpack_contiguous_loop( opal_convertor_t* CONVERTOR,
                                            const dt_elem_desc_t* ELEM,
                                            size_t* COUNT,
@@ -103,6 +137,9 @@ static inline void unpack_contiguous_loop( opal_convertor_t* CONVERTOR,
 #define UNPACK_PREDEFINED_DATATYPE( CONVERTOR, ELEM, COUNT, SOURCE, DESTINATION, SPACE ) \
     unpack_predefined_data( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) )
 
+#define UNPACK_INT4( CONVERTOR, ELEM, COUNT, SOURCE, DESTINATION, SPACE ) \
+    unpack_INT4( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) )
+
 #define UNPACK_CONTIGUOUS_LOOP( CONVERTOR, ELEM, COUNT, SOURCE, DESTINATION, SPACE ) \
     unpack_contiguous_loop( (CONVERTOR), (ELEM), &(COUNT), &(SOURCE), &(DESTINATION), &(SPACE) )
 ```


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

convertor pack/unpack performance with strided datatypes #6617

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

convertor pack/unpack performance with strided datatypes #6617

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions