Skip to content

Commit 3562d70

Browse files
committed
Get rid of the division in the critical path.
Amazing how bad instruction scheduling can have such a drastic impact on code performance. With this change, we get a boost of at least 50% in performance for data with a small blocklen and/or count. Signed-off-by: George Bosilca <[email protected]>
1 parent a802552 commit 3562d70

File tree

2 files changed

+38
-17
lines changed

2 files changed

+38
-17
lines changed

opal/datatype/opal_datatype_pack.h

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,19 +35,24 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
3535
size_t* SPACE )
3636
{
3737
const ddt_elem_desc_t* _elem = &((ELEM)->elem);
38-
size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size;
39-
size_t do_now, do_now_bytes;
4038
size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size;
39+
size_t cando_count = *(COUNT), do_now, do_now_bytes;
4140
unsigned char* _memory = (*memory) + _elem->disp;
4241
unsigned char* _packed = *packed;
4342

4443
assert( *(COUNT) <= _elem->count * _elem->blocklen);
4544

46-
if( cando_count > *(COUNT) )
47-
cando_count = *(COUNT);
45+
if( (blocklen_bytes * cando_count) > *(SPACE) )
46+
cando_count = (*SPACE) / blocklen_bytes;
4847

48+
do_now = *(COUNT); /* save the COUNT for later */
49+
/* premptively update the number of COUNT we will return. */
50+
*(COUNT) -= cando_count;
51+
52+
if( 1 == _elem->count ) { /* Everything is contiguous, handle it as a prologue */
53+
goto do_epilog;
54+
}
4955
if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */
50-
*(COUNT) -= cando_count;
5156
for(; cando_count > 0; cando_count--) {
5257
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
5358
(CONVERTOR)->pDesc, (CONVERTOR)->count );
@@ -59,17 +64,19 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
5964
}
6065
goto update_and_return;
6166
}
62-
blocklen_bytes *= _elem->blocklen;
6367

68+
blocklen_bytes *= _elem->blocklen;
69+
if( (_elem->count * _elem->blocklen) == cando_count ) {
70+
goto skip_prolog;
71+
}
6472
/**
6573
* First check if we already did something on this element ? The COUNT is the number
6674
* of remaining predefined types in the current elem, not how many predefined types
6775
* should be manipulated in the current call (this number is instead reflected on the
6876
* SPACE).
6977
*/
70-
do_now = *(COUNT) % _elem->blocklen; /* any partial elements ? */
71-
/* premptively update the number of COUNT we will return. */
72-
*(COUNT) -= cando_count;
78+
do_now = do_now % _elem->blocklen; /* any partial elements ? */
79+
7380
if( 0 != do_now ) {
7481
size_t left_in_block = do_now; /* left in the current blocklen */
7582
do_now = (do_now > cando_count ) ? cando_count : do_now;
@@ -88,6 +95,7 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
8895
cando_count -= do_now;
8996
}
9097

98+
skip_prolog:
9199
/* Do as many full blocklen as possible */
92100
for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) {
93101
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
@@ -104,6 +112,8 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
104112
* As an epilog do anything left from the last blocklen.
105113
*/
106114
if( 0 != cando_count ) {
115+
116+
do_epilog:
107117
assert( cando_count < _elem->blocklen );
108118
do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size;
109119
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,

opal/datatype/opal_datatype_unpack.h

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,19 +35,24 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,
3535
size_t* SPACE )
3636
{
3737
const ddt_elem_desc_t* _elem = &((ELEM)->elem);
38-
size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size;
39-
size_t do_now, do_now_bytes;
4038
size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size;
39+
size_t cando_count = (*COUNT), do_now, do_now_bytes;
4140
unsigned char* _memory = (*memory) + _elem->disp;
4241
unsigned char* _packed = *packed;
4342

4443
assert( *(COUNT) <= (_elem->count * _elem->blocklen));
4544

46-
if( cando_count > *(COUNT) )
47-
cando_count = *(COUNT);
45+
if( (blocklen_bytes * cando_count) > *(SPACE) )
46+
cando_count = (*SPACE) / blocklen_bytes;
4847

48+
do_now = *(COUNT); /* save the COUNT for later */
49+
/* premptively update the number of COUNT we will return. */
50+
*(COUNT) -= cando_count;
51+
52+
if( 1 == _elem->count ) { /* Everything is contiguous, handle it as a prologue */
53+
goto do_epilog;
54+
}
4955
if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */
50-
*(COUNT) -= cando_count;
5156
for(; cando_count > 0; cando_count--) {
5257
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
5358
(CONVERTOR)->pDesc, (CONVERTOR)->count );
@@ -59,17 +64,20 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,
5964
}
6065
goto update_and_return;
6166
}
67+
6268
blocklen_bytes *= _elem->blocklen;
69+
if( (_elem->count * _elem->blocklen) == cando_count ) {
70+
goto skip_prolog;
71+
}
6372

6473
/**
6574
* First check if we already did something on this element ? The COUNT is the number
6675
* of remaining predefined types in the current elem, not how many predefined types
6776
* should be manipulated in the current call (this number is instead reflected on the
6877
* SPACE).
6978
*/
70-
do_now = *(COUNT) % _elem->blocklen; /* any partial elements ? */
71-
/* premptively update the number of COUNT we will return. */
72-
*(COUNT) -= cando_count;
79+
do_now = do_now % _elem->blocklen; /* any partial elements ? */
80+
7381
if( 0 != do_now ) {
7482
size_t left_in_block = do_now; /* left in the current blocklen */
7583
do_now = (do_now > cando_count ) ? cando_count : do_now;
@@ -88,6 +96,7 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,
8896
cando_count -= do_now;
8997
}
9098

99+
skip_prolog:
91100
/* Do as many full blocklen as possible */
92101
for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) {
93102
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
@@ -104,6 +113,8 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,
104113
* As an epilog do anything left from the last blocklen.
105114
*/
106115
if( 0 != cando_count ) {
116+
117+
do_epilog:
107118
assert( cando_count < _elem->blocklen );
108119
do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size;
109120
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,

0 commit comments

Comments
 (0)