|
14 | 14 | #ifndef OPAL_DATATYPE_MEMCPY_H_HAS_BEEN_INCLUDED
|
15 | 15 | #define OPAL_DATATYPE_MEMCPY_H_HAS_BEEN_INCLUDED
|
16 | 16 |
|
17 |
| -#define MEMCPY( DST, SRC, BLENGTH ) \ |
18 |
| - memcpy( (DST), (SRC), (BLENGTH) ) |
/*
 * MEMCPY( DST, SRC, BLENGTH ): copy BLENGTH bytes from SRC to DST.
 * The two regions must not overlap (same contract as memcpy()).
 *
 * This macro is called whenever we are packing/unpacking a DDT that
 * is built with basic datatypes.
 * Specifying a fixed size for the memcpy() makes the Intel compiler
 * inline it as an assignment operation; doing this we can divide the
 * latency by up to 2 during DDT exchanges.
 *
 * The 4, 8 and 16 byte cases are therefore dispatched to memcpy() calls
 * whose length is a literal constant. Going through memcpy() rather than
 * through casted int/double/long double pointers keeps the copy valid for
 * any alignment of DST and SRC, does not break strict aliasing, and always
 * moves exactly BLENGTH bytes whatever the size or padding of the basic
 * C types on the platform.
 *
 * Each argument is evaluated exactly once, so expressions with side
 * effects (e.g. MEMCPY( p++, q, n )) behave as with a function call.
 */
#define MEMCPY( DST, SRC, BLENGTH )                                           \
    do {                                                                      \
        void *opal_memcpy_dst_ = (DST);                                       \
        const void *opal_memcpy_src_ = (SRC);                                 \
        const size_t opal_memcpy_len_ = (size_t)(BLENGTH);                    \
        switch( opal_memcpy_len_ ) {                                          \
        case 4:   /* constant length: inlined as a single 4 bytes move */     \
            memcpy( opal_memcpy_dst_, opal_memcpy_src_, 4 );                  \
            break;                                                            \
        case 8:   /* constant length: inlined as a single 8 bytes move */     \
            memcpy( opal_memcpy_dst_, opal_memcpy_src_, 8 );                  \
            break;                                                            \
        case 16:  /* constant length: all 16 bytes are copied */              \
            memcpy( opal_memcpy_dst_, opal_memcpy_src_, 16 );                 \
            break;                                                            \
        default:  /* any other length: regular memcpy() */                    \
            memcpy( opal_memcpy_dst_, opal_memcpy_src_, opal_memcpy_len_ );   \
            break;                                                            \
        }                                                                     \
    } while (0)
19 | 42 |
|
20 | 43 | #endif /* OPAL_DATATYPE_MEMCPY_H_HAS_BEEN_INCLUDED */
|
0 commit comments