/**
 * Reverse the byte order of every element of an array, in place.
 *
 * @param buf_p  start of the array; it holds count elements of size bytes
 *               each, stored back to back
 * @param size   size in bytes of one element
 * @param count  number of elements to process
 *
 * Each element is reversed by exchanging its bytes pairwise from both ends,
 * so no scratch buffer is needed and there is no upper limit on size.
 * The buffer is left untouched when count is 0 or when size is smaller
 * than 2 (a 0- or 1-byte element is its own reverse).
 */
static inline void opal_dt_swap_bytes_inplace(void *buf_p, const size_t size, size_t count)
{
    uint8_t *elem = (uint8_t *) buf_p;
    size_t lo, hi;
    uint8_t tmp;

    if (size < 2) {
        return;
    }

    for (; count > 0; count--, elem += size) {
        /* walk inwards from both ends of the current element */
        for (lo = 0, hi = size - 1; lo < hi; lo++, hi--) {
            tmp = elem[lo];
            elem[lo] = elem[hi];
            elem[hi] = tmp;
        }
    }
}
/*
 * Bit-level views of the floating point formats converted below.  The
 * bit-field order is selected by WORDS_BIGENDIAN so that, on the local
 * machine, "sign", "exp" and the "fracN" words (frac0 is the least
 * significant one) name the corresponding parts of the stored value.
 */

/* 64-bit format: 1 sign bit, 11 exponent bits, 52 fraction bits */
union fp_float64
{
    double value;
    struct {
#if defined(WORDS_BIGENDIAN)
        unsigned sign : 1;
        unsigned exp : 11;
        unsigned frac1 : 20;
        unsigned frac0 : 32;
#else
        unsigned frac0 : 32;
        unsigned frac1 : 20;
        unsigned exp : 11;
        unsigned sign : 1;
#endif
    } bits;
    char bytes[sizeof(double)];
};

/*
 * 80-bit x86 extended format: 1 sign bit, 15 exponent bits and a 64-bit
 * significand whose top bit (bit 31 of frac1) is the explicit integer bit.
 * "pad" covers 16 storage bits that carry no value.
 */
union fp_float80
{
    long double value;
    struct {
#if defined(WORDS_BIGENDIAN)
        unsigned sign : 1;
        unsigned exp : 15;
        unsigned pad : 16;
        unsigned frac1 : 32;
        unsigned frac0 : 32;
#else
        unsigned frac0 : 32;
        unsigned frac1 : 32;
        unsigned exp : 15;
        unsigned sign : 1;
        unsigned pad : 16;
#endif
    } bits;
    char bytes[sizeof(long double)];
};

/* 128-bit format: 1 sign bit, 15 exponent bits, 112 fraction bits */
union fp_float128
{
    /*__float128 value;*/
    struct {
#if defined(WORDS_BIGENDIAN)
        unsigned sign : 1;
        unsigned exp : 15;
        unsigned frac3 : 16;
        unsigned frac2 : 32;
        unsigned frac1 : 32;
        unsigned frac0 : 32;
#else
        unsigned frac0 : 32;
        unsigned frac1 : 32;
        unsigned frac2 : 32;
        unsigned frac3 : 16;
        unsigned exp : 15;
        unsigned sign : 1;
#endif
    } bits;
    char bytes[16];
};

/* Exponent parameters used by the f64 <-> f128 conversions. */
enum {
    FP_F64_EXP_SPECIAL = 0x7ff,   /* all-ones f64 exponent: Inf or NaN */
    FP_F128_EXP_SPECIAL = 0x7fff, /* all-ones f128 exponent: Inf or NaN */
    FP_EXP_REBIAS = 16383 - 1023, /* f128 bias minus f64 bias */
    FP_F64_FRAC_BITS = 52         /* stored fraction bits of an f64 */
};

/**
 * f64_to_f128 (copies a float64(local_endian) to a float128(local_endian))
 *
 * @param f128_buf_to   destination; receives count packed 16-byte values
 * @param f64_buf_from  source; one 8-byte value every from_extent bytes
 * @param count         number of values; nothing is done when count <= 0
 * @param from_extent   distance in bytes between consecutive source values
 *
 * Every f64 value is exactly representable in f128:
 *  - zero keeps a zero exponent and fraction,
 *  - Inf/NaN (all-ones exponent) map to the all-ones f128 exponent,
 *  - subnormals are normalised, because they are normal numbers in f128,
 *  - everything else only needs the exponent re-biased.
 * Values are read and written with memcpy, so neither buffer needs any
 * particular alignment.
 */
static inline
void
f64_to_f128(unsigned char *f128_buf_to, const unsigned char *f64_buf_from, ssize_t count,
            ptrdiff_t from_extent)
{
    const uint64_t hidden_bit = (uint64_t) 1 << FP_F64_FRAC_BITS;
    union fp_float64 ud;
    union fp_float128 uq;
    uint32_t e, f0, f1;
    uint64_t m;

    while (count > 0) {
        /* input */
        memcpy(ud.bytes, f64_buf_from, sizeof(ud.bytes));

        /* unpack */
        e = ud.bits.exp;
        f0 = ud.bits.frac0;
        f1 = ud.bits.frac1;

        /* exponent */
        if (FP_F64_EXP_SPECIAL == e) {
            /* Inf / NaN must stay Inf / NaN */
            e = FP_F128_EXP_SPECIAL;
        } else if (0 != e) {
            e += FP_EXP_REBIAS;
        } else if (0 != (f0 | f1)) {
            /* subnormal: shift the fraction up until the hidden bit
             * appears, lowering the exponent by one for each shift */
            m = ((uint64_t) f1 << 32) | f0;
            e = FP_EXP_REBIAS + 1;
            while (0 == (m & hidden_bit)) {
                m <<= 1;
                e--;
            }
            f1 = (uint32_t) (m >> 32) & 0xFFFFFu; /* drops the hidden bit */
            f0 = (uint32_t) m;
        }

        /* extend the 52-bit fraction to 112 bits and pack */
        uq.bits.sign = ud.bits.sign;
        uq.bits.exp = e;
        uq.bits.frac3 = f1 >> 4;
        uq.bits.frac2 = (f1 << 28) | (f0 >> 4);
        uq.bits.frac1 = f0 << 28;
        uq.bits.frac0 = 0;

        /* output */
        memcpy(f128_buf_to, uq.bytes, sizeof(uq));

        f64_buf_from += from_extent;
        f128_buf_to += sizeof(uq);
        count--;
    }
}

/**
 * f80_to_f128 (copies an intel80(local_endian) to a float128(local_endian))
 *
 * @param f128_buf_to   destination; receives count packed 16-byte values
 * @param f80_buf_from  source; one value every from_extent bytes, of which
 *                      sizeof(union fp_float80) bytes are read
 * @param count         number of values; nothing is done when count <= 0
 * @param from_extent   distance in bytes between consecutive source values
 *
 * Both formats use a 15-bit exponent, which is copied unchanged.  The
 * explicit integer bit of the f80 significand is dropped and the remaining
 * 63 fraction bits become the top of the 112-bit f128 fraction.
 */
static inline
void
f80_to_f128(unsigned char *f128_buf_to, const unsigned char *f80_buf_from, ssize_t count,
            ptrdiff_t from_extent)
{
    const uint32_t integer_bit = (uint32_t) 1 << 31;
    union fp_float80 ul;
    union fp_float128 uq;
    uint32_t f0, f1;

    while (count > 0) {
        /* input */
        memcpy(&ul, f80_buf_from, sizeof(ul));

        /* unpack, dropping the explicit integer bit */
        f0 = ul.bits.frac0;
        f1 = ul.bits.frac1 & ~integer_bit;

        /* extend and pack */
        uq.bits.sign = ul.bits.sign;
        uq.bits.exp = ul.bits.exp;
        uq.bits.frac3 = f1 >> 15;
        uq.bits.frac2 = (f1 << 17) | (f0 >> 15);
        uq.bits.frac1 = f0 << 17;
        uq.bits.frac0 = 0;

        /* output */
        memcpy(f128_buf_to, uq.bytes, sizeof(uq));

        f80_buf_from += from_extent;
        f128_buf_to += sizeof(uq);
        count--;
    }
}

/**
 * f128_to_f64 (copies a float128(local_endian) to a float64(local_endian))
 *
 * @param f64_buf_to     destination; one 8-byte value every to_extent bytes
 * @param f128_buf_from  source; count packed 16-byte values
 * @param count          number of values; nothing is done when count <= 0
 * @param to_extent      distance in bytes between consecutive destinations
 *
 * The fraction is truncated (not rounded) to 52 bits.  Out-of-range
 * exponents are handled explicitly:
 *  - Inf stays Inf, NaN stays NaN (a fraction bit is forced if truncation
 *    would otherwise turn the NaN into Inf),
 *  - values too large for f64 become Inf,
 *  - values below the f64 normal range become f64 subnormals, or zero
 *    once all significant bits are shifted out.
 */
static inline
void
f128_to_f64(unsigned char *f64_buf_to, const unsigned char *f128_buf_from, ssize_t count,
            ptrdiff_t to_extent)
{
    const uint64_t hidden_bit = (uint64_t) 1 << FP_F64_FRAC_BITS;
    union fp_float64 ud;
    union fp_float128 uq;
    uint32_t e, q0, q1, q2, q3, f0, f1, shift;
    uint64_t m;

    while (count > 0) {
        /* input */
        memcpy(uq.bytes, f128_buf_from, sizeof(uq));

        /* unpack */
        e = uq.bits.exp;
        q0 = uq.bits.frac0;
        q1 = uq.bits.frac1;
        q2 = uq.bits.frac2;
        q3 = uq.bits.frac3;

        /* truncate the fraction to its 52 most significant bits */
        f1 = (q3 << 4) | (q2 >> 28);
        f0 = (q2 << 4) | (q1 >> 28);

        /* exponent */
        if (FP_F128_EXP_SPECIAL == e) {
            e = FP_F64_EXP_SPECIAL;
            if (0 == (f0 | f1) && 0 != (q0 | q1 | q2 | q3)) {
                f1 = 0x80000u; /* NaN whose payload was truncated away */
            }
        } else if (e >= (uint32_t) (FP_EXP_REBIAS + FP_F64_EXP_SPECIAL)) {
            /* too large for f64: infinity */
            e = FP_F64_EXP_SPECIAL;
            f0 = 0;
            f1 = 0;
        } else if (e > (uint32_t) FP_EXP_REBIAS) {
            e -= FP_EXP_REBIAS;
        } else {
            /* below the f64 normal range (includes f128 zero/subnormal) */
            shift = (uint32_t) FP_EXP_REBIAS + 1 - e;
            if (shift <= (uint32_t) FP_F64_FRAC_BITS) {
                m = (hidden_bit | ((uint64_t) f1 << 32) | f0) >> shift;
                f1 = (uint32_t) (m >> 32);
                f0 = (uint32_t) m;
            } else {
                f0 = 0;
                f1 = 0;
            }
            e = 0;
        }

        /* pack */
        ud.bits.sign = uq.bits.sign;
        ud.bits.exp = e;
        ud.bits.frac0 = f0;
        ud.bits.frac1 = f1;

        /* output */
        memcpy(f64_buf_to, ud.bytes, sizeof(ud));

        f64_buf_to += to_extent;
        f128_buf_from += sizeof(uq);
        count--;
    }
}

/**
 * f128_to_f80 (copies a float128(local_endian) to an intel80(local_endian))
 *
 * @param f80_buf_to     destination; one value every to_extent bytes, of
 *                       which sizeof(union fp_float80) bytes are written
 * @param f128_buf_from  source; count packed 16-byte values
 * @param count          number of values; nothing is done when count <= 0
 * @param to_extent      distance in bytes between consecutive destinations
 *
 * The exponent is copied unchanged and the fraction truncated (not
 * rounded) to 63 bits.  The explicit integer bit is set for every value
 * with a non-zero exponent.  Storage bits that carry no value are written
 * as zero.
 */
static inline
void
f128_to_f80(unsigned char *f80_buf_to, const unsigned char *f128_buf_from, ssize_t count,
            ptrdiff_t to_extent)
{
    const uint32_t integer_bit = (uint32_t) 1 << 31;
    union fp_float80 ul;
    union fp_float128 uq;
    uint32_t e, q1, q2, q3, f0, f1;

    /* the pad bits are never assigned below; clear them once so that no
     * indeterminate bytes are copied to the destination */
    memset(&ul, 0, sizeof(ul));

    while (count > 0) {
        /* input */
        memcpy(uq.bytes, f128_buf_from, sizeof(uq));

        /* unpack */
        e = uq.bits.exp;
        q1 = uq.bits.frac1;
        q2 = uq.bits.frac2;
        q3 = uq.bits.frac3;

        /* truncate; bit 31 of f1 is left free for the integer bit */
        f1 = (q3 << 15) | (q2 >> 17);
        f0 = (q2 << 15) | (q1 >> 17);

        /* explicit integer bit */
        if (0 != e) {
            f1 |= integer_bit;
        }

        /* pack */
        ul.bits.sign = uq.bits.sign;
        ul.bits.exp = e;
        ul.bits.frac0 = f0;
        ul.bits.frac1 = f1;

        /* output */
        memcpy(f80_buf_to, &ul, sizeof(ul));

        f80_buf_to += to_extent;
        f128_buf_from += sizeof(uq);
        count--;
    }
}

/*
 * Classification of the long double described by an architecture word,
 * based on its mantissa-digits and exponent-size fields.
 */
#define LDBL_IS_F64(arch)                                                \
    (                                                                    \
        (((arch) & OPAL_ARCH_LDMANTDIGISxx) == OPAL_ARCH_LDMANTDIGIS53)  \
        &&                                                               \
        (((arch) & OPAL_ARCH_LDEXPSIZEISxx) == OPAL_ARCH_LDEXPSIZEIS11)  \
    )
#define LDBL_IS_F80(arch)                                                \
    (                                                                    \
        (((arch) & OPAL_ARCH_LDMANTDIGISxx) == OPAL_ARCH_LDMANTDIGIS64)  \
        &&                                                               \
        (((arch) & OPAL_ARCH_LDEXPSIZEISxx) == OPAL_ARCH_LDEXPSIZEIS15)  \
    )
#define LDBL_IS_F128(arch)                                               \
    (                                                                    \
        (((arch) & OPAL_ARCH_LDMANTDIGISxx) == OPAL_ARCH_LDMANTDIGIS113) \
        &&                                                               \
        (((arch) & OPAL_ARCH_LDEXPSIZEISxx) == OPAL_ARCH_LDEXPSIZEIS15)  \
    )
/* the two architecture-word fields that together identify the format */
#define LDBL_INFO_MASK (OPAL_ARCH_LDMANTDIGISxx | OPAL_ARCH_LDEXPSIZEISxx)
#ifdef HAVE___FLOAT128
/*
 * Alignment assumed for a long double: the largest power of 2 that divides
 * sizeof(long double) (e.g. 4 when sizeof(long double) == 12).  This is
 * derived from the size so that no alignof() support is required.
 *
 * For an unsigned x, "x & (~x + 1)" isolates the lowest set bit of x, which
 * is exactly that power of 2; the expression is a compile-time constant.
 *
 * The two converters below copy through memcpy and therefore do not need
 * this helper themselves.
 */
static inline
size_t
alignment_of_long_double(void) {
    return sizeof(long double) & (~sizeof(long double) + 1);
}
#endif

/**
 * ldbl_to_f128 (copies a long double(from_arch format) to a float128(local_endian))
 *
 * @param f128_buf_to    destination; receives count packed 16-byte values
 * @param ldbl_buf_from  source; one long double every from_extent bytes
 * @param count          number of values; nothing is done when count <= 0
 * @param from_arch      architecture word describing the source format
 *                       (only consulted when __float128 is unavailable)
 * @param from_extent    distance in bytes between consecutive source values
 *
 * With __float128 support the compiler performs the conversion from the
 * local long double.  Without it, the source format is taken from from_arch
 * and converted by f64_to_f128 / f80_to_f128.
 *
 * The source and destination buffers may be the same buffer.
 */
static inline
void
ldbl_to_f128(unsigned char *f128_buf_to, const unsigned char *ldbl_buf_from, ssize_t count,
             int from_arch, ptrdiff_t from_extent)
{
#ifdef HAVE___FLOAT128
    __float128 quad;
    long double native;

    (void) from_arch;

    /* going through locals with memcpy gives the same result for aligned
     * and unaligned buffers and for overlapping source/destination */
    for (; count > 0; count--) {
        memcpy(&native, ldbl_buf_from, sizeof(native));
        quad = native;
        memcpy(f128_buf_to, &quad, sizeof(quad));

        ldbl_buf_from += from_extent;
        f128_buf_to += sizeof(quad);
    }
#else
    if (count <= 0) {
        return;
    }

    if (LDBL_IS_F64(from_arch)) {
        f64_to_f128(f128_buf_to, ldbl_buf_from, count, from_extent);
    } else if (LDBL_IS_F80(from_arch)) {
        f80_to_f128(f128_buf_to, ldbl_buf_from, count, from_extent);
    } else {
/*
 * This could be an error condition, eg we're trying to process a
 * long double from a format that isn't f128 (or doesn't appear to be)
 * into f128.  But I think the reason not to error out is confidence
 * in the detection.  I wouldn't want to produce a false failure.
 *
 * The bytes are passed through unchanged.  The number of bytes copied is
 * the size of one value, never the stride: from_extent may be larger than
 * a value (strided data) or negative.
 * NOTE(review): sizeof(long double) is assumed to be the size of one
 * source value here, capped at the 16-byte destination slot -- confirm.
 */
        const size_t nbytes = sizeof(long double) < 16 ? sizeof(long double) : 16;

        for (; count > 0; count--) {
            /* memmove: source and destination may be the same buffer */
            memmove(f128_buf_to, ldbl_buf_from, nbytes);

            ldbl_buf_from += from_extent;
            f128_buf_to += 16;
        }
    }
#endif
}

/**
 * f128_to_ldbl (copies a float128(local_endian) to a long double(to_arch format))
 *
 * @param ldbl_buf_to    destination; one long double every to_extent bytes
 * @param f128_buf_from  source; count packed 16-byte values
 * @param count          number of values; nothing is done when count <= 0
 * @param to_arch        architecture word describing the destination format
 *                       (only consulted when __float128 is unavailable)
 * @param to_extent      distance in bytes between consecutive destinations
 *
 * With __float128 support the compiler performs the conversion to the local
 * long double.  Without it, the destination format is taken from to_arch
 * and produced by f128_to_f64 / f128_to_f80.
 *
 * The source and destination buffers may be the same buffer.
 */
static inline
void
f128_to_ldbl(unsigned char *ldbl_buf_to, const unsigned char *f128_buf_from, ssize_t count,
             int to_arch, ptrdiff_t to_extent)
{
#ifdef HAVE___FLOAT128
    __float128 quad;
    long double native;

    (void) to_arch;

    for (; count > 0; count--) {
        memcpy(&quad, f128_buf_from, sizeof(quad));
        native = quad;
        memcpy(ldbl_buf_to, &native, sizeof(native));

        ldbl_buf_to += to_extent;
        f128_buf_from += sizeof(quad);
    }
#else
    if (count <= 0) {
        return;
    }

    if (LDBL_IS_F64(to_arch)) {
        f128_to_f64(ldbl_buf_to, f128_buf_from, count, to_extent);
    } else if (LDBL_IS_F80(to_arch)) {
        f128_to_f80(ldbl_buf_to, f128_buf_from, count, to_extent);
    } else {
/*
 * This could be an error condition, eg we're trying to process an
 * f128 into a long double of a format that isn't f128 (or doesn't
 * appear to be).  But I think the reason not to error out is confidence
 * in the detection.  I wouldn't want to produce a false failure.
 *
 * The bytes are passed through unchanged.  The number of bytes copied is
 * the size of one value, never the stride: to_extent may be larger than a
 * value (strided data) or negative.
 * NOTE(review): sizeof(long double) is assumed to be the size of one
 * destination value here, capped at the 16-byte source slot -- confirm.
 */
        const size_t nbytes = sizeof(long double) < 16 ? sizeof(long double) : 16;

        for (; count > 0; count--) {
            /* memmove: source and destination may be the same buffer */
            memmove(ldbl_buf_to, f128_buf_from, nbytes);

            ldbl_buf_to += to_extent;
            f128_buf_from += 16;
        }
    }
#endif
}
/**
 * BEWARE: Do not use the following macro with composed types such as
 * complex: the swap is done using the entire type sizeof.
 */
#define COPY_TYPE_HETEROGENEOUS(TYPENAME, TYPE) COPY_TYPE_HETEROGENEOUS_INTERNAL(TYPENAME, TYPE, 0)

/*
 * Summarizing the logic of the pFunc copy functions
 * with regard to long doubles:
 *
 * For terminology I'll use
 *   f64  : float64, which some architectures use as their long double
 *   f80  : x86 double extended format that uses 80 bits, commonly used
 *          for long double
 *   f128 : IEEE quad precision, sometimes available as __float128
 *
 * if !LONG_DOUBLE or both architectures have the same long double format:
 *     byte swap based on local/remote endianness differing
 * else:
 *     if from_arch is not local endianness: byte swap to local endianness
 *     if from_arch isn't f128 : ldbl_to_f128
 *         if we have __float128    : convert to __float128
 *         else if from_arch LDBL is f80 : f80_to_f128
 *         else if from_arch LDBL is f64 : f64_to_f128
 *     if to_arch isn't f128 : f128_to_ldbl
 *         if we have __float128    : convert from __float128 to long double
 *         else if to_arch LDBL is f80 : f128_to_f80
 *         else if to_arch LDBL is f64 : f128_to_f64
 *     if to_arch is not local endianness : byte swap
 *
 * And for all the above conversions the logic for handling a size
 * difference between the from/to type is the same:
 *     if (to_extent == from_extent == sizeof(TYPE))
 *         process all count elements as one block
 *     else
 *         loop i=0..count-1
 *             process one element
 *             to += to_extent;
 *             from += from_extent;
 * so that's handled by an outer loop over blocks.
 *
 * Implementation notes:
 *  - The outer loop runs zero times when count is 0.
 *  - Inside one block the elements are adjacent, so the element stride
 *    handed to ldbl_to_f128 / f128_to_ldbl is sizeof(TYPE); from_extent
 *    and to_extent are only used to step from one block to the next.
 *  - The intermediate f128 value is built in the destination buffer.
 *    NOTE(review): this presumably relies on sizeof(TYPE) being at least
 *    16 for the long double case -- confirm.
 */
#define COPY_TYPE_HETEROGENEOUS_INTERNAL(TYPENAME, TYPE, LONG_DOUBLE) \
    static int32_t copy_##TYPENAME##_heterogeneous(opal_convertor_t *pConvertor, size_t count, \
                                                   const char *from, size_t from_len, \
                                                   ptrdiff_t from_extent, char *to, \
                                                   size_t to_length, ptrdiff_t to_extent, \
                                                   ptrdiff_t *advance) \
    { \
        const ptrdiff_t elem_size = (ptrdiff_t) sizeof(TYPE); \
        size_t countperblock, nblocksleft; \
        int from_arch, to_arch; \
        if (pConvertor->flags & CONVERTOR_SEND_CONVERSION) { /* pack */ \
            from_arch = opal_local_arch; \
            to_arch = pConvertor->remoteArch; \
        } else { /* unpack */ \
            from_arch = pConvertor->remoteArch; \
            to_arch = opal_local_arch; \
        } \
        datatype_check(#TYPE, sizeof(TYPE), sizeof(TYPE), &count, from, from_len, from_extent, to, \
                       to_length, to_extent); \
        if ((to_extent == from_extent) && (to_extent == elem_size)) { \
            /* both sides contiguous: one block holding every element */ \
            countperblock = count; \
            nblocksleft = (count > 0) ? 1 : 0; \
        } else { \
            /* source or destination are non-contiguous */ \
            countperblock = 1; \
            nblocksleft = count; \
        } \
        for (; nblocksleft > 0; nblocksleft--) { \
            if (!(LONG_DOUBLE) || ((from_arch & LDBL_INFO_MASK) == (to_arch & LDBL_INFO_MASK))) { \
                /* same representation: at most the byte order differs */ \
                if ((from_arch & OPAL_ARCH_ISBIGENDIAN) != (to_arch & OPAL_ARCH_ISBIGENDIAN)) { \
                    opal_dt_swap_bytes(to, from, sizeof(TYPE), countperblock); \
                } else { \
                    MEMCPY(to, from, countperblock * sizeof(TYPE)); \
                } \
            } else { \
                /* tmp_from follows the data: it is "from" until a step */ \
                /* has written its result into "to" */ \
                const char *tmp_from = from; \
                if ((from_arch & OPAL_ARCH_ISBIGENDIAN) \
                    != (opal_local_arch & OPAL_ARCH_ISBIGENDIAN)) { \
                    opal_dt_swap_bytes(to, tmp_from, sizeof(TYPE), countperblock); \
                    tmp_from = to; \
                } \
                if (!LDBL_IS_F128(from_arch)) { \
                    ldbl_to_f128((unsigned char *) to, (const unsigned char *) tmp_from, \
                                 countperblock, from_arch, elem_size); \
                    tmp_from = to; \
                } \
                if (!LDBL_IS_F128(to_arch)) { \
                    f128_to_ldbl((unsigned char *) to, (const unsigned char *) tmp_from, \
                                 countperblock, to_arch, elem_size); \
                    tmp_from = to; \
                } \
                if ((to_arch & OPAL_ARCH_ISBIGENDIAN) \
                    != (opal_local_arch & OPAL_ARCH_ISBIGENDIAN)) { \
                    if (tmp_from == from) { \
                        opal_dt_swap_bytes(to, from, sizeof(TYPE), countperblock); \
                    } else { \
                        opal_dt_swap_bytes_inplace(to, sizeof(TYPE), countperblock); \
                    } \
                } \
            } \
            to += to_extent; \
            from += from_extent; \
        } \
        *advance = count * from_extent; \
        return count; \
    }
else { \ + countperblock = 2; \ + nblocksleft = count; \ + } \ + do { \ \ - if ((pConvertor->remoteArch & OPAL_ARCH_ISBIGENDIAN) \ - != (opal_local_arch & OPAL_ARCH_ISBIGENDIAN)) { \ - if ((to_extent == from_extent) && (to_extent == (2 * sizeof(TYPE)))) { \ - opal_dt_swap_bytes(to, from, sizeof(TYPE), 2 * count); \ - if (LONG_DOUBLE) { \ - opal_dt_swap_long_double(to, from, sizeof(TYPE), 2 * count, \ - pConvertor->remoteArch); \ + if (!(LONG_DOUBLE) || ((from_arch & LDBL_INFO_MASK) == (to_arch & LDBL_INFO_MASK))) { \ + if ((from_arch & OPAL_ARCH_ISBIGENDIAN) \ + != (to_arch & OPAL_ARCH_ISBIGENDIAN)) \ + { \ + opal_dt_swap_bytes(to, from, sizeof(TYPE), countperblock); \ + } else { \ + MEMCPY(to, from, countperblock * sizeof(TYPE)); \ } \ } else { \ - for (i = 0; i < count; i++) { \ - opal_dt_swap_bytes(to, from, sizeof(TYPE), 2); \ - if (LONG_DOUBLE) { \ - opal_dt_swap_long_double(to, from, sizeof(TYPE), 2, \ - pConvertor->remoteArch); \ + const char *tmp_from = from; \ + if ((from_arch & OPAL_ARCH_ISBIGENDIAN) \ + != (opal_local_arch & OPAL_ARCH_ISBIGENDIAN)) \ + { \ + opal_dt_swap_bytes(to, tmp_from, sizeof(TYPE), countperblock); \ + tmp_from = to; \ + } \ + if (!LDBL_IS_F128(from_arch)) { \ + ldbl_to_f128((unsigned char*)to, (const unsigned char*)tmp_from, \ + countperblock, from_arch, from_extent/2); \ + tmp_from = to; \ + } \ + if (!LDBL_IS_F128(to_arch)) { \ + f128_to_ldbl((unsigned char*)to, (const unsigned char*)tmp_from, \ + countperblock, to_arch, to_extent/2); \ + tmp_from = to; \ + } \ + if ((to_arch & OPAL_ARCH_ISBIGENDIAN) \ + != (opal_local_arch & OPAL_ARCH_ISBIGENDIAN)) \ + { \ + if (tmp_from == from) { \ + opal_dt_swap_bytes(to, from, sizeof(TYPE), countperblock); \ + } else { \ + opal_dt_swap_bytes_inplace(to, sizeof(TYPE), countperblock); \ } \ - to += to_extent; \ - from += from_extent; \ } \ } \ - } else if ((ptrdiff_t) sizeof(TYPE) == to_extent \ - && (ptrdiff_t) sizeof(TYPE) == from_extent) { \ - MEMCPY(to, from, count * sizeof(TYPE)); \ - } 
else { \ - /* source or destination are non-contiguous */ \ - for (i = 0; i < count; i++) { \ - MEMCPY(to, from, sizeof(TYPE)); \ - to += to_extent; \ - from += from_extent; \ - } \ - } \ + \ + to += to_extent; \ + from += from_extent; \ + nblocksleft--; \ + } while (nblocksleft > 0); \ + \ *advance = count * from_extent; \ return count; \ } @@ -244,7 +826,14 @@ static inline void opal_dt_swap_long_double(void *to_p, const void *from_p, cons ptrdiff_t *advance) \ { \ size_t i; \ - \ + int from_arch, to_arch ; \ + if (pConvertor->flags & CONVERTOR_SEND_CONVERSION) { /* pack */ \ + from_arch = opal_local_arch; \ + to_arch = pConvertor->remoteArch; \ + } else { /* unpack */ \ + from_arch = pConvertor->remoteArch; \ + to_arch = opal_local_arch; \ + } \ datatype_check(#TYPENAME, sizeof(TYPE1) + sizeof(TYPE2), sizeof(TYPE1) + sizeof(TYPE2), \ &count, from, from_len, from_extent, to, to_length, to_extent); \ \ diff --git a/opal/util/arch.h b/opal/util/arch.h index 0bfa1e76767..b5d9050f6ef 100644 --- a/opal/util/arch.h +++ b/opal/util/arch.h @@ -162,7 +162,7 @@ ** To store this in a 32 bit integer, we use the following definition: ** ** 1 2 3 4 -** 12345678 12345678 12345678 12345678 +** 87654321 87654321 87654321 87654321 ** ** 1. Byte: ** bits 1 & 2: 00 (header) (to recognize the correct end) @@ -177,8 +177,8 @@ ** 3. Byte: ** bits 1 & 2: length of long double: 00=64, 01=96,10 = 128 ** bits 3 & 4: no. of rel. bits in the exponent: 00 = 10, 01 = 14) -** bits 5 - 7: no. of bits of mantisse ( 000 = 53, 001 = 64, 010 = 105, -** 011 = 106, 100 = 107,101 = 113 ) +** bits 5 - 7: no. of bits of mantisse ( 000 = 53, 001 = 64, 010 = 105, +** 011 = 106, 100 = 107, 101 = 113 ) ** bit 8: intel or sparc representation of mantisse (0 = sparc, ** 1 = intel ) ** 4. 
Byte: @@ -198,8 +198,9 @@ #define OPAL_ARCH_ISBIGENDIAN 0x00000008 /* BYTE 2 */ -#define OPAL_ARCH_LONGISxx 0x0000c000 /* mask for sizeof long */ -#define OPAL_ARCH_LONGIS64 0x00001000 +#define OPAL_ARCH_LONGISxx 0x00004000 /* mask for sizeof long */ +#define OPAL_ARCH_LONGIS32 0x00000000 +#define OPAL_ARCH_LONGIS64 0x00004000 #define OPAL_ARCH_LONGLONGISxx 0x00003000 /* mask for sizeof long long */ #define OPAL_ARCH_BOOLISxx 0x00000c00 /* mask for sizeof bool */ @@ -213,15 +214,22 @@ #define OPAL_ARCH_LOGICALIS32 0x00000200 /* logical is 32 bits */ /* BYTE 3 */ -#define OPAL_ARCH_LONGDOUBLEIS96 0x00020000 -#define OPAL_ARCH_LONGDOUBLEIS128 0x00010000 +#define OPAL_ARCH_LONGDOUBLEISxx 0x00030000 +#define OPAL_ARCH_LONGDOUBLEIS64 0x00000000 +#define OPAL_ARCH_LONGDOUBLEIS96 0x00010000 +#define OPAL_ARCH_LONGDOUBLEIS128 0x00020000 +#define OPAL_ARCH_LDEXPSIZEISxx 0x000c0000 +#define OPAL_ARCH_LDEXPSIZEIS11 0x00000000 #define OPAL_ARCH_LDEXPSIZEIS15 0x00080000 -#define OPAL_ARCH_LDMANTDIGIS64 0x00400000 +#define OPAL_ARCH_LDMANTDIGISxx 0x00700000 /* b 0111 0000 */ +/* 53:b000 64:b001 105:b010 106:b011 107:b100 113:b101 */ +#define OPAL_ARCH_LDMANTDIGIS53 0x00000000 +#define OPAL_ARCH_LDMANTDIGIS64 0x00100000 #define OPAL_ARCH_LDMANTDIGIS105 0x00200000 -#define OPAL_ARCH_LDMANTDIGIS106 0x00600000 -#define OPAL_ARCH_LDMANTDIGIS107 0x00100000 +#define OPAL_ARCH_LDMANTDIGIS106 0x00300000 +#define OPAL_ARCH_LDMANTDIGIS107 0x00400000 #define OPAL_ARCH_LDMANTDIGIS113 0x00500000 #define OPAL_ARCH_LDISINTEL 0x00800000