diff --git a/pp.c b/pp.c
index 5c39bbf540f2..4d7a6221e78d 100644
--- a/pp.c
+++ b/pp.c
@@ -6529,7 +6529,6 @@ PP(pp_unshift)
     return NORMAL;
 }
 
-
 PP_wrapped(pp_reverse, 0, 1)
 {
     dSP; dMARK;
@@ -6679,10 +6678,69 @@ PP_wrapped(pp_reverse, 0, 1)
                 }
             }
         } else {
+            STRLEN i = 0;
+            STRLEN j = len;
+            uint32_t u32_1, u32_2;
+            uint16_t u16_1, u16_2;
             char * outp= SvPVX(TARG);
-            const char *p = src + len;
-            while (p != src)
-                *outp++ = *--p;
+            /* Take a chunk of bytes from the front and from the
+             * back, reverse the bytes in each and swap the
+             * chunks over. This should have generally good
+             * performance but also is likely to be optimised
+             * into bswap instructions by the compiler.
+             */
+#ifdef HAS_QUAD
+            uint64_t u64_1, u64_2;
+            while (j - i >= 16) {
+                memcpy(&u64_1, src + j - 8, 8);
+                memcpy(&u64_2, src + i, 8);
+                u64_1 = _swab_64_(u64_1);
+                u64_2 = _swab_64_(u64_2);
+                memcpy(outp + j - 8, &u64_2, 8);
+                memcpy(outp + i, &u64_1, 8);
+                i += 8;
+                j -= 8;
+            }
+
+            if (j - i >= 8) {
+                memcpy(&u32_1, src + j - 4, 4);
+                memcpy(&u32_2, src + i, 4);
+                u32_1 = _swab_32_(u32_1);
+                u32_2 = _swab_32_(u32_2);
+                memcpy(outp + j - 4, &u32_2, 4);
+                memcpy(outp + i, &u32_1, 4);
+                i += 4;
+                j -= 4;
+            }
+#else
+            while (j - i >= 8) {
+                memcpy(&u32_1, src + j - 4, 4);
+                memcpy(&u32_2, src + i, 4);
+                u32_1 = _swab_32_(u32_1);
+                u32_2 = _swab_32_(u32_2);
+                memcpy(outp + j - 4, &u32_2, 4);
+                memcpy(outp + i, &u32_1, 4);
+                i += 4;
+                j -= 4;
+            }
+#endif
+            if (j - i >= 4) {
+                memcpy(&u16_1, src + j - 2, 2);
+                memcpy(&u16_2, src + i, 2);
+                u16_1 = _swab_16_(u16_1);
+                u16_2 = _swab_16_(u16_2);
+                memcpy(outp + j - 2, &u16_2, 2);
+                memcpy(outp + i, &u16_1, 2);
+                i += 2;
+                j -= 2;
+            }
+
+            /* Swap any remaining bytes one by one. */
+            while (i < j) {
+                outp[i] = src[j - 1];
+                outp[j - 1] = src[i];
+                i++; j--;
+            }
         }
         RETURN;
     }
@@ -6695,8 +6753,8 @@ PP_wrapped(pp_reverse, 0, 1)
 
     if (len > 1) {
         /* The traditional way, operate on the current byte buffer */
-        char *down;
         if (DO_UTF8(TARG)) {    /* first reverse each character */
+            char *down;
             U8* s = (U8*)SvPVX(TARG);
             const U8* send = (U8*)(s + len);
             while (s < send) {
@@ -6720,11 +6778,64 @@ PP_wrapped(pp_reverse, 0, 1)
             }
             up = SvPVX(TARG);
         }
-        down = SvPVX(TARG) + len - 1;
-        while (down > up) {
-            const char tmp = *up;
-            *up++ = *down;
-            *down-- = tmp;
-        }
+        STRLEN i = 0;
+        STRLEN j = len;
+        uint32_t u32_1, u32_2;
+        uint16_t u16_1, u16_2;
+        /* Reverse the buffer in place, in chunks where possible */
+#ifdef HAS_QUAD
+        uint64_t u64_1, u64_2;
+        while (j - i >= 16) {
+            memcpy(&u64_1, up + j - 8, 8);
+            memcpy(&u64_2, up + i, 8);
+            u64_1 = _swab_64_(u64_1);
+            u64_2 = _swab_64_(u64_2);
+            memcpy(up + j - 8, &u64_2, 8);
+            memcpy(up + i, &u64_1, 8);
+            i += 8;
+            j -= 8;
+        }
+
+        if (j - i >= 8) {
+            memcpy(&u32_1, up + j - 4, 4);
+            memcpy(&u32_2, up + i, 4);
+            u32_1 = _swab_32_(u32_1);
+            u32_2 = _swab_32_(u32_2);
+            memcpy(up + j - 4, &u32_2, 4);
+            memcpy(up + i, &u32_1, 4);
+            i += 4;
+            j -= 4;
+        }
+#else
+        while (j - i >= 8) {
+            memcpy(&u32_1, up + j - 4, 4);
+            memcpy(&u32_2, up + i, 4);
+            u32_1 = _swab_32_(u32_1);
+            u32_2 = _swab_32_(u32_2);
+            memcpy(up + j - 4, &u32_2, 4);
+            memcpy(up + i, &u32_1, 4);
+            i += 4;
+            j -= 4;
+        }
+#endif
+        if (j - i >= 4) {
+            memcpy(&u16_1, up + j - 2, 2);
+            memcpy(&u16_2, up + i, 2);
+            u16_1 = _swab_16_(u16_1);
+            u16_2 = _swab_16_(u16_2);
+            memcpy(up + j - 2, &u16_2, 2);
+            memcpy(up + i, &u16_1, 2);
+            i += 2;
+            j -= 2;
+        }
+
+        /* Finally, swap any remaining bytes one-by-one. */
+        while (i < j) {
+            unsigned char tmp = up[i];
+            up[i] = up[j - 1];
+            up[j - 1] = tmp;
+            i++;
+            j--;
+        }
     }
     (void)SvPOK_only_UTF8(TARG);