Skip to content

Optimize bit count polyfills #2914

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jun 18, 2020
87 changes: 66 additions & 21 deletions src/support/bits.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@
#include "../compiler-support.h"
#include "support/utilities.h"

#ifdef _MSC_VER
#include <intrin.h>
#define __builtin_popcount __popcnt
#define __builtin_popcountll __popcnt64
#endif

namespace wasm {

template<> int PopCount<uint8_t>(uint8_t v) {
Expand All @@ -30,19 +36,31 @@ template<> int PopCount<uint8_t>(uint8_t v) {
}

template<> int PopCount<uint16_t>(uint16_t v) {
return PopCount((uint8_t)(v & 0xff)) + PopCount((uint8_t)(v >> 8));
#if __has_builtin(__builtin_popcount) || defined(__GNUC__) || defined(_MSC_VER)
return (int)__builtin_popcount(v);
#else
return PopCount((uint8_t)(v & 0xFF)) + PopCount((uint8_t)(v >> 8));
#endif
}

template<> int PopCount<uint32_t>(uint32_t v) {
#if __has_builtin(__builtin_popcount) || defined(__GNUC__) || defined(_MSC_VER)
return (int)__builtin_popcount(v);
#else
// See Stanford bithacks, counting bits set in parallel, "best method":
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
v = v - ((v >> 1) & 0x55555555);
v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
return (((v + (v >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
#endif
}

template<> int PopCount<uint64_t>(uint64_t v) {
#if __has_builtin(__builtin_popcount) || defined(__GNUC__) || defined(_MSC_VER)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use __has_builtin(__builtin_popcount) instead __has_builtin(__builtin_popcountll) due to clang-format forcing line terminator and carry defined(_MSC_VER) to new line and this looks weird.

return (int)__builtin_popcountll(v);
#else
return PopCount((uint32_t)v) + PopCount((uint32_t)(v >> 32));
#endif
}

template<> uint32_t BitReverse<uint32_t>(uint32_t v) {
Expand All @@ -55,21 +73,53 @@ template<> uint32_t BitReverse<uint32_t>(uint32_t v) {
}

template<> int CountTrailingZeroes<uint32_t>(uint32_t v) {
if (v == 0) {
return 32;
}
#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
return __builtin_ctz(v);
#elif defined(_MSC_VER)
unsigned long count;
_BitScanForward(&count, v);
return (int)count;
#else
// See Stanford bithacks, count the consecutive zero bits (trailing) on the
// right with multiply and lookup:
// http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup
static const uint8_t tbl[32] = {0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20,
15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19,
16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
return v ? (int)tbl[((uint32_t)((v & -v) * 0x077CB531U)) >> 27] : 32;
return (int)tbl[((uint32_t)((v & -v) * 0x077CB531U)) >> 27];
#endif
}

template<> int CountTrailingZeroes<uint64_t>(uint64_t v) {
if (v == 0) {
return 64;
}
#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
return __builtin_ctzll(v);
#elif defined(_MSC_VER)
unsigned long count;
_BitScanForward64(&count, v);
return (int)count;
#else
return (uint32_t)v ? CountTrailingZeroes((uint32_t)v)
: 32 + CountTrailingZeroes((uint32_t)(v >> 32));
#endif
}

template<> int CountLeadingZeroes<uint32_t>(uint32_t v) {
if (v == 0) {
return 32;
}
#if __has_builtin(__builtin_clz) || defined(__GNUC__)
return __builtin_clz(v);
#elif defined(_MSC_VER)
unsigned long count;
_BitScanReverse(&count, v);
return (int)count;
#else
// See Stanford bithacks, find the log base 2 of an N-bit integer in
// O(lg(N)) operations with multiply and lookup:
// http://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
Expand All @@ -81,12 +131,24 @@ template<> int CountLeadingZeroes<uint32_t>(uint32_t v) {
v = v | (v >> 4);
v = v | (v >> 8);
v = v | (v >> 16);
return v ? (int)tbl[((uint32_t)(v * 0x07C4ACDDU)) >> 27] : 32;
return (int)tbl[((uint32_t)(v * 0x07C4ACDDU)) >> 27];
#endif
}

template<> int CountLeadingZeroes<uint64_t>(uint64_t v) {
if (v == 0) {
return 64;
}
#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
return __builtin_clzll(v);
#elif defined(_MSC_VER)
unsigned long count;
_BitScanReverse64(&count, v);
return (int)count;
#else
return v >> 32 ? CountLeadingZeroes((uint32_t)(v >> 32))
: 32 + CountLeadingZeroes((uint32_t)v);
#endif
}

uint32_t Log2(uint32_t v) {
Expand All @@ -108,23 +170,6 @@ uint32_t Log2(uint32_t v) {
}
}

uint32_t Pow2(uint32_t v) {
switch (v) {
case 0:
return 1;
case 1:
return 2;
case 2:
return 4;
case 3:
return 8;
case 4:
return 16;
case 5:
return 32;
default:
return 1 << v;
}
}
uint32_t Pow2(uint32_t v) { return 1 << v; }

} // namespace wasm
14 changes: 9 additions & 5 deletions src/support/bits.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,17 +65,21 @@ template<typename T> int CountTrailingZeroes(T v) {
template<typename T> int CountLeadingZeroes(T v) {
return CountLeadingZeroes(typename std::make_unsigned<T>::type(v));
}
template<typename T> bool IsPowerOf2(T v) { return v != 0 && PopCount(v) == 1; }
template<typename T> bool IsPowerOf2(T v) {
return v != 0 && (v & (v - 1)) == 0;
}
Comment on lines +68 to +70
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice 👍 I like that this lets us delete code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

usually popcnt(x) == 1 (btw it include x != 0 case also) faster but only if we have native popcnt support. Alos LLVM & GCC smart enought to replace pattern above to popcnt(x) == 1 if it possible


template<typename T, typename U> inline static T RotateLeft(T val, U count) {
T mask = sizeof(T) * CHAR_BIT - 1;
auto value = typename std::make_unsigned<T>::type(val);
Copy link
Contributor Author

@MaxGraey MaxGraey Jun 17, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pretty important cast to unsigned. Otherwise not LLVM nor GCC can fold this to single rol / rot op

U mask = sizeof(T) * CHAR_BIT - 1;
count &= mask;
return (val << count) | (val >> (-count & mask));
return (value << count) | (value >> (-count & mask));
}
template<typename T, typename U> inline static T RotateRight(T val, U count) {
T mask = sizeof(T) * CHAR_BIT - 1;
auto value = typename std::make_unsigned<T>::type(val);
U mask = sizeof(T) * CHAR_BIT - 1;
count &= mask;
return (val >> count) | (val << (-count & mask));
return (value >> count) | (value << (-count & mask));
}

extern uint32_t Log2(uint32_t v);
Expand Down