From 749d0f8c9bcc1511ce6f6e5d40f4eba92f0b2ebf Mon Sep 17 00:00:00 2001 From: MaxGraey Date: Wed, 17 Jun 2020 00:02:29 +0300 Subject: [PATCH 01/11] optimize zero / one bit counts --- src/support/bits.cpp | 28 +++++++++++----------------- src/support/bits.h | 2 +- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/src/support/bits.cpp b/src/support/bits.cpp index 2489a02498c..1dae63ecb05 100644 --- a/src/support/bits.cpp +++ b/src/support/bits.cpp @@ -30,7 +30,7 @@ template<> int PopCount(uint8_t v) { } template<> int PopCount(uint16_t v) { - return PopCount((uint8_t)(v & 0xff)) + PopCount((uint8_t)(v >> 8)); + return PopCount((uint8_t)(v & 0xFF)) + PopCount((uint8_t)(v >> 8)); } template<> int PopCount(uint32_t v) { @@ -42,7 +42,7 @@ template<> int PopCount(uint32_t v) { } template<> int PopCount(uint64_t v) { - return PopCount((uint32_t)v) + PopCount((uint32_t)(v >> 32)); + return PopCount((uint32_t)v) + (v >> 32 ? PopCount((uint32_t)(v >> 32)) : 0); } template<> uint32_t BitReverse(uint32_t v) { @@ -54,21 +54,6 @@ template<> uint32_t BitReverse(uint32_t v) { return v; } -template<> int CountTrailingZeroes(uint32_t v) { - // See Stanford bithacks, count the consecutive zero bits (trailing) on the - // right with multiply and lookup: - // http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup - static const uint8_t tbl[32] = {0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, - 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, - 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; - return v ? (int)tbl[((uint32_t)((v & -v) * 0x077CB531U)) >> 27] : 32; -} - -template<> int CountTrailingZeroes(uint64_t v) { - return (uint32_t)v ? 
CountTrailingZeroes((uint32_t)v) - : 32 + CountTrailingZeroes((uint32_t)(v >> 32)); -} - template<> int CountLeadingZeroes(uint32_t v) { // See Stanford bithacks, find the log base 2 of an N-bit integer in // O(lg(N)) operations with multiply and lookup: @@ -89,6 +74,15 @@ template<> int CountLeadingZeroes(uint64_t v) { : 32 + CountLeadingZeroes((uint32_t)v); } +template<> int CountTrailingZeroes(uint32_t v) { + return 32 - CountLeadingZeroes(~v & (v - 1)); +} + +template<> int CountTrailingZeroes(uint64_t v) { + return (uint32_t)v ? CountTrailingZeroes((uint32_t)v) + : 32 + CountTrailingZeroes((uint32_t)(v >> 32)); +} + uint32_t Log2(uint32_t v) { switch (v) { default: diff --git a/src/support/bits.h b/src/support/bits.h index f2241bea8b4..881d135ee2a 100644 --- a/src/support/bits.h +++ b/src/support/bits.h @@ -65,7 +65,7 @@ template<typename T> int CountTrailingZeroes(T v) { template<typename T> int CountLeadingZeroes(T v) { return CountLeadingZeroes(typename std::make_unsigned<T>::type(v)); } -template<typename T> bool IsPowerOf2(T v) { return v != 0 && PopCount(v) == 1; } +template<typename T> bool IsPowerOf2(T v) { return v != 0 && (v & (v - 1)) == 0; } template<typename T, typename U> inline static T RotateLeft(T val, U count) { T mask = sizeof(T) * CHAR_BIT - 1; From f0f1d781e301f9efd455fa9b6a9381676866cc23 Mon Sep 17 00:00:00 2001 From: MaxGraey Date: Wed, 17 Jun 2020 00:12:36 +0300 Subject: [PATCH 02/11] lint --- src/support/bits.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/support/bits.h b/src/support/bits.h index 881d135ee2a..7d32c22edcf 100644 --- a/src/support/bits.h +++ b/src/support/bits.h @@ -65,7 +65,9 @@ template<typename T> int CountTrailingZeroes(T v) { template<typename T> int CountLeadingZeroes(T v) { return CountLeadingZeroes(typename std::make_unsigned<T>::type(v)); } -template<typename T> bool IsPowerOf2(T v) { return v != 0 && (v & (v - 1)) == 0; } +template<typename T> bool IsPowerOf2(T v) { + return v != 0 && (v & (v - 1)) == 0; +} template<typename T, typename U> inline static T RotateLeft(T val, U count) { T mask = sizeof(T) * CHAR_BIT - 1; From 
cd472775157baab45b4dbcfa3e95a0a85d3d7e93 Mon Sep 17 00:00:00 2001 From: MaxGraey Date: Wed, 17 Jun 2020 01:00:50 +0300 Subject: [PATCH 03/11] revert original CountTrailingZeroes implementation as more efficient --- src/support/bits.cpp | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/support/bits.cpp b/src/support/bits.cpp index 1dae63ecb05..85072bb4bbd 100644 --- a/src/support/bits.cpp +++ b/src/support/bits.cpp @@ -54,6 +54,21 @@ template<> uint32_t BitReverse(uint32_t v) { return v; } +template<> int CountTrailingZeroes(uint32_t v) { + // See Stanford bithacks, count the consecutive zero bits (trailing) on the + // right with multiply and lookup: + // http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup + static const uint8_t tbl[32] = {0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, + 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, + 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; + return v ? (int)tbl[((uint32_t)((v & -v) * 0x077CB531U)) >> 27] : 32; +} + +template<> int CountTrailingZeroes(uint64_t v) { + return (uint32_t)v ? CountTrailingZeroes((uint32_t)v) + : 32 + CountTrailingZeroes((uint32_t)(v >> 32)); +} + template<> int CountLeadingZeroes(uint32_t v) { // See Stanford bithacks, find the log base 2 of an N-bit integer in // O(lg(N)) operations with multiply and lookup: @@ -74,15 +89,6 @@ template<> int CountLeadingZeroes(uint64_t v) { : 32 + CountLeadingZeroes((uint32_t)v); } -template<> int CountTrailingZeroes(uint32_t v) { - return 32 - CountLeadingZeroes(~v & (v - 1)); -} - -template<> int CountTrailingZeroes(uint64_t v) { - return (uint32_t)v ? 
CountTrailingZeroes((uint32_t)v) - : 32 + CountTrailingZeroes((uint32_t)(v >> 32)); -} - uint32_t Log2(uint32_t v) { switch (v) { default: From c4049c3d2f6d481396751bf6b06f4c6058c0b145 Mon Sep 17 00:00:00 2001 From: MaxGraey Date: Wed, 17 Jun 2020 01:34:36 +0300 Subject: [PATCH 04/11] simplify Pow2 --- src/support/bits.cpp | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/src/support/bits.cpp b/src/support/bits.cpp index 85072bb4bbd..a6f8fcdfe0b 100644 --- a/src/support/bits.cpp +++ b/src/support/bits.cpp @@ -109,22 +109,7 @@ uint32_t Log2(uint32_t v) { } uint32_t Pow2(uint32_t v) { - switch (v) { - case 0: - return 1; - case 1: - return 2; - case 2: - return 4; - case 3: - return 8; - case 4: - return 16; - case 5: - return 32; - default: - return 1 << v; - } + return 1 << v; } } // namespace wasm From 5fc72c62a6584a9dc7dfd112aad15d8db549a6ee Mon Sep 17 00:00:00 2001 From: MaxGraey Date: Wed, 17 Jun 2020 02:09:25 +0300 Subject: [PATCH 05/11] lint --- src/support/bits.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/support/bits.cpp b/src/support/bits.cpp index a6f8fcdfe0b..b0f0c5f4d90 100644 --- a/src/support/bits.cpp +++ b/src/support/bits.cpp @@ -108,8 +108,6 @@ uint32_t Log2(uint32_t v) { } } -uint32_t Pow2(uint32_t v) { - return 1 << v; -} +uint32_t Pow2(uint32_t v) { return 1 << v; } } // namespace wasm From 29a8c8cbc5f1045d43f52a5e89532a460354ed2b Mon Sep 17 00:00:00 2001 From: MaxGraey Date: Wed, 17 Jun 2020 13:37:25 +0300 Subject: [PATCH 06/11] use builtin bit methods if possible --- src/support/bits.cpp | 60 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/src/support/bits.cpp b/src/support/bits.cpp index b0f0c5f4d90..2b45b88fa7c 100644 --- a/src/support/bits.cpp +++ b/src/support/bits.cpp @@ -22,27 +22,43 @@ namespace wasm { template<> int PopCount(uint8_t v) { +#if __has_builtin(__builtin_popcount) || defined(__GNUC__) + return 
__builtin_popcount(v); +#else // Small table lookup. static const uint8_t tbl[32] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5}; return tbl[v & 0xf] + tbl[v >> 4]; +#endif } template<> int PopCount(uint16_t v) { +#if __has_builtin(__builtin_popcount) || defined(__GNUC__) + return __builtin_popcount(v); +#else return PopCount((uint8_t)(v & 0xFF)) + PopCount((uint8_t)(v >> 8)); +#endif } template<> int PopCount(uint32_t v) { +#if __has_builtin(__builtin_popcount) || defined(__GNUC__) + return __builtin_popcount(v); +#else // See Stanford bithacks, counting bits set in parallel, "best method": // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel v = v - ((v >> 1) & 0x55555555); v = (v & 0x33333333) + ((v >> 2) & 0x33333333); return (((v + (v >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; +#endif } template<> int PopCount(uint64_t v) { +#if __has_builtin(__builtin_popcountll) || defined(__GNUC__) + return __builtin_popcountll(v); +#else return PopCount((uint32_t)v) + (v >> 32 ? PopCount((uint32_t)(v >> 32)) : 0); +#endif } template<> uint32_t BitReverse(uint32_t v) { @@ -55,21 +71,50 @@ template<> uint32_t BitReverse(uint32_t v) { } template<> int CountTrailingZeroes(uint32_t v) { + if (v == 0) + return 32; +#if __has_builtin(__builtin_ctz) || defined(__GNUC__) + return __builtin_ctz(v); +#elif defined(_MSC_VER) + unsigned long count; + _BitScanForward(&count, v); + return count; +#else // See Stanford bithacks, count the consecutive zero bits (trailing) on the // right with multiply and lookup: // http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup static const uint8_t tbl[32] = {0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; - return v ? 
(int)tbl[((uint32_t)((v & -v) * 0x077CB531U)) >> 27] : 32; + return (int)tbl[((uint32_t)((v & -v) * 0x077CB531U)) >> 27]; +#endif } template<> int CountTrailingZeroes(uint64_t v) { + if (v == 0) + return 64; +#if __has_builtin(__builtin_ctzll) || defined(__GNUC__) + return __builtin_ctzll(v); +#elif defined(_MSC_VER) + unsigned long count; + _BitScanForward64(&count, v); + return count; +#else return (uint32_t)v ? CountTrailingZeroes((uint32_t)v) : 32 + CountTrailingZeroes((uint32_t)(v >> 32)); +#endif } template<> int CountLeadingZeroes(uint32_t v) { + if (v == 0) + return 32; +#if __has_builtin(__builtin_clz) || defined(__GNUC__) + return __builtin_clz(v); +#elif defined(_MSC_VER) + unsigned long count; + _BitScanReverse(&count, v); + return count; +#else // See Stanford bithacks, find the log base 2 of an N-bit integer in // O(lg(N)) operations with multiply and lookup: // http://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn @@ -81,12 +126,23 @@ template<> int CountLeadingZeroes(uint32_t v) { v = v | (v >> 4); v = v | (v >> 8); v = v | (v >> 16); - return v ? (int)tbl[((uint32_t)(v * 0x07C4ACDDU)) >> 27] : 32; + return (int)tbl[((uint32_t)(v * 0x07C4ACDDU)) >> 27]; +#endif } template<> int CountLeadingZeroes(uint64_t v) { + if (v == 0) + return 64; +#if __has_builtin(__builtin_clzll) || defined(__GNUC__) + return __builtin_clzll(v); +#elif defined(_MSC_VER) + unsigned long count; + _BitScanReverse64(&count, v); + return count; +#else return v >> 32 ? 
CountLeadingZeroes((uint32_t)(v >> 32)) : 32 + CountLeadingZeroes((uint32_t)v); +#endif } uint32_t Log2(uint32_t v) { From e7b5088b03ed781bd5a7be68237a60babc872656 Mon Sep 17 00:00:00 2001 From: MaxGraey Date: Wed, 17 Jun 2020 13:40:55 +0300 Subject: [PATCH 07/11] lint --- src/support/bits.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/support/bits.cpp b/src/support/bits.cpp index 2b45b88fa7c..367d5f4c235 100644 --- a/src/support/bits.cpp +++ b/src/support/bits.cpp @@ -71,8 +71,9 @@ template<> uint32_t BitReverse(uint32_t v) { } template<> int CountTrailingZeroes(uint32_t v) { - if (v == 0) + if (v == 0) { return 32; + } #if __has_builtin(__builtin_ctz) || defined(__GNUC__) return __builtin_ctz(v); #elif defined(_MSC_VER) @@ -91,8 +92,9 @@ template<> int CountTrailingZeroes(uint32_t v) { } template<> int CountTrailingZeroes(uint64_t v) { - if (v == 0) + if (v == 0) { return 64; + } #if __has_builtin(__builtin_ctzll) || defined(__GNUC__) return __builtin_ctzll(v); #elif defined(_MSC_VER) @@ -106,8 +108,9 @@ template<> int CountTrailingZeroes(uint64_t v) { } template<> int CountLeadingZeroes(uint32_t v) { - if (v == 0) + if (v == 0) { return 32; + } #if __has_builtin(__builtin_clz) || defined(__GNUC__) return __builtin_clz(v); #elif defined(_MSC_VER) @@ -131,8 +134,9 @@ template<> int CountLeadingZeroes(uint32_t v) { } template<> int CountLeadingZeroes(uint64_t v) { - if (v == 0) + if (v == 0) { return 64; + } #if __has_builtin(__builtin_clzll) || defined(__GNUC__) return __builtin_clzll(v); #elif defined(_MSC_VER) From d81ce1a664e8daed6c5f7fbbc4002e99d09cdac9 Mon Sep 17 00:00:00 2001 From: MaxGraey Date: Wed, 17 Jun 2020 13:51:37 +0300 Subject: [PATCH 08/11] explicit cast to int for MSC --- src/support/bits.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/support/bits.cpp b/src/support/bits.cpp index 367d5f4c235..90275173c5b 100644 --- a/src/support/bits.cpp +++ b/src/support/bits.cpp @@ -79,7 
+79,7 @@ template<> int CountTrailingZeroes(uint32_t v) { #elif defined(_MSC_VER) unsigned long count; _BitScanForward(&count, v); - return count; + return (int)count; #else // See Stanford bithacks, count the consecutive zero bits (trailing) on the // right with multiply and lookup: @@ -100,7 +100,7 @@ template<> int CountTrailingZeroes(uint64_t v) { #elif defined(_MSC_VER) unsigned long count; _BitScanForward64(&count, v); - return count; + return (int)count; #else return (uint32_t)v ? CountTrailingZeroes((uint32_t)v) : 32 + CountTrailingZeroes((uint32_t)(v >> 32)); @@ -116,7 +116,7 @@ template<> int CountLeadingZeroes(uint32_t v) { #elif defined(_MSC_VER) unsigned long count; _BitScanReverse(&count, v); - return count; + return (int)count; #else // See Stanford bithacks, find the log base 2 of an N-bit integer in // O(lg(N)) operations with multiply and lookup: @@ -142,7 +142,7 @@ template<> int CountLeadingZeroes(uint64_t v) { #elif defined(_MSC_VER) unsigned long count; _BitScanReverse64(&count, v); - return count; + return (int)count; #else return v >> 32 ? CountLeadingZeroes((uint32_t)(v >> 32)) : 32 + CountLeadingZeroes((uint32_t)v); From 78633e24c3f850320f9a2bd7ce818de994b05723 Mon Sep 17 00:00:00 2001 From: MaxGraey Date: Wed, 17 Jun 2020 14:10:23 +0300 Subject: [PATCH 09/11] use builtin popcnt for MSC as well --- src/support/bits.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/support/bits.cpp b/src/support/bits.cpp index 90275173c5b..884325e007b 100644 --- a/src/support/bits.cpp +++ b/src/support/bits.cpp @@ -19,31 +19,33 @@ #include "../compiler-support.h" #include "support/utilities.h" +#ifdef _MSC_VER +#include <intrin.h> +#define __builtin_popcount __popcnt +#define __builtin_popcountll __popcnt64 +#endif + namespace wasm { template<> int PopCount(uint8_t v) { -#if __has_builtin(__builtin_popcount) || defined(__GNUC__) - return __builtin_popcount(v); -#else // Small table lookup. 
static const uint8_t tbl[32] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5}; return tbl[v & 0xf] + tbl[v >> 4]; -#endif } template<> int PopCount(uint16_t v) { -#if __has_builtin(__builtin_popcount) || defined(__GNUC__) - return __builtin_popcount(v); +#if __has_builtin(__builtin_popcount) || defined(__GNUC__) || defined(_MSC_VER) + return (int)__builtin_popcount(v); #else return PopCount((uint8_t)(v & 0xFF)) + PopCount((uint8_t)(v >> 8)); #endif } template<> int PopCount(uint32_t v) { -#if __has_builtin(__builtin_popcount) || defined(__GNUC__) - return __builtin_popcount(v); +#if __has_builtin(__builtin_popcount) || defined(__GNUC__) || defined(_MSC_VER) + return (int)__builtin_popcount(v); #else // See Stanford bithacks, counting bits set in parallel, "best method": // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel @@ -54,8 +56,8 @@ template<> int PopCount(uint32_t v) { } template<> int PopCount(uint64_t v) { -#if __has_builtin(__builtin_popcountll) || defined(__GNUC__) - return __builtin_popcountll(v); +#if __has_builtin(__builtin_popcount) || defined(__GNUC__) || defined(_MSC_VER) + return (int)__builtin_popcountll(v); #else return PopCount((uint32_t)v) + (v >> 32 ? 
PopCount((uint32_t)(v >> 32)) : 0); #endif From 15bff4a5d631dd9b9fccb30043e78f6caf72b237 Mon Sep 17 00:00:00 2001 From: MaxGraey Date: Wed, 17 Jun 2020 15:35:16 +0300 Subject: [PATCH 10/11] casting to unsigned for value in RotateLeft / RotateRight --- src/support/bits.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/support/bits.h b/src/support/bits.h index 7d32c22edcf..bd91fdec659 100644 --- a/src/support/bits.h +++ b/src/support/bits.h @@ -70,14 +70,16 @@ template<typename T> bool IsPowerOf2(T v) { } template<typename T, typename U> inline static T RotateLeft(T val, U count) { - T mask = sizeof(T) * CHAR_BIT - 1; + auto value = typename std::make_unsigned<T>::type(val); + U mask = sizeof(T) * CHAR_BIT - 1; count &= mask; - return (val << count) | (val >> (-count & mask)); + return (value << count) | (value >> (-count & mask)); } template<typename T, typename U> inline static T RotateRight(T val, U count) { - T mask = sizeof(T) * CHAR_BIT - 1; + auto value = typename std::make_unsigned<T>::type(val); + U mask = sizeof(T) * CHAR_BIT - 1; count &= mask; - return (val >> count) | (val << (-count & mask)); + return (value >> count) | (value << (-count & mask)); } extern uint32_t Log2(uint32_t v); From abcc646b7a93c60a042064b9550f263d118d6252 Mon Sep 17 00:00:00 2001 From: MaxGraey Date: Wed, 17 Jun 2020 20:24:53 +0300 Subject: [PATCH 11/11] revert changes for popcnt fallback --- src/support/bits.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/support/bits.cpp b/src/support/bits.cpp index 884325e007b..f57a6d0e86e 100644 --- a/src/support/bits.cpp +++ b/src/support/bits.cpp @@ -59,7 +59,7 @@ template<> int PopCount(uint64_t v) { #if __has_builtin(__builtin_popcount) || defined(__GNUC__) || defined(_MSC_VER) return (int)__builtin_popcountll(v); #else - return PopCount((uint32_t)v) + (v >> 32 ? PopCount((uint32_t)(v >> 32)) : 0); + return PopCount((uint32_t)v) + PopCount((uint32_t)(v >> 32)); #endif }