Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
61 commits
Select commit Hold shift + click to select a range
5280b79
Add large bit_width unpack64 tests
AntoinePrv Sep 16, 2025
8c703a4
Handle 16bit unpacking generation
AntoinePrv Sep 3, 2025
38e4428
Use uint8_t* input for simd unpack
AntoinePrv Sep 16, 2025
b4b7a55
Gen: regenerate bpacking_simd
AntoinePrv Sep 16, 2025
bf4a7dd
Exclude Python codegen from doxygen
AntoinePrv Sep 16, 2025
8ed33f5
Make generic scalar unpacking codegen
AntoinePrv Sep 16, 2025
7e39954
Gen: regenerate scalar unpack function in single file
AntoinePrv Sep 16, 2025
e945669
Simplify scalar_unpack increments
AntoinePrv Sep 16, 2025
8c81858
Gen: regenerate bpacking_scalar
AntoinePrv Sep 16, 2025
c9a399f
Try: reinterpret cast
AntoinePrv Sep 18, 2025
76e0782
Revert: reinterpret cast
AntoinePrv Sep 18, 2025
2e93454
Simplify simd generator code
AntoinePrv Sep 18, 2025
c6bc870
Gen: regenerate simd files
AntoinePrv Sep 18, 2025
a703f7b
Use templated method in SimdUnpacker
AntoinePrv Sep 18, 2025
2fcf6f3
Gen: regenerate simd files
AntoinePrv Sep 18, 2025
2382cff
Slight improvement to SIMD codegen
AntoinePrv Sep 18, 2025
5747e95
Use template functions in scalar codegen and factor dispatch
AntoinePrv Sep 18, 2025
3eec943
Gen: regenerate unpack files
AntoinePrv Sep 18, 2025
d84bd5d
Try new simd scheme
AntoinePrv Sep 19, 2025
988ffc7
Fix template specialization
AntoinePrv Sep 19, 2025
8239a9e
Gen: regenerate unpack files
AntoinePrv Sep 19, 2025
51696fd
Only generate simd 32
AntoinePrv Sep 19, 2025
62db1b5
Gen: regenerate unpack files
AntoinePrv Sep 19, 2025
42440d3
Add SSE4.2 instantiation to bpacking
AntoinePrv Sep 19, 2025
62e2e8b
Try: new simd scheme
AntoinePrv Sep 19, 2025
d7a8e4d
WIP: new simd algo
AntoinePrv Sep 19, 2025
4ad8fc5
WIP
AntoinePrv Sep 22, 2025
e83222c
Struct unpacker for width and no generate null/full
AntoinePrv Sep 23, 2025
274f7e1
Gen: regenerate unpack files
AntoinePrv Sep 23, 2025
5fedf96
WIP simd unpack
AntoinePrv Sep 24, 2025
cb19ff4
Generate scalar 16 bit unpacking
AntoinePrv Sep 24, 2025
14babe6
Move scalar unpack functions to their own file
AntoinePrv Sep 24, 2025
36a9d98
Gen: regenerate unpack files
AntoinePrv Sep 24, 2025
61f7067
Test all width
AntoinePrv Sep 24, 2025
4fe0fd8
Add sse2 file
AntoinePrv Sep 24, 2025
ac0a7c7
Simplify bpacking files
AntoinePrv Sep 24, 2025
92584a0
Add simd 16 unpack functions
AntoinePrv Sep 24, 2025
009ee05
Gen: regenerate unpack files
AntoinePrv Sep 24, 2025
b3dcacd
Remove unpack16_avx512
AntoinePrv Sep 25, 2025
60bf97b
Gen: regenerate avx512 file
AntoinePrv Sep 25, 2025
87a209d
Add missing header
AntoinePrv Sep 25, 2025
d4f52f5
Add simd 64
AntoinePrv Sep 25, 2025
5faa90c
Gen: regenerate simd files
AntoinePrv Sep 25, 2025
d199d1a
Missing 64 impl
AntoinePrv Sep 25, 2025
9d36846
BENCHMARK ALL
AntoinePrv Sep 25, 2025
52007eb
Adjust used unpack functions
AntoinePrv Sep 25, 2025
5fecf21
use constexpr jump table
AntoinePrv Sep 25, 2025
2c26245
Reduce number of displayed tests
AntoinePrv Sep 25, 2025
225b031
Fix array CTAD
AntoinePrv Sep 25, 2025
86bc3c0
Revert "BENCHMARK ALL"
AntoinePrv Sep 25, 2025
973312e
Fix missing header guard
AntoinePrv Sep 25, 2025
63740ea
Use template for public unpack functions
AntoinePrv Sep 25, 2025
a03d398
Fix template instanciation declaration
AntoinePrv Sep 25, 2025
f1e7830
Use template for internal unpack functions
AntoinePrv Sep 26, 2025
d57a53b
Fix UB
AntoinePrv Sep 26, 2025
eda2210
Use unpack<uint16_t> in BitReader
AntoinePrv Sep 26, 2025
0a52c0c
Don't use unpack16
AntoinePrv Sep 26, 2025
615dd80
Add missing get_unpack_fn
AntoinePrv Sep 29, 2025
d123368
Refactor BitReader to accomodate unpack fn
AntoinePrv Sep 29, 2025
69c31bb
Use get_unpack_fn in RleBitPacker
AntoinePrv Sep 29, 2025
abf7629
Add missing unpack getter and dyn dispatch
AntoinePrv Sep 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/apidoc/Doxyfile
Original file line number Diff line number Diff line change
Expand Up @@ -1095,6 +1095,7 @@ EXCLUDE_PATTERNS = *-test.cc \
*test* \
*_generated.h \
*-benchmark.cc \
*_codegen.py \
*internal*

# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
Expand Down
9 changes: 4 additions & 5 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,8 @@ set(ARROW_UTIL_SRCS
util/bitmap_builders.cc
util/bitmap_ops.cc
util/bpacking.cc
util/bpacking_scalar.cc
util/bpacking_simd_min.cc
util/byte_size.cc
util/byte_stream_split_internal.cc
util/cancel.cc
Expand Down Expand Up @@ -533,11 +535,8 @@ set(ARROW_UTIL_SRCS

append_runtime_avx2_src(ARROW_UTIL_SRCS util/byte_stream_split_internal_avx2.cc)

append_runtime_avx2_src(ARROW_UTIL_SRCS util/bpacking_avx2.cc)
append_runtime_avx512_src(ARROW_UTIL_SRCS util/bpacking_avx512.cc)
if(ARROW_HAVE_NEON)
list(APPEND ARROW_UTIL_SRCS util/bpacking_neon.cc)
endif()
append_runtime_avx2_src(ARROW_UTIL_SRCS util/bpacking_simd_avx2.cc)
append_runtime_avx512_src(ARROW_UTIL_SRCS util/bpacking_simd_avx512.cc)

if(ARROW_WITH_BROTLI)
list(APPEND ARROW_UTIL_SRCS util/compression_brotli.cc)
Expand Down
59 changes: 44 additions & 15 deletions cpp/src/arrow/util/bit_stream_utils_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,34 @@ inline uint64_t ReadLittleEndianWord(const uint8_t* buffer, int bytes_remaining)
/// bytes in one read (e.g. encoded int).
class BitReader {
public:
/// Trait mapping a value type T to the unpack function type used to decode it.
///
/// The primary template is deliberately left undefined: one of the two partial
/// specializations below is always selected, depending on how `sizeof(T)`
/// compares to `sizeof(int)`.
template <typename T, typename Enable = void>
struct UnpackFnDetect;

/// Types narrower than `int` go through the `uint32_t` unpack function type.
template <typename T>
struct UnpackFnDetect<T, typename std::enable_if<(sizeof(T) < sizeof(int))>::type> {
  using type = internal::UnpackFn<uint32_t>;
};

/// Types at least as wide as `int` use the unpack function type of their
/// unsigned counterpart.
template <typename T>
struct UnpackFnDetect<T, typename std::enable_if<!(sizeof(T) < sizeof(int))>::type> {
  using type = internal::UnpackFn<std::make_unsigned_t<T>>;
};

/// The type for a function that can extract bit-packed integers.
///
/// Shorthand for `UnpackFnDetect<T>::type`, i.e. the unpack function type
/// selected for values of type T.
template <typename T>
using UnpackFn = typename UnpackFnDetect<T>::type;

/// Get the unpack function most appropriate for this type and bit width.
///
/// @param num_bits Bit width forwarded to `internal::get_unpack_fn`.
/// @return The `uint32_t` unpack function when T is narrower than `int`,
///   otherwise the unpack function for the unsigned counterpart of T.
template <typename T>
static UnpackFn<T> get_unpack_fn(int num_bits) {
  // The type selection here is intimately linked to the GetBatch implementation.
  if constexpr (sizeof(T) < sizeof(int)) {
    return internal::get_unpack_fn<uint32_t>(num_bits);
  } else {
    using Unsigned = std::make_unsigned_t<T>;
    return internal::get_unpack_fn<Unsigned>(num_bits);
  }
}

BitReader() noexcept = default;

/// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'.
Expand All @@ -148,6 +176,11 @@ class BitReader {
template <typename T>
int GetBatch(int num_bits, T* v, int batch_size);

/// Get a number of values from the buffer. Return the number of values actually read.
/// @param num_bits Bit width of each packed value; the definition debug-checks that
///   it does not exceed the bit size of T.
/// @param v Destination array receiving the decoded values.
/// @param batch_size Number of values requested.
/// @param unpack Function pointer to the unpack function for the correct bit width.
///   The three-argument overload passes `get_unpack_fn<T>(num_bits)` here.
template <typename T>
int GetBatch(int num_bits, T* v, int batch_size, UnpackFn<T> unpack);

/// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T
/// needs to be a little-endian native type and big enough to store
/// 'num_bytes'. The value is assumed to be byte-aligned so the stream will
Expand Down Expand Up @@ -297,7 +330,7 @@ inline bool BitReader::GetValue(int num_bits, T* v) {
}

template <typename T>
inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
int BitReader::GetBatch(int num_bits, T* v, int batch_size, UnpackFn<T> unpack) {
ARROW_DCHECK(buffer_ != NULL);
ARROW_DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8)) << "num_bits: " << num_bits;

Expand All @@ -323,19 +356,11 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
}
}

if (sizeof(T) == 4) {
// unpack for uint16_t not as fast as unpack for uint32_t + memcpy.
if constexpr (sizeof(T) >= sizeof(32)) {
int num_unpacked =
internal::unpack32(buffer + byte_offset, reinterpret_cast<uint32_t*>(v + i),
batch_size - i, num_bits);
i += num_unpacked;
byte_offset += num_unpacked * num_bits / 8;
} else if (sizeof(T) == 8 && num_bits > 32) {
// Use unpack64 only if num_bits is larger than 32
// TODO (ARROW-13677): improve the performance of internal::unpack64
// and remove the restriction of num_bits
int num_unpacked =
internal::unpack64(buffer + byte_offset, reinterpret_cast<uint64_t*>(v + i),
batch_size - i, num_bits);
unpack(buffer + byte_offset, reinterpret_cast<std::make_unsigned_t<T>*>(v + i),
batch_size - i);
i += num_unpacked;
byte_offset += num_unpacked * num_bits / 8;
} else {
Expand All @@ -345,8 +370,7 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
uint32_t unpack_buffer[buffer_size];
while (i < batch_size) {
int unpack_size = std::min(buffer_size, batch_size - i);
int num_unpacked =
internal::unpack32(buffer + byte_offset, unpack_buffer, unpack_size, num_bits);
int num_unpacked = unpack(buffer + byte_offset, unpack_buffer, unpack_size);
if (num_unpacked == 0) {
break;
}
Expand Down Expand Up @@ -380,6 +404,11 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
return batch_size;
}

template <typename T>
inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
return GetBatch(num_bits, v, batch_size, get_unpack_fn<T>(num_bits));
}

template <typename T>
inline bool BitReader::GetAligned(int num_bytes, T* v) {
if (ARROW_PREDICT_FALSE(num_bytes > static_cast<int>(sizeof(T)))) {
Expand Down
Loading
Loading