Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cpp/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,8 @@
"CMAKE_CXX_COMPILER": "clang++",
"ARROW_IPC": "ON",
"ARROW_PARQUET": "ON",
"ARROW_FUZZING": "ON"
"ARROW_FUZZING": "ON",
"ARROW_WITH_SNAPPY": "ON"
}
},
{
Expand Down
16 changes: 10 additions & 6 deletions cpp/src/arrow/util/rle_encoding_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -657,13 +657,14 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const
const auto header_bytes = bit_util::ParseLeadingLEB128(data_, kMaxSize, &run_len_type);

if (ARROW_PREDICT_FALSE(header_bytes == 0)) {
// Malfomrmed LEB128 data
// Malformed LEB128 data
return {0, ControlFlow::Break};
}

const bool is_bit_packed = run_len_type & 1;
const uint32_t count = run_len_type >> 1;
if (is_bit_packed) {
// Bit-packed run
constexpr auto kMaxCount = bit_util::CeilDiv(internal::max_size_for_v<rle_size_t>, 8);
if (ARROW_PREDICT_FALSE(count == 0 || count > kMaxCount)) {
// Illegal number of encoded values
Expand All @@ -672,17 +673,21 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const

ARROW_DCHECK_LT(static_cast<uint64_t>(count) * 8,
internal::max_size_for_v<rle_size_t>);
// Count Already divided by 8 for byte size calculations
const auto bytes_read = header_bytes + static_cast<int64_t>(count) * value_bit_width_;
if (ARROW_PREDICT_FALSE(bytes_read > data_size_)) {
// Bit-packed run would overflow data buffer
return {0, ControlFlow::Break};
}
const auto values_count = static_cast<rle_size_t>(count * 8);
// Count Already divided by 8
const auto bytes_read =
header_bytes + static_cast<rle_size_t>(count) * value_bit_width_;

auto control = handler.OnBitPackedRun(
BitPackedRun(data_ + header_bytes, values_count, value_bit_width_));

return {bytes_read, control};
return {static_cast<rle_size_t>(bytes_read), control};
}

// RLE run
if (ARROW_PREDICT_FALSE(count == 0)) {
// Illegal number of encoded values
return {0, ControlFlow::Break};
Expand Down Expand Up @@ -1079,7 +1084,6 @@ auto RleBitPackedDecoder<T>::GetSpaced(Converter converter,
// There may be remaining null if they are not greedily filled by either decoder calls
check_and_handle_fully_null_remaining();

ARROW_DCHECK(batch.is_done() || exhausted());
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This check could trigger if the RLE-bit-packed data is invalid (for example a run of invalid size). @AntoinePrv

return batch.total_read();
}

Expand Down
7 changes: 5 additions & 2 deletions cpp/src/parquet/decoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2082,9 +2082,12 @@ class DeltaByteArrayDecoderImpl : public TypedDecoderImpl<DType> {
int64_t valid_bits_offset,
typename EncodingTraits<DType>::Accumulator* out,
int* out_num_values) {
std::vector<ByteArray> values(num_values);
std::vector<ByteArray> values(num_values - null_count);
const int num_valid_values = GetInternal(values.data(), num_values - null_count);
DCHECK_EQ(num_values - null_count, num_valid_values);
if (ARROW_PREDICT_FALSE(num_values - null_count != num_valid_values)) {
throw ParquetException("Expected to decode ", num_values - null_count,
" values, but decoded ", num_valid_values, " values.");
}

auto visit_binary_helper = [&](auto* helper) {
auto values_ptr = reinterpret_cast<const ByteArray*>(values.data());
Expand Down
Loading