From 5a0a2dbf817b4cbbb7e42d5e6dc3a123b620908e Mon Sep 17 00:00:00 2001 From: furidosu <86096478+furidosu@users.noreply.github.com> Date: Wed, 24 Sep 2025 06:47:50 +0800 Subject: [PATCH 1/5] Add parquet.hexpat for Apache Parquet data file > Apache Parquet is an open source, column-oriented data file format designed for efficient data storage and retrieval. > -- https://parquet.apache.org --- patterns/parquet.hexpat | 737 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 737 insertions(+) create mode 100644 patterns/parquet.hexpat diff --git a/patterns/parquet.hexpat b/patterns/parquet.hexpat new file mode 100644 index 00000000..dd8a0a27 --- /dev/null +++ b/patterns/parquet.hexpat @@ -0,0 +1,737 @@ +/* +Apache Parquet File Format +With help from Claude AI + +Known Limits: +- Not all metadata fields named in Pattern Data +- DataPageHeaderV2 not supported + +References: +https://parquet.apache.org/docs/file-format/ +https://issues.apache.org/jira/secure/attachment/12399869/compact-proto-spec-2.txt +https://issues.apache.org/jira/secure/attachment/12399879/thrift-110-v12.patch +https://raw.githubusercontent.com/apache/parquet-format/refs/heads/master/src/main/thrift/parquet.thrift +*/ + +#pragma description Apache Parquet File Format +#pragma endian little +#pragma MIME application/x-thrift-compact + +import std.mem; +import std.sys; +import std.core; + +s16 last_field_id = 0; +std::mem::Section last_field_id_stack = + std::mem::create_section("last_field_id_stack"); +u16 last_field_id_stack_size = 0 [[export]]; // Should be 0 at end of parse +u16 last_field_id_stack_size_max = 0; +fn push_last_field_id() { + s16 last_field_id_stack_top @ + last_field_id_stack_size * sizeof(s16) in last_field_id_stack; + last_field_id_stack_top = last_field_id; + last_field_id_stack_size += 1; + if (last_field_id_stack_size_max < last_field_id_stack_size) + last_field_id_stack_size_max = last_field_id_stack_size; +}; +fn pop_last_field_id() { + last_field_id_stack_size -= 1; + 
s16 last_field_id_stack_top @ + last_field_id_stack_size * sizeof(s16) in last_field_id_stack; + last_field_id = last_field_id_stack_top; +}; + +std::mem::Section column_offset_list = std::mem::create_section("column_offset_list"); +auto column_offset_list_size = 0; +fn push_column_offset(s64 page_offset) { + s64 column_offset_list_end @ column_offset_list_size * sizeof(s64) in column_offset_list; + column_offset_list_end = page_offset; + column_offset_list_size += 1; +}; + +using CompactI16; +using CompactI32; +using CompactI64; +using CompactBinary; +using CompactList; +using CompactMap; +using ThriftStruct; + +// TCompactProtocol Type Constants +enum TCompactType : u8 { + CT_STOP = 0x00, + CT_BOOLEAN_TRUE = 0x01, + CT_BOOLEAN_FALSE = 0x02, + CT_BYTE = 0x03, + CT_I16 = 0x04, + CT_I32 = 0x05, + CT_I64 = 0x06, + CT_DOUBLE = 0x07, + CT_BINARY = 0x08, + CT_LIST = 0x09, + CT_SET = 0x0A, +// CT_MAP = 0x0B, // Thrift Map not used in Parquet Metadata + CT_STRUCT = 0x0C, + CT_EXTENDED = 0x0F +}; + +// Variable-length integer structures that advance cursor properly +struct VarInt { + u8 bytes[while(std::mem::read_unsigned($,1) & 0x80 != 0)]; + u8 final_byte; + + u64 value = decode_varint(addressof(bytes)) [[export]]; +} [[format("varint_format")]]; + +fn varint_format(VarInt vi) { + return std::format("VarInt: {}", vi.value); +}; + +// Variable-length integer decoding (ZigZag + VarInt) +fn decode_varint(auto ptr) { + u64 result = 0; + u8 shift = 0; + u8 byte_val; + + while (true) { + byte_val = std::mem::read_unsigned(ptr, 1); + ptr = ptr + 1; + + result |= (u64(byte_val & 0x7F) << shift); + shift += 7; + + if ((byte_val & 0x80) == 0) { + break; + } + + if (shift >= 64) { + std::error("VarInt too large"); + break; + } + } + + return result; +}; + +// ZigZag decode for signed integers +fn zigzag_decode_32(u32 n) { + return s32((n >> 1) ^ (-(n & 1))); +}; + +fn zigzag_decode_64(u64 n) { + return s64((n >> 1) ^ (-(n & 1))); +}; + +/// Field header structure +/// Do not place 
ThriftFieldHeader directly +/// Always place ThriftStruct +/// Because ThriftFieldHeader depends on global last_field_id_stack +struct ThriftFieldHeader { + u8 type_and_delta; + // Extract type (lower 4 bits) + TCompactType field_type = type_and_delta & 0x0F [[export]]; + if (type_and_delta == 0x0) break; + + // Extract field ID delta (upper 4 bits) + u8 field_id_delta = (type_and_delta & 0xF0) >> 4 [[export]]; + + // If delta is 0, field ID follows as varint + if (field_id_delta == 0) { + VarInt field_id_varint; + s16 field_id = s16(zigzag_decode_32(u32(field_id_varint.value))) [[export]]; + last_field_id = field_id; + } else { + // Field ID is previous_field_id + delta + s16 field_id = last_field_id + field_id_delta [[export]]; + last_field_id = field_id; + } +} [[format("field_header_format")]]; + +fn field_header_format(ThriftFieldHeader header) { + if (header.type_and_delta == 0) return "STOP field"; + if (header.field_id_delta == 0) { + return std::format("Field ID: {}, Type: {:#02x} {}", + header.field_id, u8(header.field_type), header.field_type); + } else { + return std::format("Field ID Delta: {}, Type: {:#02x} {}", + header.field_id_delta, u8(header.field_type), header.field_type); + } +}; + +// Variable-length string/binary +struct CompactBinary { + VarInt length_varint; + u32 length = u32(length_varint.value) [[export]]; + char data[length]; +} [[format("compact_binary_format")]]; + +fn compact_binary_format(CompactBinary bin) { + return std::format("Length: {}, Data: {}", bin.length, bin.data); +}; + +// Variable-length integer types +struct CompactI32 { + VarInt raw_varint; + s32 value = zigzag_decode_32(u32(raw_varint.value)) [[export]]; +} [[format("compact_i32_format")]]; + +fn compact_i32_format(CompactI32 val) { + return std::format("I32: {}", val.value); +}; + +struct CompactI64 { + VarInt raw_varint; + s64 value = zigzag_decode_64(raw_varint.value) [[export]]; +} [[format("compact_i64_format")]]; + +fn compact_i64_format(CompactI64 val) { + 
return std::format("I64: {}", val.value); +}; + +struct CompactI16 { + VarInt raw_varint; + s16 value = s16(zigzag_decode_32(u32(raw_varint.value))) [[export]]; +} [[format("compact_i16_format")]]; + +fn compact_i16_format(CompactI16 val) { + return std::format("I16: {}", val.value); +}; + +// List/Set structure +struct CompactList { + u8 size_and_type; + + TCompactType element_type = size_and_type & 0x0F [[export]]; + u8 size_info = (size_and_type & 0xF0) >> 4; + + // If size_info >= 15, actual size follows as varint + u32 size = 0 [[export]]; + if (size_info == 0x0F) { + VarInt size_varint; + size = u32(size_varint.value); + } else { + size = size_info; + } + + match (element_type) { + (TCompactType::CT_BOOLEAN_TRUE): { + bool value[size]; + } + (TCompactType::CT_BOOLEAN_FALSE): { + bool value[size]; + } + (TCompactType::CT_BYTE): { + s8 value[size]; + } + (TCompactType::CT_I16): { + CompactI16 value[size]; + } + (TCompactType::CT_I32): { + CompactI32 value[size]; + } + (TCompactType::CT_I64): { + CompactI64 value[size]; + } + (TCompactType::CT_DOUBLE): { + double value[size]; + } + (TCompactType::CT_BINARY): { + CompactBinary value[size]; + } + (TCompactType::CT_LIST): { + CompactList value[size]; + } + (TCompactType::CT_SET): { + CompactList value[size]; // Same encoding as list + } + (TCompactType::CT_STRUCT): { + ThriftStruct value[size]; + } + } +} [[format("compact_list_format")]]; + +fn compact_list_format(ref CompactList list) { + return std::format("List: {} elements of type {:#02x}", + list.size, u8(list.element_type)); +}; + +/// Thrift field structure +/// Do not place ThriftField directly +/// Either place ThriftStruct or place its value +/// Because FieldHeader depends on global last_field_id_stack +struct ThriftField { + ThriftFieldHeader header; + if (header.field_type == TCompactType::CT_STOP) break; + + // Only parse value if not STOP + if (header.field_type != TCompactType::CT_STOP) { + match (header.field_type) { + 
(TCompactType::CT_BOOLEAN_TRUE): { + bool value; + } + (TCompactType::CT_BOOLEAN_FALSE): { + bool value; + } + (TCompactType::CT_BYTE): { + s8 value; + } + (TCompactType::CT_I16): { + CompactI16 value; + } + (TCompactType::CT_I32): { + CompactI32 value; + } + (TCompactType::CT_I64): { + CompactI64 value; + } + (TCompactType::CT_DOUBLE): { + double value; + } + (TCompactType::CT_BINARY): { + CompactBinary value; + } + (TCompactType::CT_LIST): { + CompactList value; + } + (TCompactType::CT_SET): { + CompactList value; // Same encoding as list + } + (TCompactType::CT_STRUCT): { + ThriftStruct value; + } + } + } +} [[format("thrift_field_format")]]; + +fn thrift_field_format(ref ThriftField field) { + if (field.header.field_type == TCompactType::CT_STOP) { + return "STOP field"; + } else { + return std::format("Field ID: {}, Value: {}", + field.header.field_id, field.value); + } +}; + +// Thrift struct +struct ThriftStruct { + push_last_field_id(); + last_field_id = 0; + + ThriftField fields[while(!std::mem::eof())]; + + pop_last_field_id(); +} [[format("thrift_struct_format")]]; + +fn thrift_struct_format(ref ThriftStruct thrift_struct) { + return std::format("Thrift Struct with {} fields", + std::core::member_count(thrift_struct.fields)); +}; + +fn ptr_field_value_by_id(ref ThriftStruct s, s16 field_id) { + for (auto i = 0, i < std::core::member_count(s.fields), i += 1) { + if (s.fields[i].header.field_id == field_id) { + return addressof(s.fields[i].value); + } + } + std::error("Cannot find field with id {} in {}", field_id, s); +}; + +fn idx_field_by_id(ref ThriftStruct s, s16 field_id, s16 since_idx = 0) { + for (auto i = since_idx, i < std::core::member_count(s.fields), i += 1) { + if (s.fields[i].header.type_and_delta == 0x0) { + // std::print("is STOP field"); + continue; + } + if (s.fields[i].header.field_id == field_id) { + return i; + } + } + std::error(std::format("Cannot find field with id {} in {}", field_id, s)); +}; + + +/* +struct SchemaElement { + 1: 
optional Type type; + 2: optional i32 type_length; + 3: optional FieldRepetitionType repetition_type; + 4: required string name; + 5: optional i32 num_children; + 6: optional ConvertedType converted_type; + 7: optional i32 scale + 8: optional i32 precision + 9: optional i32 field_id; + 10: optional LogicalType logicalType +} +*/ +fn set_field_names_SchemaElement(ref auto fields) { + for (auto i = 0, i < std::core::member_count(fields), i += 1) { + if (fields[i].header.type_and_delta == 0) { + std::core::set_display_name(fields[i], "STOP"); + break; + } + match (fields[i].header.field_id) { + (1): std::core::set_display_name(fields[i], "type"); + (2): std::core::set_display_name(fields[i], "type_length"); + (3): std::core::set_display_name(fields[i], "repetition_type"); + (4): std::core::set_display_name(fields[i], "name"); + (5): std::core::set_display_name(fields[i], "num_children"); + (6): std::core::set_display_name(fields[i], "converted_type"); + (7): std::core::set_display_name(fields[i], "scale"); + (8): std::core::set_display_name(fields[i], "precision"); + (9): std::core::set_display_name(fields[i], "field_id"); + (10): std::core::set_display_name(fields[i], "logicalType"); + } + } +}; + +/* +struct ColumnMetaData { + 1: required Type type + 2: required list<Encoding> encodings + 3: required list<string> path_in_schema + 4: required CompressionCodec codec + 5: required i64 num_values + 6: required i64 total_uncompressed_size + 7: required i64 total_compressed_size + 8: optional list<KeyValue> key_value_metadata + 9: required i64 data_page_offset + 10: optional i64 index_page_offset + 11: optional i64 dictionary_page_offset + 12: optional Statistics statistics; + 13: optional list<PageEncodingStats> encoding_stats; + 14: optional i64 bloom_filter_offset; + 15: optional i32 bloom_filter_length; + 16: optional SizeStatistics size_statistics; + 17: optional GeospatialStatistics geospatial_statistics; +} +*/ +fn set_field_names_ColumnMetaData(ref auto fields) { + for (auto i = 0, i < 
std::core::member_count(fields), i += 1) { + if (fields[i].header.type_and_delta == 0) { + std::core::set_display_name(fields[i], "STOP"); + break; + } + match (fields[i].header.field_id) { + (1): std::core::set_display_name(fields[i], "type"); + (2): std::core::set_display_name(fields[i], "encodings"); + (3): std::core::set_display_name(fields[i], "path_in_schema"); + (4): std::core::set_display_name(fields[i], "codec"); + (5): std::core::set_display_name(fields[i], "num_values"); + (6): std::core::set_display_name(fields[i], "total_uncompressed_size"); + (7): std::core::set_display_name(fields[i], "total_compressed_size"); + (8): std::core::set_display_name(fields[i], "key_value_metadata"); + (9): std::core::set_display_name(fields[i], "data_page_offset"); + (10): std::core::set_display_name(fields[i], "index_page_offset"); + (11): std::core::set_display_name(fields[i], "dictionary_page_offset"); + (12): std::core::set_display_name(fields[i], "statistics"); + (13): std::core::set_display_name(fields[i], "encoding_stats"); + (14): std::core::set_display_name(fields[i], "bloom_filter_offset"); + (15): std::core::set_display_name(fields[i], "bloom_filter_length"); + (16): std::core::set_display_name(fields[i], "size_statistics"); + (17): std::core::set_display_name(fields[i], "geospatial_statistics"); + } + } +}; + +/* +struct ColumnChunk { + 1: optional string file_path + 2: required i64 file_offset = 0 + 3: optional ColumnMetaData meta_data // actually required + 4: optional i64 offset_index_offset + 5: optional i32 offset_index_length + 6: optional i64 column_index_offset + 7: optional i32 column_index_length + 8: optional ColumnCryptoMetaData crypto_metadata + 9: optional binary encrypted_column_metadata +} +*/ +fn set_field_names_ColumnChunk(ref auto fields) { + for (auto i = 0, i < std::core::member_count(fields), i += 1) { + if (fields[i].header.type_and_delta == 0) { + std::core::set_display_name(fields[i], "STOP"); + break; + } + match 
(fields[i].header.field_id) { + (1): std::core::set_display_name(fields[i], "file_path"); + (2): std::core::set_display_name(fields[i], "file_offset"); + (3): std::core::set_display_name(fields[i], "meta_data"); + (4): std::core::set_display_name(fields[i], "offset_index_offset"); + (5): std::core::set_display_name(fields[i], "offset_index_length"); + (6): std::core::set_display_name(fields[i], "column_index_offset"); + (7): std::core::set_display_name(fields[i], "column_index_length"); + (8): std::core::set_display_name(fields[i], "crypto_metadata"); + (9): std::core::set_display_name(fields[i], "encrypted_column_metadata"); + } + if (fields[i].header.field_id == 3) { + set_field_names_ColumnMetaData(fields[i].value.fields); + } + } +}; + +/* +struct RowGroup { + 1: required list<ColumnChunk> columns + 2: required i64 total_byte_size + 3: required i64 num_rows + 4: optional list<SortingColumn> sorting_columns + 5: optional i64 file_offset + 6: optional i64 total_compressed_size + 7: optional i16 ordinal +} +*/ +fn set_field_names_RowGroup(ref auto fields) { + for (auto i = 0, i < std::core::member_count(fields), i += 1) { + if (fields[i].header.type_and_delta == 0) { + std::core::set_display_name(fields[i], "STOP"); + break; + } + match (fields[i].header.field_id) { + (1): std::core::set_display_name(fields[i], "columns"); + (2): std::core::set_display_name(fields[i], "total_byte_size"); + (3): std::core::set_display_name(fields[i], "num_rows"); + (4): std::core::set_display_name(fields[i], "sorting_columns"); + (5): std::core::set_display_name(fields[i], "file_offset"); + (6): std::core::set_display_name(fields[i], "total_compressed_size"); + (7): std::core::set_display_name(fields[i], "ordinal"); + } + if (fields[i].header.field_id == 1) { + auto n_fields = std::core::member_count(fields[i].value.value); + for (auto j = 0, j < n_fields, j += 1) { + set_field_names_ColumnChunk(fields[i].value.value[j].fields); + } + } + } +}; + +/* +struct FileMetaData { + 1: required i32 version + 2: 
required list<SchemaElement> schema; + 3: required i64 num_rows + 4: required list<RowGroup> row_groups + 5: optional list<KeyValue> key_value_metadata + 6: optional string created_by + 7: optional list<ColumnOrder> column_orders; + 8: optional EncryptionAlgorithm encryption_algorithm + 9: optional binary footer_signing_key_metadata +} +*/ +fn set_field_names_FileMetadata(ref auto fields) { + for (auto i = 0, i < std::core::member_count(fields), i += 1) { + if (fields[i].header.type_and_delta == 0) { + std::core::set_display_name(fields[i], "STOP"); + break; + } + // STOP should always be the last field + match (fields[i].header.field_id) { + (1): std::core::set_display_name(fields[i], "version"); + (2): std::core::set_display_name(fields[i], "schema"); + (3): std::core::set_display_name(fields[i], "num_rows"); + (4): std::core::set_display_name(fields[i], "row_groups"); + (5): std::core::set_display_name(fields[i], "key_value_metadata"); + (6): std::core::set_display_name(fields[i], "created_by"); + (7): std::core::set_display_name(fields[i], "column_orders"); + (8): std::core::set_display_name(fields[i], "encryption_algorithm"); + (9): std::core::set_display_name(fields[i], "footer_signing_key_metadata"); + } + if (fields[i].header.field_id == 2) { + auto n_fields = std::core::member_count(fields[i].value.value); + for (auto j = 0, j < n_fields, j += 1) { + set_field_names_SchemaElement(fields[i].value.value[j].fields); + } + } + if (fields[i].header.field_id == 4) { + auto n_fields = std::core::member_count(fields[i].value.value); + for (auto j = 0, j < n_fields, j += 1) { + set_field_names_RowGroup(fields[i].value.value[j].fields); + } + } + } +}; + +struct FileMetadata : ThriftStruct { + //std::core::set_display_name(fields[0], "version"); + set_field_names_FileMetadata(fields); +}; + +fn extract_column_offset_list(ref ThriftStruct file_metadata_struct) { + // Get index for row_groups id 4 + auto idx_row_groups = idx_field_by_id(file_metadata_struct, 4); + //std::print("idx_row_groups: {}", idx_row_groups); 
+ + // For each RowGroup in row_groups + auto n_row_groups = std::core::member_count( + file_metadata_struct + .fields[idx_row_groups].value.value); + //std::print("n_row_groups: {}", n_row_groups); + for (u32 i = 0, i < n_row_groups, i += 1) { + // Get index for columns id 1 + auto idx_columns = idx_field_by_id( + file_metadata_struct + .fields[idx_row_groups].value.value[i], + 1); + //std::print("idx_columns: {}", idx_columns); + + // For each ColumnChunk in columns + auto n_columns = std::core::member_count( + file_metadata_struct + .fields[idx_row_groups].value.value[i] + .fields[idx_columns].value.value); + //std::print("n_columns: {}", n_columns); + for (u32 j = 0, j < n_columns, j += 1) { + + // Get index for meta_data id 3 + auto idx_meta_data = idx_field_by_id( + file_metadata_struct + .fields[idx_row_groups].value.value[i] + .fields[idx_columns].value.value[j], + 3); + //std::print("idx_meta_data: {}", idx_meta_data); + + // For ColumnMetadata in meta_data + // First PageHeader is at: + // dictionary_page_offset if present + // else data_page_offset + + try { + // Get index for dictionary_page_offset id 11 + auto idx_dictionary_page_offset = idx_field_by_id( + file_metadata_struct + .fields[idx_row_groups].value.value[i] + .fields[idx_columns].value.value[j] + .fields[idx_meta_data].value, + 11); + + auto dictionary_page_offset = + file_metadata_struct + .fields[idx_row_groups].value.value[i] + .fields[idx_columns].value.value[j] + .fields[idx_meta_data].value + .fields[idx_dictionary_page_offset].value.value; + + push_column_offset(dictionary_page_offset); + } catch { + + // Get index for data_page_offset id 9 + auto idx_data_page_offset = idx_field_by_id( + file_metadata_struct + .fields[idx_row_groups].value.value[i] + .fields[idx_columns].value.value[j] + .fields[idx_meta_data].value, + 9); + + auto data_page_offset = + file_metadata_struct + .fields[idx_row_groups].value.value[i] + .fields[idx_columns].value.value[j] + .fields[idx_meta_data].value + 
.fields[idx_data_page_offset].value.value; + + push_column_offset(data_page_offset); + } + } + } +}; + +/* +struct DataPageHeader { + 1: required i32 num_values + 2: required Encoding encoding + 3: required Encoding definition_level_encoding; + 4: required Encoding repetition_level_encoding; + 5: optional Statistics statistics; +} +*/ +fn set_field_names_DataPageHeader(ref auto fields) { + for (auto i = 0, i < std::core::member_count(fields), i += 1) { + if (fields[i].header.type_and_delta == 0) { + std::core::set_display_name(fields[i], "STOP"); + break; + } + match (fields[i].header.field_id) { + (1): std::core::set_display_name(fields[i], "num_values"); + (2): std::core::set_display_name(fields[i], "encoding"); + (3): std::core::set_display_name(fields[i], "definition_level_encoding"); + (4): std::core::set_display_name(fields[i], "repetition_level_encoding"); + (5): std::core::set_display_name(fields[i], "statistics"); + } + } +}; + +/* +struct PageHeader { + 1: required PageType type + 2: required i32 uncompressed_page_size + 3: required i32 compressed_page_size + 4: optional i32 crc + 5: optional DataPageHeader data_page_header; + 6: optional IndexPageHeader index_page_header; + 7: optional DictionaryPageHeader dictionary_page_header; + 8: optional DataPageHeaderV2 data_page_header_v2; +} +*/ +fn set_field_names_PageHeader(ref auto fields) { + for (auto i = 0, i < std::core::member_count(fields), i += 1) { + if (fields[i].header.type_and_delta == 0) { + std::core::set_display_name(fields[i], "STOP"); + break; + } + match (fields[i].header.field_id) { + (1): std::core::set_display_name(fields[i], "type"); + (2): std::core::set_display_name(fields[i], "uncompressed_page_size"); + (3): std::core::set_display_name(fields[i], "compressed_page_size"); + (4): std::core::set_display_name(fields[i], "crc"); + (5): std::core::set_display_name(fields[i], "data_page_header"); + (6): std::core::set_display_name(fields[i], "index_page_header"); + (7): 
std::core::set_display_name(fields[i], "dictionary_page_header"); + (8): std::core::set_display_name(fields[i], "data_page_header_v2"); + } + if (fields[i].header.field_id == 5) { + // std::print("{}", fields[i].value); + set_field_names_DataPageHeader(fields[i].value.fields); + } + } +}; + + +fn get_compressed_page_size(ref ThriftStruct page_header) { + auto idx = idx_field_by_id(page_header, 3); + return page_header.fields[idx].value.value; +}; +struct DataPage { + ThriftStruct page_header; + try { + auto compressed_page_size = get_compressed_page_size(page_header); + u8 page_data[compressed_page_size]; + } + set_field_names_PageHeader(page_header.fields); +}; + +struct ColumnChunk { + DataPage data_pages[while($ column_chunk @ column_offset_list_cur; +} [[inline]]; + +struct ParquetFile { + char header_magic[4]; + char footer_magic[4] @ sizeof($) - 4; + s32 footer_length @ sizeof($) - 8; + auto footer_begin = sizeof($) - 8 - footer_length; + + FileMetadata file_metadata_struct @ footer_begin; + + extract_column_offset_list(file_metadata_struct); + + ColumnChunkPlacer column_chunks[column_offset_list_size] @ 0x0; + + s16 last_field_id_stack_view[last_field_id_stack_size_max] @ 0x0 in last_field_id_stack; +}; +ParquetFile parquet_file @ 0x0; +//std::print("{}", parquet_file); \ No newline at end of file From 8e9cd13df7d415f3b9b711f41a9e0e3376402159 Mon Sep 17 00:00:00 2001 From: furidosu <86096478+furidosu@users.noreply.github.com> Date: Wed, 24 Sep 2025 06:51:00 +0800 Subject: [PATCH 2/5] Add parquet.hexpat test file --- tests/patterns/test_data/parquet.hexpat.parquet | Bin 0 -> 21004 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/patterns/test_data/parquet.hexpat.parquet diff --git a/tests/patterns/test_data/parquet.hexpat.parquet b/tests/patterns/test_data/parquet.hexpat.parquet new file mode 100644 index 0000000000000000000000000000000000000000..993956caf15f0d2a486bb3819096c04c3e052e79 GIT binary patch literal 21004 
zcmeI4e|#Kuo$$Z2nc2)FJLycDO=n9QH>C-iw%g7$O}i;2-B!wNHQ{zaWy@(@5b|u5 z$SyzbR?fqH5H%=0Hi(!iax0>@a-fMt4IZb_Q=a84KIw@Xd{A?UHF%1tAO??zd*0tk zTUzn%b+6~|%d4|9^PTVa^Zoq(e7?V$-gS`|7U5^(;d1MS@RJ(P!EjKB-?g2gns^=4L_6Lz#eQF}MhXd{dzZv|se(#3r7*fZSnPb`5YAt<&nUUK$O9 zoac&eGe&J)Y%ojqIKT96M(ss!mX0?W331q-w{v2HQMDJsy70`3h?`NcZeTHo ziEMxQHDiGed+Hg-UJ#Gc;32NXxYW0FW==d0c*U%!PkhZti{;?-nsC)9sM@qKp?-I# zT}4z4rmjA>(*Qa3yI(UV?6g=B7vhEOB>}9M>>(+BYUb=jW(tnw!M2h~&{YCTRwLU^ zvrwGn$>Mh*o)!YthB9y7%Lyx<;P(mYI^vh+jCcmfQWBB)6fD!lqFEEq#mB`QdJ%@5 ztN8X}+*0-Fo)Wz$#82Z@p~K%sK0Z&Sm!v#Yb-HJUHdUcf<=-g&C9q!NdLB)0Fh|94 zL=}Kq1zco53)B)-kyRHaxq$YkDuRvT>Gy&@&5R#Ia zGb>|xG=`G+_}J7~VQd;k)rwMXE)esOGtoCsKWpOdJUv{ z4An{@1U=*MN6)Tbzfof_TBqPnM6@Y!^k39Z!kxULVa&qe9EgSVz5yTgfhW zR1G}PD59u|)^O}Vx>!G6#pIj85}2^F1m;pP(|$cv_vqZ(9(3zGed3N2 zKJl?5{0%0O!-}FF+ivKg;cZF$WOUO~`-p2ep<}M^1~=28XXtvlH>su6BU^dxH!OAV zTr@@(1H4Kl4<*BvcC{)74C$2|aX6wYf1?i%_Zl&^yxBK)q-zFR32JHj!e zZqBR2Tkt5ZNEohK-i}i|(r4ft_f zLToWgs^~ML@se6TkA5X+AnGC!PIeXSX>r)Bse?PHerPAIx4aAIIh`~dq-ya7GTRVU zzr^xfs0!^e^6GO*Ol~kf9p*;4}Y z1U6R{VqfCQWzw(e@v7R}Z;Y#v?Qo=SI~m8Ut49(>3=FCwV$Xwnd0%-?T^x!-q-R9M z@Q5Xj3}wB97_?{1tesGg>;&$4_|tS>KAsi(hIDawWJ0|)X}Eo}@x|hV=-n6VaWdmx zf#5JLdd*ZkPcSGG2olArmo4V-`3ZZ$){7-NB%Fj}UDIONBx>~y&5xuB3t78jkD95z zsiARfxI)~i+77;^*T9cF5K|9thNFkJ7YN8hi=vm{K?GW)TFOqym_Yo{J%0C2Niq;l zJfR+TjX5DEE5e1YLw3wu5F%3QiBh{4+9291JTQ;1N{bktvUh`#3uiMGTel}5WE>SJ zBi%CWj(Bx*+Kz(R#AFWVFslx4>JjRZ4Vc%)t-9#b#inrSeYzNYtga5PM~gPbg^6CR zGiKC7=kOjkbf}ElX#(ITw0QU&BPWg!(l?-sJ&14}0_h3o4%8{p;|_5ZmW>nL5`0R@ zC`WEklGQyOf-M447OyNlb5@1{$OGRYgptYHQ}ohB`?BZ`TLD8anNFUAhmFq(OBc+4G}d92=1Grl+l8@8CW7p*<%FXqgtF5o)8lE&l| zG#{IsK3-mxGsK~>IoJ~U)r2_QQ8zl~3~{bGZs)|tp~@3t`9zZ_AD>xep}fBmYbs6g zTaKhKUte@%YPUU~5qRKQ6ABDs6J>F~aXT!myd5$h*E~q}blbl8oGV611 zii!Q?j63ySBCalex;rQEF;DC?s%n4Iwg%F|a4x9wBg=HL6>`Zi0HOREf43?;{U${S zYj;%apAZ8IZACbq7BB3c5=IK|UBUC@;Vc;-^onq7ASZepv0=9(URK0c6mfZ3p!h4o zQDXwbn?n$b;+fr!@?&7-$TzY=372feh!RZ+Lx^hut;+A4p|c*D^+=1#Ur-9%s0e%X 
z&T`-a8Ya<%Q6LEY@}w~4ADI<*D`KxBemXKPB3!+fR|NHYETKM?6qgbp31%;6#Xwes zm4Uos)J6MK*}%(+Pg14ml15A%n`qd(AX;{3l>uGZPcJd?%t%7C+%ryn)k*Fxi4}c0 z!@8gp1h$bNVy7QTiW?LWuHFz6eU>qI!IawLl1@ww&omgZ3!>mAo(toD-(PEh(>;q9 zh{mm;y%3-MOr_DNi0ArZM(Oy1cp~h84FBME$EYq=@2&?x^3fZn#S>Gqx3oBWC=2Ae zXx?4&X{&dS>okCMS?r_o#JJ&Ldd_Ej)YjcoZRm;i7YyszygC4e{*dBf>yN7xiV|w} zhY_ls)RtMjEB%3l-Yy3iqHVi_288+H#qzU{1b1DuZ%6wb-|A)8dL(9yl9fphV96W} zY3igp6`AX~woFT-WDW^rn!*xPo*MAFFYeTSq7KXQ7PKkKo$(7v@gj8{GM80IUb>j>?KsR19(qXL$H6kk zr@D{f+OXrffL#OT>!D^1jqzW0Ri-m^Q38MqNVGmaN+unngO#9|fFLVUvque3$;50d zgSsoj`kpVV&{6g(nJm653=nHOR8KL4Ft(=zASylz$xJm8Jx~DeSz&~s8T)HBN6C+( zJ)@!nlZj;}GBN7Ilij*7T);|pLy33GCXXzT?d?7_f}#3u1zFPUj<{ ziMJW*e{Jli!plwaoA}i;ic{I%nXdO;lC1gu~-pyZmIF?*jX$~ z4_Eko5ge6p0^n*`x08lGIf0%CoQEAV3j4>g6gvGWDf*)bWOQ;?ybvZd7*pW>C|Wcw zjLRALBP109b{)ek8UPY9olLSA-DknY=kPcM{LwStBel16 zqZ+9jL9-XmYwOEgop=H#rbu8-p7VW(4_S8gkSSN?9M;7rSEmWJ&Oms(e zBWBm%wP+L~(}qPtotq>{>JSEjIzO34*-{+~`tj)GDBow%KFs21bYNWb5=ZC6T6ml~ zUYe}GST~{_Q~DTVX1jD;@30J{ndq1?>?Cbg#!Fbif*+Gt4^<*lJ+sGWJF*5rH{X$J zTHA`8)y@{1jiT94Uh|^)chp=BoHv=f%!fo@ zcs+4yZ`W(}KI+}yb&J>*Uk8S3A#%00Ir3iPbLO+ui0crcodfLS{(Q;ip2Ow`?0@CU zp7?JGqj!tmp0e@F?UM?m?UiKpead{J@Zzg&J+6rUp8F-26mRd@_SyDNh1aveTu*}e zHnBBxUDw%=knH*ht$LNgu5U9wcq`WyNfP0!s<(nQkPi)*inPmIG>v^9A@z0MOxc+O}Uy3a~|5h9r`sALq z<`qb0&Cpu0^_FvhRSrM&z8gN>^v3C~qUay{xHcjZdvDT0;Z>h*7yb4otv!BexW4gh zaWE#f*z2_G4_+K!Z{B(EM)S|aR(sv2RrvG<(bwZ~|K8ByaEsWw_rscZ@L_ncweQ%$ z)rfAJvZCu{P22NP^}=0Bcu(x$Rpuq&a4pwy^V*Sj#NUMKUUcQ}wcT-pvfJaNeYXs1 zcgODpmz(2X-SacLdd7UL>(2NQ7`~}%xB1aG?)_E*Ejxz5x0>%auM%#LW4<1G&ZZ;` zwO5Hg`}#w_eQYd#hj~%_;K=`ths@WHydJfFz4kynaQ&e@eemz>zVppryzhDKJMn9W zR+{%daNf|*9(!u&F4Seq`z|luzvlzyyG_r$KzO&bYoB=69hwV%XJHnvgOpttnb(^0 zBS#;ykA6~P^wX2s`WJz&=ipJADz6;_;-}#qP8}} zICt`P;q7e%ht>4|z@^4(#paFQ(EGZLOCGyRZ2jUd9u2^gk?#G*mAf&co6*T<4>pLt zy;sM!ZCvshn>TI^tKr9$@ZHK_cug3af4dvt_rgo0x7l;y9{kM37w=J8b$*mqMH3e* zTI+@K<0a-g<7_dmsX|o(p@2dA zW}X^U*_sxVGy|iVCt&E!v@!xsK|Nro3Uv(HhO{>bQ@cv&%I=UX(sFuJpvBxy!4=vy 
zQ9WP=meD+*y;+z_Oazr%RU@F>qnoPG-rj10tEMVna@0Tzz?+qNjOvXvFu_8pDXKCr z6y-s!yul0zwNY*6V^CFotn2jrY`T$p8dzu)>rkTuq(Lipwu94C%39?mT?85xU3s=W zV5!Z#nA)7H!yc2W%`nF52`b+|U)7*Rg)(aMye!bvwGjxHqKV)#s8p3pMX*aKZ%|xI z1cJe4#bqeTH%;vfh(@iZS^?z|$E3tmHL7nAs-4z@fq;@!143(8+Bz>1s&<$1R08&b zuHZKSB%$5Zxn21nFuHZb)fHh)W@LORutA-Qw{zp#-t=kl3n()V|H~ z1w-o7QerbzHyi5^=VnT_@wQc4uJ&zHw({U|UC4*4)I_i50LG)vX6=L0=QzCV;OdZ# z)NKjH*|K$;#HK$Xe-kcB`*c(IKI3$y{ereIa6)S#S1Je$ky;n@B`GzrkxR<8iC?LK zA%xa~Y+87d8ltqWl^P+HoR{T#P_;B&2|9+axwe*yE$4%VOjZPzp`nJhW*ix#2nxSJ zptMSr5yJ!Gkn}rl1zo zgGOt#5eO@(wo*$e<1rm&fQ3l}R`H~0SGWfC z44!q$L)t*6W7%-P)DlXEa%pGQ#Q^AIm3kJxLc6PKfm$P%!TVY2DiE^y)(+%Vfz^St z_-63D3^ML=gT9Rffqqt?L%UgdUOAFz0t<_pM&LDpXrQg@O`#5~K>5v<6|m(jiW@;_ zRiN{mop)Fb;Mron-&)3}_O9)r74T4b%9U(uL~o^2F|{M2xvO31o#}R{#f}5EvRSB| z8Y*m8u_osG+CWG(rJPfG(4>NfI%&uU&k8p!sX~gr zAq2^)Zo;f(K_#(PstYa{=c%+Z9=L(;0m%eadO{^7J&D#z^`K@61It%6%~BG$e@nS} z3m6~*DJ|9HOPex~mL_I`uoRcYvor`zJ3k$u7wLE!QCbs!8RA>Bb`g?$&}yc)pz?+A zsIr4)JOeRf5}1z5B7rp1kokTcd(nnN(4Zd~6OhDLYgGRZW>oX=)G&1whY_g~%oJharE1R6Ep6DoumZtomVp4-tQb z5i(Zj#B^-~U@xSKX>_Av&Du_JiShchub^n2R3_2{AB+t7^G5iqXDHnB>fj$JoQx_& z(m6tUPzH196Qz^X(K`_J{iKv;69`N9MVLhGaCt}(^$Ro_10lj;$Y7#L*^cToXcs`K zk~5ZLd26qb$-+`%fnw5E6M}0*&iKC@RWb#~kx>n)Yb~NltF+=o=}q|%UN>(s!-pz< zvlX67{cW+Nn(Er)mUQ>J_M~#Yjsk0=qS@H4eMb2l5&yb&kP8A?V=e<_c8STzmWY64 zmnpv?t*4%!7V3}9t-I~w5?rN)fVL@?KjG1IwHPk_ba+9BMM3c zMx-g)Z%9yYzYEUQ_>8h@hfeeaf#RArwmG5^B%AyCA z7UfIi2r}i8b|(Xm^bwjW6>2mPK7t?HD1-aWSQs}s-6pn3rRa5u+nYht>nDnPp0ZkO zBbVHawf8E`q|kke&D%;D@k81k)!RljB6W}UI(73_#a4TJwV=`C`tiB9-v`-;p54(D zzH4c&_C#v=LG9gV=-HX6r5~a-y)}s9kf^o5OX16?L0Q!fT{0olw6>TGfHJ{C_l)f! 
zWE#}ofPxrONiSD6J3%ywj7-Ze>4WlSz`c%81y^CUOw3xec7m=Eq?dMOor+@Jp)|=n zQ!C(INFYm!6O-z`%vViAYr=67)3vWC9b%a>K@MiTU#o1839a-}_?94fON;h8(lLC0whD8kH zjIH=QOSyw^H;Cs{p0;FUZqjZf(`^X^_0BxX9K**wVl@-DR&-qy0|+k6AoMk8Q6*1~ zYH17jIU75`6q7=xDZbBa#jd64q+(byU>np{xb__sQb&XEau4I@UZ=|t{N~R03thXz zSQV8dH;~4x=#0YFC!m)<$P>jI%*B|(y0EzqXbT-l)v zSJL=bpdoTMN9gp4l;5>QPiROhkw(}CqYY*>nZ0CIm6WUEa@Xfo{*TXp^R`v!7+*{6Ixy zfWlP7er65FL5sPzp%U;M(3odJSoV~*U3N@>3(G#yF{V11F6=TZ%r@)LCChbB$eOa# zI$&7olaDp82F7x$M~W-QUJw_0boQ3Wo@xWkY>t95n@TKLsI&eD6~!2ZlJXfZ%_o{V zfR+_vesK(vtG1pQ2XxMJY$cOpk~ht_InTAr0~U)?^cAI1R<>vE&s(r#nYuR(0y7M& z9`zYhvh|~LnJm1D+DVVyC_o(ry)xuZ3=|Z*0&uHX<8>73)4&8-kYN4Bi$RG#z=WwK z&WR_1D+L|%N-Q@Z57!d|HQ_Tb?MeL0l5hYxfRYuM#)iNe*R0qQ7hXQY7v__CnC6zk zEWpJonAdo6ePLL3-jghn!p>!C^c4f=1g+-yO(7_Y_f%%WtI-bHn5B~ZVj-~oY}~N? zTe68SnSb0S&X+ckmNsz_nm-^lTwH*z@8oS4Zw?cguVa#aE6*wMTE4`2N{AFs=Tc_} z*Ljp!B1_4i@st;DG%YmDPKrI4&&3qg#BMA@vY6QeCeLePkatIH=eH_LAzv47F70ZBj#knuQxtk(A5+aFV@6 zC}=FBqd9Y#5{kj%D4bnn?e$MgDf5(Pc@_BV!pSl^G4IWb*R$$5%UY){-pb;|ggxs? 
zxlDsrG(#=uWOqR^$#QB*YH-eD&Dc&4lmvP>E&kLj49tm)>s5;-)-*PmiA>JRBf2S2 zlWOaTeJmjC2X6;))y1VCJMTpiXA}!X-=K9K)GA}xenyC#SsF;9Dd>Up9eF$I<`x6bCgN-Pl3i1JW6;@MtXx9Mwx&c(abEOn3;4l^O?FGg&WbaBqWU51&{Y) zZLB($!c0nVDmyklR+@B*bIgsg{@{t?;-m|a5^GojOnodrmOPsG@O_yC{2!mJPEG)a zO#)JU9_#X~hpt0sdf8f2^+vrKE;!06##;2Lhb;v8qh&DGy#gW!?!b%!(q;UEbSjIG zpcm8L3=KtxYYZe6Z?wKTW2(%kldOM=Ve>gU>o zFfD&}TW%$kt9k6sW9hCH{Nw^)8`ny%Rb1@9Vu=DB^3Cv1c`6f}EU3XcC{6P#bxUq= zTQX7B*nE@5j4@~3=Y$M^ayNhrPo&F`8|~f%Px9E6HIT=r%Pl&H#AOxU@p4$!lIh6} zY|CMp^fxVhiQyA0)`!;cJB2;Pyejt7jybdT=1tjkl%<4!NMLt5Mz6D2#72fpGLu6% zx;>wfE1&w;mAf9rjT9krMVi(z><&|+O(nm_C z9R*H`UgZYClpm~I>QrIwGGaENfhF&9ri{{+GWx(oK%CDq>svq!kCG_^k~rrP*fRwP zWLXUzVtFlt5@eEyzsmPGX}rL5pytf&vvaeB}MU&h=_3Q&5cj)|}UhKf$x zv9UC(YHOf`&4@>xObOje52OeU?7r~IY^g&9yzyd!?K!Vy2ZKJD_VTEt4%Qb)`o(@+ zon0v|*b`%QTo=NbTtw>Unxqt7lxvEvvfijYU$g+9qOr7z5`Lt799zUSJNROj_izft zw_?rCjm;6)FJXaiiKXeWnm3E$C+IrC_RNd6=LF_RoW-;%u%-(C6W)YZ$mEHbF?Fm^ zbTab@ecqlLlUqloClMtLU;|!Tk_*QO5)Rx$+>;AlVQl_5G#thLB$E*IyeB5-@ia3q zDv52&(5XZ48gr&YZ(?27I+`rjGL^A1qABn(hjlJ^Q(kh+Lhhp;t`+iUj=Odqu+x4> z!8aa_wVP)c<}dzWr|-#KZT{Z3AKjMhRg}=DZc8%06Star=&IY~J~#2tsK?8uzBW){ zv7SG@VxVTas3bMHg3101RE=GT>ObAa=8B^%$Nz6y9A@j*Ki+0!MIF_DoLZma{t@mI z7}C#(anEu85%+oSPjJn0J<2sjz0U*SJ!Vzh5Q!?kQ-{A#@q<+T62*720Qw1@XV_FW z&-1UPt%!T6QQ_+muBRhOtsuUNV*f4O$z8~|vRwWd+Q{|Y7b*KE${u0hT4%ZT+gwlZ z?Wm^j3h7H7M>OV0;_+=H+2Ljhk|~R_q(Zc6FvPU_xG^IOArwE%Pp=C)9o=T8zZAa z8>zuAR}EGU%!_{q;xDNkptlBEHkcDEEJrEJil5W#^YHa~z9|5Z*Kb+>UlgD7x#ftL z;_MHh_4l~O_)3F}UK7jWj_=XsAY{=@ zeIohEm1^&I>Gv3YhRk`)G{l?SG1|uVt^&A48I#DJtV!{I&1vxy8vYHk--n)E!}T=p z!px2(;BpKo39z-qk7Xzy%Zh&`yTlZsB1fs|onA2eL)ke-IgdD?X8#K3)^g0*M{D$LkoBb#&n$5?4CnFU*`LH^42BF*%tU(sMDr z8RVPm$my6sROGon4=bNRd~Rl;BgHlXI48GVWC%yYO zXp2uHA6@)w+{G#q#aY45@XW*{o44atu`)hS_$ZCdpd;C$?xhLCCF-ZZt$K7Gc`d@W z9Qsxl-)G9> z=OOI*$3)aRlKuwz^RU!9(_$+_$DEEsiLzzFVH~T2VqW&CT-Q)pvUnQF z-i0haPOY1{K1}(ysrMuRe#p0b(Xlb=-^}$0-~O344|AX3eiL`OJ(Ct% zzJR9w9rsDvOp){C=pl+9OdyXKNGI`14#}7UHV!p)@io5R$8{agS;{Wteh>FiWLW_6 z6yLt;`}7f;}3$-XJ~&H*Nt2oc$dQ^ 
zspNkS{EzegiHNhOD#n2%d-)a+=6v}}J|t-Ef`x-ma2O9-;vLZXEmwQlgCr)unEJhMnN>&yK2C@t`u?q{o@-z^?5A^RLx)1UEL-2o= z>xW?R_h!8~PoH`CFFp6i=-`8k)-szuJ5ydPrjGMz3e3cL%FK#8cKMhm`ehoIpTVn#KE-8T`GWdqZl)Op(4J` zKD^aaaO!@@_yL7a0dtNDzl=CvZ`?JZB>-6Q{r>~7;%av4*8o}rK$Pf|Rml+2q9M+1 zIT|lHCkq0Ywu+~5A2`{^a=r_ZQ%oV!Y`J?cj{(X&qDc;9uvhp8q^qfiez@!Yy}rmfw?pOj>Xpu#Yj894lbW3)vSY+BeO3? z;~&KjPV2|nUkpcNTC~9RFqD4+q(4fvDax;7KizkDZlV8Xh<}=I|46;txIPbVcw-ba zS(F%-#Am5_KXQ7zsdtnZYfloLhsvaXIX|12hEw07;g29p3j1fK{tuP+E#ea%H%B4u z1v>aH)E9KzMV3S^dNfI4F81{kuJV?X?)BlOGu`XSIm|{mfYjswk}YpUbDcqej;I~j zCKG8H=N&eo?sw#rrkrbIWJW5@kb7W4j%#D$tFB<8Ai-csrxqhNoebI{o&&0GTTB=N zXOwn|oK=h23d!~;-x7YN?($KwqXSd2R*pgj8L~Z>zUyxlg*#YK#o|}aV#zr+N*L}_ z!!ME8P)knJMHx~=J$FCzmkN^y`rQep?&OE0>|_Z8mob6DNMDB_pfQ~)JUW$=ZYepn zD!bGn+3z<`j}pNN3ZoJbgHdKu7+~5lIZG;WW7q`blmsTHL>l=-oIs2M26f)F zobGe^Ch4YrrXZQ(^0h#xC5fnfVZ2D?37#B@4Pz%T(9F1-25MFtg;NnkZsx4ygI~yEU0K~VcZ=R479?sfR#-O!f6IFsuGru7938) zB}N$(>dJYtQ1YZ;=g>YWh7_NoaK0pw-;c&mOVP-AZe7mA%9e8UEN4s|W+VqJsZ&zq zmd`Q7b2lvDw8j3spxJ^XZd5S0Clz9zOY&8YH4}at+1A%uuw;ZPSzg``fC(C5K1|e6 zd6e>mZ$p@xSCg1gDj7QOKacY)(O6)#8w1b!5}sKaqGHU>`5Gmm$X0p3B@-IZOjdi>;yfQ;Lm~&A^`wU zl_nOGwk^RjVE{)whFL(CS_-XmlqQzkN{o4sv_0$hEvF`B6KU>CER!=zOVjrfaGH*y zeodc=erILW5a5flCn-&5i5CU9&1A7$yila7Nz;wOfGi4#IiZ&nO3^Bwgq-qw9K~-z zGa5Sz@Ao5&i6tLmdH!@Kq%mDkhX=knNNOdSo_xqNp^=pc$eG|=yIjMRe@j~Zxj#4s zchV2bQz7~WIZvRMJq{gk6xRxg_5QIpnI$r@#>RHK4Q=o@`!nOlSs;RxOK9Mnl6shl zNOm*H{o$6{yM;+U@FsT8>&kmLW#WN{OV{%Bl88hct>7Pm zsTiJ2zl_{EVJow861mV=w+@iwNg*mKB<8xT82h}{;?au`E|1usR%*O4=@) zFO%xw1nICm(<6|0-mXA##G8mjd!lmY)t~K*a~7pHUKPSia_Ws9Y;VycH&@xff3Y!T!OP#vkc5;#F}`4!LcQ}jhx!cF>zk9j9eW) zlo;8{bGu8^&C^VfSCFn8HmAMxp~YL~#}+}4W8j%+MrX|3MeQCCk!Qr1StZxWGRa$I zy04utatfG3#@+%wBp7zfv`!~!jM)oP?-(h{&F2>V#W*mLpru32k~~IX*BHl?=u6Hr zv>>1;@K%yxPx-CnArnr7Qd_c*83-=NKhdz9(f2&_Hs_FXuH`zLT8u0uY(~jz!DiH+ zVRXC-tk+P^i3ilkVy2g<#JzwZ+Q7jkJ9AkMv`AO;gy(6nN@S8`G&%4n`+b>Gae$am zU)uR-x<9*jT772cE$qm`d{$v2%fMMP9rgL0w@jln66uC`#V1)Fs0dNTb-AI1I5Ek|f5P$`pXsBS+kN 
zCPg}U7W33;WIBx;^nK-?RB`5A6Ft?jQM)|0K%$Um)^t=v!dWAgDP(lII0sEHKQJo} zGf%LIk$iicBRtp_JvpngMn_iX*vzcKH(YC`+7WA-oVIh_b-gFe2&|hk2~)K$oaoWt?sh`g&ly;Jl=N!;xwwrC`XOevTR0&u6j(4oO^Mz!{_3p2jh9Qdlx<+ z5>Ik27C|22^B}@M#*xh<+xa5@BZNULV8P2nOi4}3^TI3;WI5zq;KXSe&r#mS(PR!1 z#tZ79&9JA>Obn&e-mN`VahR~o9{Ri(9Ez!j2Z$)4p%_A+^@sW#$t6}z4;3&9htr-@ z^1yQ%v__dVt;mzgoT{A_z1y7t42&}}f%VfPx+G|hHVF~gm)e7?PxQbRjw|EmaH)~B z5sS~kjwFtMB3yPF(gAy|)fc}CWQTNzQ5;Vy;`Cl7(e<04YDimb{H&2V0p4~GucM7hZS zF;L7B9bz0N?PU?AObo%vjpXp<b_7?z88o8r}j4irmPaFQ1?4#5`A9;=63 z*nX%pE|<4ba&cH5EyJ4flZ5AXBanOQ!*0?JUA1na6M3y3)ExEv2I-J~-*AIr$T&J}$l>oKG#i{3n)R%E@R{4L8BHk{r$@$66q z;vFO@rAcmzcR6CGU$mZ)WhGEdV#)FpTpq_N^EbKp#R`ZQeV^mM$*<%Ff!zExNiAj%9}c>kHr8nIHKbPb8Di8v^Y^7n`Xr|T`V6hV|S9LOf>Od zE*#omrbZSf#|gCAYW#Nq$W+dY^^WC4Jf1UCUo6{|2gb$V1Jk0n zyFjF$oPVGuj{Rx|vBn;&iNRmR5Wj_z4w8pgkJpapDe1NA@-%S&(lq6v9Z9yetqOMx za-&Dn^|v=(Eq4#?yYgD{!IuKz`vU$b=fv)_uPUh%dy4`AhA0fk*`OORb%%XWJhLm1 zomdRjHOrDa&^k;5vC3(^&bZjEPRz=?; z&`nN_NQ*#QeT(4Tw+JlcltoZmWuq(>A$*Gv%Xwc#O)NsWBG$y4#ak=8eP3k{d~>il z?13lQTZn6|5)MROTG>Y;cg6K@d&d=*zxCR;<*yvMEWAO4o4Nu{7hQJXox_)1b=A_x<~@IYS>ftyF1zl))$bJH5RLVVuYc#Y`2+8~vWxMeENH&u zz&o$_lk2a&?)(GSoGdd>mR(%9^2#etmNv;!YV}_>@`pveg;PZ#S<7#CqMk}8uN2Cf zYmt!bId%QWfh!JN_byrS56e!}lk_&HuetKtYp=iNO6r{;>tqSFmSE|xz3S@guQ(0R zQo(6Z8YPoXLGa)F){+N?w&oR^yp3r@ztd#AclP2ol?mqaF55@p0vAf-@U^LT)4|o zTp%nxoNCl1tjHO9yVUU6=CILe+j(YEH~^O^0>)bS+g4m85Kgm)rRIMfYJ1k`uDqlB*n$570ws@~Y4CS6ZLyUMg*< z``!;I$Z-pm<=>f_@#<2lNX=M+dAdC>Aq2=Do83n~EJ41_2Pi?VJ@F(}+MuP!}4>QYLij6)IMIlSV~mmtnIw0*Fl`y1nd?t8}r7#GPgR&h1fp`Lm< z0pSFFC;a~@gzh%FKlgEokT1kzr@P1h)M=hT=!^)m1oZKSwwoHde{pZ1`^dfj2~Obn XB Date: Sun, 28 Sep 2025 23:52:17 +0800 Subject: [PATCH 3/5] Fix parquet.hexpat boolean parsing --- patterns/parquet.hexpat | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/patterns/parquet.hexpat b/patterns/parquet.hexpat index dd8a0a27..d71400e5 100644 --- a/patterns/parquet.hexpat +++ b/patterns/parquet.hexpat @@ -215,10 +215,10 @@ struct CompactList { match (element_type) 
{ (TCompactType::CT_BOOLEAN_TRUE): { - bool value[size]; + bool value = true [[export]]; } (TCompactType::CT_BOOLEAN_FALSE): { - bool value[size]; + bool value = false [[export]]; } (TCompactType::CT_BYTE): { s8 value[size]; @@ -734,4 +734,4 @@ struct ParquetFile { s16 last_field_id_stack_view[last_field_id_stack_size_max] @ 0x0 in last_field_id_stack; }; ParquetFile parquet_file @ 0x0; -//std::print("{}", parquet_file); \ No newline at end of file +//std::print("{}", parquet_file); From f721607e54f2418881375952f7c1fc11daa37d70 Mon Sep 17 00:00:00 2001 From: furidosu <86096478+furidosu@users.noreply.github.com> Date: Mon, 29 Sep 2025 00:22:38 +0800 Subject: [PATCH 4/5] Fix parquet.hexpat ColumnChunkPlacer not placing last chunk --- patterns/parquet.hexpat | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/patterns/parquet.hexpat b/patterns/parquet.hexpat index d71400e5..43cc5e27 100644 --- a/patterns/parquet.hexpat +++ b/patterns/parquet.hexpat @@ -714,10 +714,18 @@ struct ColumnChunk { }; struct ColumnChunkPlacer { - s64 column_offset_list_cur @ std::core::array_index() * sizeof(s64) in column_offset_list; - s64 column_offset_list_next @ (std::core::array_index() + 1) * sizeof(s64) in column_offset_list; + auto i = std::core::array_index(); + s64 column_offset_list_cur @ i * sizeof(s64) in column_offset_list; + + //std::print("i + 1: {}, size: {}", i + 1, column_offset_list_size); + if ((i + 1) < column_offset_list_size) { + s64 column_offset_list_next @ (i + 1) * sizeof(s64) in column_offset_list; + } else { + s64 column_offset_list_next = parent.footer_begin; + } + ColumnChunk column_chunk @ column_offset_list_cur; -} [[inline]]; +}; struct ParquetFile { char header_magic[4]; From caaf593e4ce1705a85b1001da77c72dcc65bbd50 Mon Sep 17 00:00:00 2001 From: furidosu <86096478+furidosu@users.noreply.github.com> Date: Sun, 5 Oct 2025 01:54:00 +0800 Subject: [PATCH 5/5] Fix parquet.hexpat using VarInt = LEB128 --- patterns/parquet.hexpat | 
52 +++++++---------------------------------- 1 file changed, 8 insertions(+), 44 deletions(-) diff --git a/patterns/parquet.hexpat b/patterns/parquet.hexpat index 43cc5e27..6306c728 100644 --- a/patterns/parquet.hexpat +++ b/patterns/parquet.hexpat @@ -20,6 +20,7 @@ https://raw.githubusercontent.com/apache/parquet-format/refs/heads/master/src/ma import std.mem; import std.sys; import std.core; +import type.leb128; s16 last_field_id = 0; std::mem::Section last_field_id_stack = @@ -75,43 +76,7 @@ enum TCompactType : u8 { CT_EXTENDED = 0x0F }; -// Variable-length integer structures that advance cursor properly -struct VarInt { - u8 bytes[while(std::mem::read_unsigned($,1) & 0x80 != 0)]; - u8 final_byte; - - u64 value = decode_varint(addressof(bytes)) [[export]]; -} [[format("varint_format")]]; - -fn varint_format(VarInt vi) { - return std::format("VarInt: {}", vi.value); -}; - -// Variable-length integer decoding (ZigZag + VarInt) -fn decode_varint(auto ptr) { - u64 result = 0; - u8 shift = 0; - u8 byte_val; - - while (true) { - byte_val = std::mem::read_unsigned(ptr, 1); - ptr = ptr + 1; - - result |= (u64(byte_val & 0x7F) << shift); - shift += 7; - - if ((byte_val & 0x80) == 0) { - break; - } - - if (shift >= 64) { - std::error("VarInt too large"); - break; - } - } - - return result; -}; +using VarInt = type::LEB128; // ZigZag decode for signed integers fn zigzag_decode_32(u32 n) { @@ -138,7 +103,7 @@ struct ThriftFieldHeader { // If delta is 0, field ID follows as varint if (field_id_delta == 0) { VarInt field_id_varint; - s16 field_id = s16(zigzag_decode_32(u32(field_id_varint.value))) [[export]]; + s16 field_id = s16(zigzag_decode_32(u32(field_id_varint))) [[export]]; last_field_id = field_id; } else { // Field ID is previous_field_id + delta @@ -160,8 +125,7 @@ fn field_header_format(ThriftFieldHeader header) { // Variable-length string/binary struct CompactBinary { - VarInt length_varint; - u32 length = u32(length_varint.value) [[export]]; + VarInt length; char 
data[length]; } [[format("compact_binary_format")]]; @@ -172,7 +136,7 @@ fn compact_binary_format(CompactBinary bin) { // Variable-length integer types struct CompactI32 { VarInt raw_varint; - s32 value = zigzag_decode_32(u32(raw_varint.value)) [[export]]; + s32 value = zigzag_decode_32(u32(raw_varint)) [[export]]; } [[format("compact_i32_format")]]; fn compact_i32_format(CompactI32 val) { @@ -181,7 +145,7 @@ fn compact_i32_format(CompactI32 val) { struct CompactI64 { VarInt raw_varint; - s64 value = zigzag_decode_64(raw_varint.value) [[export]]; + s64 value = zigzag_decode_64(raw_varint) [[export]]; } [[format("compact_i64_format")]]; fn compact_i64_format(CompactI64 val) { @@ -208,7 +172,7 @@ struct CompactList { u32 size = 0 [[export]]; if (size_info == 0x0F) { VarInt size_varint; - size = u32(size_varint.value); + size = u32(size_varint); } else { size = size_info; } @@ -724,7 +688,7 @@ struct ColumnChunkPlacer { s64 column_offset_list_next = parent.footer_begin; } - ColumnChunk column_chunk @ column_offset_list_cur; + ColumnChunk column_chunk @ column_offset_list_cur; }; struct ParquetFile {