@@ -830,16 +830,6 @@ pub trait ObjectStorage: Debug + Send + Sync + 'static {
830
830
let custom_partition = stream. get_custom_partition ( ) ;
831
831
let schema = stream. get_schema ( ) ;
832
832
for path in stream. parquet_files ( ) {
833
- if stream. get_stream_type ( ) != StreamType :: Internal
834
- && PARSEABLE . options . collect_dataset_stats
835
- {
836
- if let Err ( err) = calculate_field_stats ( stream_name, & path, & schema) . await {
837
- warn ! (
838
- "Error calculating field stats for stream {}: {}" ,
839
- stream_name, err
840
- ) ;
841
- }
842
- }
843
833
let filename = path
844
834
. file_name ( )
845
835
. expect ( "only parquet files are returned by iterator" )
@@ -889,6 +879,18 @@ pub trait ObjectStorage: Debug + Send + Sync + 'static {
889
879
let manifest = catalog:: create_from_parquet_file ( absolute_path. clone ( ) , & path) ?;
890
880
catalog:: update_snapshot ( store, stream_name, manifest) . await ?;
891
881
882
+ // If the stream is not internal and stats collection is enabled, calculate field stats
883
+ // before removing the parquet file
884
+ if stream. get_stream_type ( ) != StreamType :: Internal
885
+ && PARSEABLE . options . collect_dataset_stats
886
+ {
887
+ if let Err ( err) = calculate_field_stats ( stream_name, & path, & schema) . await {
888
+ warn ! (
889
+ "Error calculating field stats for stream {}: {}" ,
890
+ stream_name, err
891
+ ) ;
892
+ }
893
+ }
892
894
if let Err ( e) = remove_file ( path) {
893
895
warn ! ( "Failed to remove staged file: {e}" ) ;
894
896
}
@@ -968,7 +970,13 @@ async fn calculate_field_stats(
968
970
)
969
971
. await ?;
970
972
let ctx = SessionContext :: new_with_state ( QUERY_SESSION_STATE . clone ( ) ) ;
971
- let ctx_table_name = format ! ( "{}_{}" , stream_name, parquet_path. display( ) ) ;
973
+ let parquet_file_name = parquet_path
974
+ . file_name ( )
975
+ . expect ( "only parquet files are returned by iterator" )
976
+ . to_str ( )
977
+ . expect ( "filename is valid string" ) ;
978
+ let parquet_file_name = str:: replace ( parquet_file_name, "." , "_" ) ;
979
+ let ctx_table_name = format ! ( "{}_{}" , stream_name, parquet_file_name) ;
972
980
ctx. register_parquet (
973
981
& ctx_table_name,
974
982
parquet_path. to_str ( ) . expect ( "valid path" ) ,
@@ -1062,81 +1070,118 @@ async fn calculate_single_field_stats(
1062
1070
/// This is used for fetching record count for a field and distinct count.
1063
1071
async fn query_single_i64 ( ctx : & SessionContext , sql : & str ) -> Option < i64 > {
1064
1072
let df = ctx. sql ( sql) . await . ok ( ) ?;
1065
- let mut stream = df. execute_stream ( ) . await . ok ( ) ?;
1066
- let mut count = 0 ;
1067
- while let Some ( batch_result) = stream. next ( ) . await {
1068
- let batch = batch_result. ok ( ) ?;
1069
- if batch. num_rows ( ) == 0 {
1070
- return None ;
1071
- }
1072
- let array = batch. column ( 0 ) . as_any ( ) . downcast_ref :: < Int64Array > ( ) ?;
1073
- count += array. value ( 0 ) ;
1073
+ let batches = df. collect ( ) . await . ok ( ) ?;
1074
+ let batch = batches. first ( ) ?;
1075
+ if batch. num_rows ( ) == 0 {
1076
+ return None ;
1074
1077
}
1075
- Some ( count)
1078
+ let array = batch. column ( 0 ) . as_any ( ) . downcast_ref :: < Int64Array > ( ) ?;
1079
+
1080
+ Some ( array. value ( 0 ) )
1081
+ }
1082
+
1083
/// Downcasts an Arrow array to the concrete array type `$ty` and applies
/// `$body` to the downcast reference.
///
/// * `$ty`   - concrete array type to downcast to (for example `StringArray`).
/// * `$arr`  - expression yielding the array; it is evaluated exactly once.
/// * `$body` - callable expression invoked with `&$ty`; it must evaluate to a `String`.
///
/// When the downcast fails, a warning naming the target type and the array's
/// reported data type is logged and the string `"UNSUPPORTED"` is produced.
macro_rules! try_downcast {
    ($ty:ty, $arr:expr, $body:expr) => {{
        // Bind once so the array expression is not re-evaluated in each use below.
        let source = $arr;
        if let Some(arr) = source.as_any().downcast_ref::<$ty>() {
            $body(arr)
        } else {
            // Only the reported data type is known here; the concrete array
            // type behind the trait object is not, so it is not claimed.
            warn!(
                "Expected {} for {:?}, but downcast failed",
                stringify!($ty),
                source.data_type()
            );
            "UNSUPPORTED".to_string()
        }
    }};
}
1077
1098
1078
- /// Helper function to format an Arrow value at a given index into a string.
1079
- /// Handles null values and different data types like String, Int64, Float64, Timestamp, Date32, and Boolean .
1099
+ /// Function to format an Arrow value at a given index into a string.
1100
+ /// Handles null values and different data types by downcasting the array to the appropriate type .
1080
1101
fn format_arrow_value ( array : & dyn Array , idx : usize ) -> String {
1081
1102
if array. is_null ( idx) {
1082
1103
return "NULL" . to_string ( ) ;
1083
1104
}
1084
1105
1085
1106
match array. data_type ( ) {
1086
- DataType :: Utf8 => array
1087
- . as_any ( )
1088
- . downcast_ref :: < StringArray > ( )
1089
- . unwrap ( )
1107
+ DataType :: Utf8 => try_downcast ! ( StringArray , array, |arr: & StringArray | arr
1090
1108
. value( idx)
1091
- . to_string ( ) ,
1092
- DataType :: Utf8View => array
1093
- . as_any ( )
1094
- . downcast_ref :: < StringViewArray > ( )
1095
- . unwrap ( )
1109
+ . to_string( ) ) ,
1110
+ DataType :: Utf8View => try_downcast ! ( StringViewArray , array, |arr: & StringViewArray | arr
1096
1111
. value( idx)
1097
- . to_string ( ) ,
1098
- DataType :: Binary => {
1099
- let arr = array. as_any ( ) . downcast_ref :: < BinaryArray > ( ) . unwrap ( ) ;
1112
+ . to_string( ) ) ,
1113
+ DataType :: Binary => try_downcast ! ( BinaryArray , array, |arr: & BinaryArray | {
1100
1114
String :: from_utf8_lossy( arr. value( idx) ) . to_string( )
1101
- }
1102
- DataType :: BinaryView => {
1103
- let arr = array. as_any ( ) . downcast_ref :: < BinaryViewArray > ( ) . unwrap ( ) ;
1115
+ } ) ,
1116
+ DataType :: BinaryView => try_downcast ! ( BinaryViewArray , array, |arr: & BinaryViewArray | {
1104
1117
String :: from_utf8_lossy( arr. value( idx) ) . to_string( )
1105
- }
1106
- DataType :: Int64 => array
1107
- . as_any ( )
1108
- . downcast_ref :: < Int64Array > ( )
1109
- . unwrap ( )
1118
+ } ) ,
1119
+ DataType :: Int64 => try_downcast ! ( Int64Array , array, |arr: & Int64Array | arr
1110
1120
. value( idx)
1111
- . to_string ( ) ,
1112
- DataType :: Float64 => array
1113
- . as_any ( )
1114
- . downcast_ref :: < Float64Array > ( )
1115
- . unwrap ( )
1121
+ . to_string( ) ) ,
1122
+ DataType :: Int32 => try_downcast ! (
1123
+ arrow_array:: Int32Array ,
1124
+ array,
1125
+ |arr: & arrow_array:: Int32Array | arr. value( idx) . to_string( )
1126
+ ) ,
1127
+ DataType :: Int16 => try_downcast ! (
1128
+ arrow_array:: Int16Array ,
1129
+ array,
1130
+ |arr: & arrow_array:: Int16Array | arr. value( idx) . to_string( )
1131
+ ) ,
1132
+ DataType :: Int8 => try_downcast ! (
1133
+ arrow_array:: Int8Array ,
1134
+ array,
1135
+ |arr: & arrow_array:: Int8Array | arr. value( idx) . to_string( )
1136
+ ) ,
1137
+ DataType :: UInt64 => try_downcast ! (
1138
+ arrow_array:: UInt64Array ,
1139
+ array,
1140
+ |arr: & arrow_array:: UInt64Array | arr. value( idx) . to_string( )
1141
+ ) ,
1142
+ DataType :: UInt32 => try_downcast ! (
1143
+ arrow_array:: UInt32Array ,
1144
+ array,
1145
+ |arr: & arrow_array:: UInt32Array | arr. value( idx) . to_string( )
1146
+ ) ,
1147
+ DataType :: UInt16 => try_downcast ! (
1148
+ arrow_array:: UInt16Array ,
1149
+ array,
1150
+ |arr: & arrow_array:: UInt16Array | arr. value( idx) . to_string( )
1151
+ ) ,
1152
+ DataType :: UInt8 => try_downcast ! (
1153
+ arrow_array:: UInt8Array ,
1154
+ array,
1155
+ |arr: & arrow_array:: UInt8Array | arr. value( idx) . to_string( )
1156
+ ) ,
1157
+ DataType :: Float64 => try_downcast ! ( Float64Array , array, |arr: & Float64Array | arr
1116
1158
. value( idx)
1117
- . to_string ( ) ,
1118
- DataType :: Timestamp ( TimeUnit :: Millisecond , _) => {
1119
- let arr = array
1120
- . as_any ( )
1121
- . downcast_ref :: < TimestampMillisecondArray > ( )
1122
- . unwrap ( ) ;
1123
- let timestamp = arr. value ( idx) ;
1124
- DateTime :: from_timestamp_millis ( timestamp)
1125
- . map ( |dt| dt. to_string ( ) )
1126
- . unwrap_or_else ( || "INVALID_TIMESTAMP" . to_string ( ) )
1127
- }
1128
- DataType :: Date32 => array
1129
- . as_any ( )
1130
- . downcast_ref :: < Date32Array > ( )
1131
- . unwrap ( )
1159
+ . to_string( ) ) ,
1160
+ DataType :: Float32 => try_downcast ! (
1161
+ arrow_array:: Float32Array ,
1162
+ array,
1163
+ |arr: & arrow_array:: Float32Array | arr. value( idx) . to_string( )
1164
+ ) ,
1165
+ DataType :: Timestamp ( TimeUnit :: Millisecond , _) => try_downcast ! (
1166
+ TimestampMillisecondArray ,
1167
+ array,
1168
+ |arr: & TimestampMillisecondArray | {
1169
+ let timestamp = arr. value( idx) ;
1170
+ chrono:: DateTime :: from_timestamp_millis( timestamp)
1171
+ . map( |dt| dt. to_string( ) )
1172
+ . unwrap_or_else( || "INVALID_TIMESTAMP" . to_string( ) )
1173
+ }
1174
+ ) ,
1175
+ DataType :: Date32 => try_downcast ! ( Date32Array , array, |arr: & Date32Array | arr
1132
1176
. value( idx)
1133
- . to_string ( ) ,
1134
- DataType :: Boolean => array
1135
- . as_any ( )
1136
- . downcast_ref :: < BooleanArray > ( )
1137
- . unwrap ( )
1177
+ . to_string( ) ) ,
1178
+ DataType :: Boolean => try_downcast ! ( BooleanArray , array, |arr: & BooleanArray | if arr
1138
1179
. value( idx)
1139
- . to_string ( ) ,
1180
+ {
1181
+ "true" . to_string( )
1182
+ } else {
1183
+ "false" . to_string( )
1184
+ } ) ,
1140
1185
DataType :: Null => "NULL" . to_string ( ) ,
1141
1186
_ => {
1142
1187
warn ! (
@@ -1162,20 +1207,20 @@ async fn query_distinct_stats(
1162
1207
) ;
1163
1208
let mut distinct_stats = Vec :: new ( ) ;
1164
1209
if let Ok ( df) = ctx. sql ( & sql) . await {
1165
- if let Ok ( batches ) = df. collect ( ) . await {
1166
- for rb in batches {
1167
- let Some ( counts ) = rb . column ( 0 ) . as_any ( ) . downcast_ref :: < Int64Array > ( ) else {
1168
- warn ! ( "Unexpected type for count column in stats query" ) ;
1169
- continue ;
1170
- } ;
1171
- let values = rb . column ( 1 ) . as_ref ( ) ;
1172
- for i in 0 .. rb. num_rows ( ) {
1173
- let value = format_arrow_value ( values , i ) ;
1174
- distinct_stats . push ( DistinctStat {
1175
- distinct_value : value ,
1176
- count : counts . value ( i ) ,
1177
- } ) ;
1178
- }
1210
+ let mut stream = df. execute_stream ( ) . await . expect ( "Failed to execute stream" ) ;
1211
+ while let Some ( batch_result ) = stream . next ( ) . await {
1212
+ let rb = batch_result . expect ( "Failed to execute stream" ) ;
1213
+ let Some ( counts ) = rb . column ( 0 ) . as_any ( ) . downcast_ref :: < Int64Array > ( ) else {
1214
+ warn ! ( "Unexpected type for count column in stats query" ) ;
1215
+ continue ;
1216
+ } ;
1217
+ let values = rb. column ( 1 ) . as_ref ( ) ;
1218
+ for i in 0 ..rb . num_rows ( ) {
1219
+ let value = format_arrow_value ( values , i ) ;
1220
+ distinct_stats . push ( DistinctStat {
1221
+ distinct_value : value,
1222
+ count : counts . value ( i ) ,
1223
+ } ) ;
1179
1224
}
1180
1225
}
1181
1226
}
0 commit comments