19
19
20
20
use std:: any:: Any ;
21
21
use std:: cell:: RefCell ;
22
- use std:: fmt;
23
22
use std:: fmt:: Debug ;
24
23
use std:: ops:: Range ;
25
24
use std:: rc:: Rc ;
26
25
use std:: sync:: Arc ;
26
+ use std:: { fmt, vec} ;
27
27
28
- use arrow:: array:: RecordBatch ;
28
+ use arrow:: array:: { ArrayRef , BooleanArray , RecordBatch } ;
29
29
use arrow:: datatypes:: { Fields , Schema , SchemaRef , TimeUnit } ;
30
30
use datafusion_datasource:: file_compression_type:: FileCompressionType ;
31
31
use datafusion_datasource:: file_sink_config:: { FileSink , FileSinkConfig } ;
@@ -36,7 +36,8 @@ use datafusion_datasource::write::{
36
36
use datafusion_datasource:: file_format:: { FileFormat , FileFormatFactory } ;
37
37
use datafusion_datasource:: write:: demux:: DemuxedStreamReceiver ;
38
38
39
- use arrow:: compute:: sum;
39
+ use arrow:: compute:: kernels:: cmp:: eq;
40
+ use arrow:: compute:: { and, sum} ;
40
41
use arrow:: datatypes:: { DataType , Field , FieldRef } ;
41
42
use datafusion_common:: config:: { ConfigField , ConfigFileType , TableParquetOptions } ;
42
43
#[ cfg( feature = "parquet_encryption" ) ]
@@ -46,7 +47,7 @@ use datafusion_common::parsers::CompressionTypeVariant;
46
47
use datafusion_common:: stats:: Precision ;
47
48
use datafusion_common:: {
48
49
internal_datafusion_err, internal_err, not_impl_err, ColumnStatistics ,
49
- DataFusionError , GetExt , HashSet , Result , DEFAULT_PARQUET_EXTENSION ,
50
+ DataFusionError , GetExt , HashSet , Result , ScalarValue , DEFAULT_PARQUET_EXTENSION ,
50
51
} ;
51
52
use datafusion_common:: { HashMap , Statistics } ;
52
53
use datafusion_common_runtime:: { JoinSet , SpawnedTask } ;
@@ -1170,7 +1171,8 @@ pub async fn fetch_statistics(
1170
1171
/// # When only some columns have statistics:
1171
1172
///
1172
1173
/// For columns with statistics:
1173
- /// - Min/max values are properly extracted and represented as Precision::Exact
1174
+ /// - Min/max values are properly extracted and represented as [Precision::Exact] or [Precision::Inexact]
1175
+ /// depending on the `is_max_value_exact` and `is_min_value_exact` flags.
1174
1176
/// - Null counts are calculated by summing across row groups
1175
1177
///
1176
1178
/// For columns without statistics,
@@ -1216,6 +1218,8 @@ pub fn statistics_from_parquet_meta_calc(
1216
1218
let ( mut max_accs, mut min_accs) = create_max_min_accs ( & table_schema) ;
1217
1219
let mut null_counts_array =
1218
1220
vec ! [ Precision :: Exact ( 0 ) ; table_schema. fields( ) . len( ) ] ;
1221
+ let mut is_max_value_exact = vec ! [ Some ( true ) ; table_schema. fields( ) . len( ) ] ;
1222
+ let mut is_min_value_exact = vec ! [ Some ( true ) ; table_schema. fields( ) . len( ) ] ;
1219
1223
1220
1224
table_schema
1221
1225
. fields ( )
@@ -1232,6 +1236,8 @@ pub fn statistics_from_parquet_meta_calc(
1232
1236
& mut min_accs,
1233
1237
& mut max_accs,
1234
1238
& mut null_counts_array,
1239
+ & mut is_min_value_exact,
1240
+ & mut is_max_value_exact,
1235
1241
idx,
1236
1242
num_rows,
1237
1243
& stats_converter,
@@ -1251,6 +1257,8 @@ pub fn statistics_from_parquet_meta_calc(
1251
1257
null_counts_array,
1252
1258
& mut max_accs,
1253
1259
& mut min_accs,
1260
+ & mut is_max_value_exact,
1261
+ & mut is_min_value_exact,
1254
1262
)
1255
1263
} else {
1256
1264
Statistics :: unknown_column ( & table_schema)
@@ -1264,21 +1272,39 @@ fn get_col_stats(
1264
1272
null_counts : Vec < Precision < usize > > ,
1265
1273
max_values : & mut [ Option < MaxAccumulator > ] ,
1266
1274
min_values : & mut [ Option < MinAccumulator > ] ,
1275
+ is_max_value_exact : & mut [ Option < bool > ] ,
1276
+ is_min_value_exact : & mut [ Option < bool > ] ,
1267
1277
) -> Vec < ColumnStatistics > {
1268
1278
( 0 ..schema. fields ( ) . len ( ) )
1269
1279
. map ( |i| {
1270
- let max_value = match max_values. get_mut ( i) . unwrap ( ) {
1271
- Some ( max_value) => max_value. evaluate ( ) . ok ( ) ,
1272
- None => None ,
1280
+ let max_value = match (
1281
+ max_values. get_mut ( i) . unwrap ( ) ,
1282
+ is_max_value_exact. get ( i) . unwrap ( ) ,
1283
+ ) {
1284
+ ( Some ( max_value) , Some ( true ) ) => {
1285
+ max_value. evaluate ( ) . ok ( ) . map ( Precision :: Exact )
1286
+ }
1287
+ ( Some ( max_value) , Some ( false ) ) | ( Some ( max_value) , None ) => {
1288
+ max_value. evaluate ( ) . ok ( ) . map ( Precision :: Inexact )
1289
+ }
1290
+ ( None , _) => None ,
1273
1291
} ;
1274
- let min_value = match min_values. get_mut ( i) . unwrap ( ) {
1275
- Some ( min_value) => min_value. evaluate ( ) . ok ( ) ,
1276
- None => None ,
1292
+ let min_value = match (
1293
+ min_values. get_mut ( i) . unwrap ( ) ,
1294
+ is_min_value_exact. get ( i) . unwrap ( ) ,
1295
+ ) {
1296
+ ( Some ( min_value) , Some ( true ) ) => {
1297
+ min_value. evaluate ( ) . ok ( ) . map ( Precision :: Exact )
1298
+ }
1299
+ ( Some ( min_value) , Some ( false ) ) | ( Some ( min_value) , None ) => {
1300
+ min_value. evaluate ( ) . ok ( ) . map ( Precision :: Inexact )
1301
+ }
1302
+ ( None , _) => None ,
1277
1303
} ;
1278
1304
ColumnStatistics {
1279
1305
null_count : null_counts[ i] ,
1280
- max_value : max_value. map ( Precision :: Exact ) . unwrap_or ( Precision :: Absent ) ,
1281
- min_value : min_value. map ( Precision :: Exact ) . unwrap_or ( Precision :: Absent ) ,
1306
+ max_value : max_value. unwrap_or ( Precision :: Absent ) ,
1307
+ min_value : min_value. unwrap_or ( Precision :: Absent ) ,
1282
1308
sum_value : Precision :: Absent ,
1283
1309
distinct_count : Precision :: Absent ,
1284
1310
}
@@ -1290,6 +1316,8 @@ fn summarize_min_max_null_counts(
1290
1316
min_accs : & mut [ Option < MinAccumulator > ] ,
1291
1317
max_accs : & mut [ Option < MaxAccumulator > ] ,
1292
1318
null_counts_array : & mut [ Precision < usize > ] ,
1319
+ is_min_value_exact : & mut [ Option < bool > ] ,
1320
+ is_max_value_exact : & mut [ Option < bool > ] ,
1293
1321
arrow_schema_index : usize ,
1294
1322
num_rows : usize ,
1295
1323
stats_converter : & StatisticsConverter ,
@@ -1298,13 +1326,29 @@ fn summarize_min_max_null_counts(
1298
1326
let max_values = stats_converter. row_group_maxes ( row_groups_metadata) ?;
1299
1327
let min_values = stats_converter. row_group_mins ( row_groups_metadata) ?;
1300
1328
let null_counts = stats_converter. row_group_null_counts ( row_groups_metadata) ?;
1329
+ let is_max_value_exact_stat =
1330
+ stats_converter. row_group_is_max_value_exact ( row_groups_metadata) ?;
1331
+ let is_min_value_exact_stat =
1332
+ stats_converter. row_group_is_min_value_exact ( row_groups_metadata) ?;
1301
1333
1302
1334
if let Some ( max_acc) = & mut max_accs[ arrow_schema_index] {
1303
- max_acc. update_batch ( & [ max_values] ) ?;
1335
+ max_acc. update_batch ( & [ Arc :: clone ( & max_values) ] ) ?;
1336
+ let mut cur_max_acc = max_acc. clone ( ) ;
1337
+ is_max_value_exact[ arrow_schema_index] = has_any_exact_match (
1338
+ cur_max_acc. evaluate ( ) ?,
1339
+ max_values,
1340
+ is_max_value_exact_stat,
1341
+ ) ;
1304
1342
}
1305
1343
1306
1344
if let Some ( min_acc) = & mut min_accs[ arrow_schema_index] {
1307
- min_acc. update_batch ( & [ min_values] ) ?;
1345
+ min_acc. update_batch ( & [ Arc :: clone ( & min_values) ] ) ?;
1346
+ let mut cur_min_acc = min_acc. clone ( ) ;
1347
+ is_min_value_exact[ arrow_schema_index] = has_any_exact_match (
1348
+ cur_min_acc. evaluate ( ) ?,
1349
+ min_values,
1350
+ is_min_value_exact_stat,
1351
+ ) ;
1308
1352
}
1309
1353
1310
1354
null_counts_array[ arrow_schema_index] = Precision :: Exact ( match sum ( & null_counts) {
@@ -1967,6 +2011,31 @@ fn create_max_min_accs(
1967
2011
( max_values, min_values)
1968
2012
}
1969
2013
2014
+ /// Checks if any occurrence of `value` in `array` corresponds to a `true`
2015
+ /// entry in the `exactness` array.
2016
+ ///
2017
+ /// This is used to determine if a calculated statistic (e.g., min or max)
2018
+ /// is exact, by checking if at least one of its source values was exact.
2019
+ ///
2020
+ /// # Example
2021
+ /// - `value`: `0`
2022
+ /// - `array`: `[0, 1, 0, 3, 0, 5]`
2023
+ /// - `exactness`: `[true, false, false, false, false, false]`
2024
+ ///
2025
+ /// The value `0` appears at indices `[0, 2, 4]`. The corresponding exactness
2026
+ /// values are `[true, false, false]`. Since at least one is `true`, the
2027
+ /// function returns `Some(true)`.
2028
+ fn has_any_exact_match (
2029
+ value : ScalarValue ,
2030
+ array : ArrayRef ,
2031
+ exactness : BooleanArray ,
2032
+ ) -> Option < bool > {
2033
+ let scalar_array = value. to_scalar ( ) . ok ( ) ?;
2034
+ let eq_mask = eq ( & scalar_array, & array) . ok ( ) ?;
2035
+ let combined_mask = and ( & eq_mask, & exactness) . ok ( ) ?;
2036
+ Some ( combined_mask. true_count ( ) > 0 )
2037
+ }
2038
+
1970
2039
#[ cfg( test) ]
1971
2040
mod tests {
1972
2041
use std:: sync:: Arc ;
0 commit comments