Consolidate Parquet Metadata handling into its own module and struct DFParquetMetadata #17127
Merged: alamb merged 8 commits into apache:main from alamb:alamb/extract_parquet_metadata_handling on Aug 22, 2025

Changes from all commits (8 commits):
ef90d05  Consolidate Parquet Metadata handling (alamb)
8033362  Merge remote-tracking branch 'apache/main' into alamb/extract_parquet… (alamb)
f313e14  Merge remote-tracking branch 'apache/main' into alamb/extract_parquet… (alamb)
e0a49f0  Apply suggestions from code review (alamb)
3f31ad3  Merge branch 'alamb/extract_parquet_metadata_handling' of github.com:… (alamb)
8b402f8  Update datafusion/datasource-parquet/src/metadata.rs (alamb)
f345778  Merge branch 'alamb/extract_parquet_metadata_handling' of github.com:… (alamb)
b0d1e5d  Improved comments (alamb)
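The change at a glance — a minimal sketch pieced together from the calls visible in the hunks below; `ctx`, `store`, `meta`, `schema`, and `size_hint` are the surrounding test fixtures, and the exact signatures live in datafusion/datasource-parquet/src/metadata.rs:

```rust
use datafusion_datasource_parquet::metadata::DFParquetMetadata;

// Builder-style configuration replaces the long positional argument
// lists of the removed free functions (fetch_parquet_metadata,
// fetch_statistics, statistics_from_parquet_meta_calc):
let file_metadata_cache =
    ctx.runtime_env().cache_manager.get_file_metadata_cache();
let df_meta = DFParquetMetadata::new(store.as_ref(), &meta[0])
    .with_metadata_size_hint(Some(size_hint)) // optional prefetch hint
    .with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)));

// Fetch (and possibly cache) the raw parquet metadata...
df_meta.fetch_metadata().await?;
// ...or compute DataFusion statistics for a schema directly.
let stats = df_meta.fetch_statistics(&schema).await?;
```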
```diff
@@ -133,8 +133,7 @@ mod tests {
     use datafusion_datasource::file_sink_config::{FileSink, FileSinkConfig};
     use datafusion_datasource::{ListingTableUrl, PartitionedFile};
     use datafusion_datasource_parquet::{
-        fetch_parquet_metadata, fetch_statistics, statistics_from_parquet_meta_calc,
-        ObjectStoreFetch, ParquetFormat, ParquetFormatFactory, ParquetSink,
+        ParquetFormat, ParquetFormatFactory, ParquetSink,
     };
     use datafusion_execution::object_store::ObjectStoreUrl;
     use datafusion_execution::runtime_env::RuntimeEnv;
@@ -143,13 +142,15 @@ mod tests {
     use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
     use datafusion_physical_plan::{collect, ExecutionPlan};
 
+    use crate::test_util::bounded_stream;
     use arrow::array::{
         types::Int32Type, Array, ArrayRef, DictionaryArray, Int32Array, Int64Array,
         StringArray,
     };
     use arrow::datatypes::{DataType, Field};
     use async_trait::async_trait;
     use datafusion_datasource::file_groups::FileGroup;
+    use datafusion_datasource_parquet::metadata::DFParquetMetadata;
     use futures::stream::BoxStream;
     use futures::StreamExt;
     use insta::assert_snapshot;
@@ -167,8 +168,6 @@ mod tests {
     use parquet::format::FileMetaData;
     use tokio::fs::File;
 
-    use crate::test_util::bounded_stream;
-
     enum ForceViews {
         Yes,
         No,
```
```diff
@@ -195,31 +194,24 @@ mod tests {
         let format = ParquetFormat::default().with_force_view_types(force_views);
         let schema = format.infer_schema(&ctx, &store, &meta).await?;
 
-        let stats = fetch_statistics(
-            store.as_ref(),
-            schema.clone(),
-            &meta[0],
-            None,
-            None,
-            Some(ctx.runtime_env().cache_manager.get_file_metadata_cache()),
-        )
-        .await?;
+        let file_metadata_cache =
+            ctx.runtime_env().cache_manager.get_file_metadata_cache();
+        let stats = DFParquetMetadata::new(&store, &meta[0])
+            .with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)))
+            .fetch_statistics(&schema)
+            .await?;
 
         assert_eq!(stats.num_rows, Precision::Exact(3));
         let c1_stats = &stats.column_statistics[0];
         let c2_stats = &stats.column_statistics[1];
         assert_eq!(c1_stats.null_count, Precision::Exact(1));
         assert_eq!(c2_stats.null_count, Precision::Exact(3));
 
-        let stats = fetch_statistics(
-            store.as_ref(),
-            schema,
-            &meta[1],
-            None,
-            None,
-            Some(ctx.runtime_env().cache_manager.get_file_metadata_cache()),
-        )
-        .await?;
+        let stats = DFParquetMetadata::new(&store, &meta[1])
+            .with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)))
+            .fetch_statistics(&schema)
+            .await?;
 
         assert_eq!(stats.num_rows, Precision::Exact(3));
         let c1_stats = &stats.column_statistics[0];
         let c2_stats = &stats.column_statistics[1];
```
```diff
@@ -392,51 +384,27 @@ mod tests {
 
         // Use a size hint larger than the parquet footer but smaller than the actual metadata, requiring a second fetch
         // for the remaining metadata
-        fetch_parquet_metadata(
-            ObjectStoreFetch::new(store.as_ref() as &dyn ObjectStore, &meta[0]),
-            &meta[0],
-            Some(9),
-            None,
-            None,
-        )
-        .await
-        .expect("error reading metadata with hint");
+        let file_metadata_cache =
+            ctx.runtime_env().cache_manager.get_file_metadata_cache();
+        let df_meta = DFParquetMetadata::new(store.as_ref(), &meta[0])
+            .with_metadata_size_hint(Some(9));
+        df_meta.fetch_metadata().await?;
         assert_eq!(store.request_count(), 2);
 
+        let df_meta =
+            df_meta.with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)));
+
         // Increases by 3 because cache has no entries yet
-        fetch_parquet_metadata(
-            ObjectStoreFetch::new(store.as_ref() as &dyn ObjectStore, &meta[0]),
-            &meta[0],
-            Some(9),
-            None,
-            Some(ctx.runtime_env().cache_manager.get_file_metadata_cache()),
-        )
-        .await
-        .expect("error reading metadata with hint");
+        df_meta.fetch_metadata().await?;
         assert_eq!(store.request_count(), 5);
 
         // No increase because cache has an entry
-        fetch_parquet_metadata(
-            ObjectStoreFetch::new(store.as_ref() as &dyn ObjectStore, &meta[0]),
-            &meta[0],
-            Some(9),
-            None,
-            Some(ctx.runtime_env().cache_manager.get_file_metadata_cache()),
-        )
-        .await
-        .expect("error reading metadata with hint");
+        df_meta.fetch_metadata().await?;
         assert_eq!(store.request_count(), 5);
 
         // Increase by 2 because `get_file_metadata_cache()` is None
-        fetch_parquet_metadata(
-            ObjectStoreFetch::new(store.as_ref() as &dyn ObjectStore, &meta[0]),
-            &meta[0],
-            Some(9),
-            None,
-            None,
-        )
-        .await
-        .expect("error reading metadata with hint");
+        let df_meta = df_meta.with_file_metadata_cache(None);
+        df_meta.fetch_metadata().await?;
         assert_eq!(store.request_count(), 7);
 
         let force_views = match force_views {
```

Review comment (on this hunk): I think the new struct makes it much clearer what is being tested vs what is test setup functionality, and I find the updated tests much easier to read.
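A note on the request-count assertions above: a parquet file ends with an 8-byte footer (a 4-byte little-endian metadata length followed by the 4-byte `PAR1` magic), so a hint of `Some(9)` covers the footer but almost none of the metadata, which is exactly what forces the second ranged read. A standalone illustration of the arithmetic (the `metadata_len` value here is made up):

```rust
fn main() {
    // Why a 9-byte suffix read needs a follow-up fetch:
    // file layout ends with [metadata][metadata_len: u32 LE]["PAR1"]
    let footer_len = 8u64; // 4-byte length field + 4-byte magic
    let size_hint = 9u64; // first read: footer + 1 byte of metadata
    let metadata_len = 1024u64; // example value decoded from the footer
    let remaining = metadata_len - (size_hint - footer_len);
    println!("second ranged read of {remaining} bytes required");
}
```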
```diff
@@ -454,15 +422,9 @@ mod tests {
         assert_eq!(store.request_count(), 10);
 
         // No increase, cache being used
-        let stats = fetch_statistics(
-            store.upcast().as_ref(),
-            schema.clone(),
-            &meta[0],
-            Some(9),
-            None,
-            Some(ctx.runtime_env().cache_manager.get_file_metadata_cache()),
-        )
-        .await?;
+        let df_meta =
+            df_meta.with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)));
+        let stats = df_meta.fetch_statistics(&schema).await?;
         assert_eq!(store.request_count(), 10);
 
         assert_eq!(stats.num_rows, Precision::Exact(3));
```
```diff
@@ -477,55 +439,30 @@ mod tests {
 
         // Use the file size as the hint so we can get the full metadata from the first fetch
         let size_hint = meta[0].size as usize;
+        let df_meta = DFParquetMetadata::new(store.as_ref(), &meta[0])
+            .with_metadata_size_hint(Some(size_hint));
 
-        fetch_parquet_metadata(
-            ObjectStoreFetch::new(store.upcast().as_ref(), &meta[0]),
-            &meta[0],
-            Some(size_hint),
-            None,
-            None,
-        )
-        .await
-        .expect("error reading metadata with hint");
+        df_meta.fetch_metadata().await?;
         // ensure the requests were coalesced into a single request
         assert_eq!(store.request_count(), 1);
 
         let session = SessionContext::new();
         let ctx = session.state();
+        let file_metadata_cache =
+            ctx.runtime_env().cache_manager.get_file_metadata_cache();
+        let df_meta =
+            df_meta.with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)));
         // Increases by 1 because cache has no entries yet and new session context
-        fetch_parquet_metadata(
-            ObjectStoreFetch::new(store.upcast().as_ref(), &meta[0]),
-            &meta[0],
-            Some(size_hint),
-            None,
-            Some(ctx.runtime_env().cache_manager.get_file_metadata_cache()),
-        )
-        .await
-        .expect("error reading metadata with hint");
+        df_meta.fetch_metadata().await?;
         assert_eq!(store.request_count(), 2);
 
         // No increase because cache has an entry
-        fetch_parquet_metadata(
-            ObjectStoreFetch::new(store.upcast().as_ref(), &meta[0]),
-            &meta[0],
-            Some(size_hint),
-            None,
-            Some(ctx.runtime_env().cache_manager.get_file_metadata_cache()),
-        )
-        .await
-        .expect("error reading metadata with hint");
+        df_meta.fetch_metadata().await?;
         assert_eq!(store.request_count(), 2);
 
         // Increase by 1 because `get_file_metadata_cache` is None
-        fetch_parquet_metadata(
-            ObjectStoreFetch::new(store.upcast().as_ref(), &meta[0]),
-            &meta[0],
-            Some(size_hint),
-            None,
-            None,
-        )
-        .await
-        .expect("error reading metadata with hint");
+        let df_meta = df_meta.with_file_metadata_cache(None);
+        df_meta.fetch_metadata().await?;
         assert_eq!(store.request_count(), 3);
 
         let format = ParquetFormat::default()
```
```diff
@@ -538,15 +475,9 @@ mod tests {
         let schema = format.infer_schema(&ctx, &store.upcast(), &meta).await?;
         assert_eq!(store.request_count(), 4);
         // No increase, cache being used
-        let stats = fetch_statistics(
-            store.upcast().as_ref(),
-            schema.clone(),
-            &meta[0],
-            Some(size_hint),
-            None,
-            Some(ctx.runtime_env().cache_manager.get_file_metadata_cache()),
-        )
-        .await?;
+        let df_meta =
+            df_meta.with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)));
+        let stats = df_meta.fetch_statistics(&schema).await?;
         assert_eq!(store.request_count(), 4);
 
         assert_eq!(stats.num_rows, Precision::Exact(3));
```
```diff
@@ -559,29 +490,18 @@ mod tests {
             LocalFileSystem::new(),
         )));
 
-        // Use the a size hint larger than the file size to make sure we don't panic
+        // Use a size hint larger than the file size to make sure we don't panic
         let size_hint = (meta[0].size + 100) as usize;
-        fetch_parquet_metadata(
-            ObjectStoreFetch::new(store.upcast().as_ref(), &meta[0]),
-            &meta[0],
-            Some(size_hint),
-            None,
-            None,
-        )
-        .await
-        .expect("error reading metadata with hint");
+        let df_meta = DFParquetMetadata::new(store.as_ref(), &meta[0])
+            .with_metadata_size_hint(Some(size_hint));
+
+        df_meta.fetch_metadata().await?;
         assert_eq!(store.request_count(), 1);
 
         // No increase because cache has an entry
-        fetch_parquet_metadata(
-            ObjectStoreFetch::new(store.upcast().as_ref(), &meta[0]),
-            &meta[0],
-            Some(size_hint),
-            None,
-            Some(ctx.runtime_env().cache_manager.get_file_metadata_cache()),
-        )
-        .await
-        .expect("error reading metadata with hint");
+        let df_meta =
+            df_meta.with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)));
+        df_meta.fetch_metadata().await?;
         assert_eq!(store.request_count(), 1);
 
         Ok(())
```
```diff
@@ -634,16 +554,12 @@ mod tests {
         assert_eq!(store.request_count(), 3);
 
         // No increase in request count because cache is not empty
-        let pq_meta = fetch_parquet_metadata(
-            ObjectStoreFetch::new(store.as_ref(), &files[0]),
-            &files[0],
-            None,
-            None,
-            Some(state.runtime_env().cache_manager.get_file_metadata_cache()),
-        )
-        .await?;
-        assert_eq!(store.request_count(), 3);
-        let stats = statistics_from_parquet_meta_calc(&pq_meta, schema.clone())?;
+        let file_metadata_cache =
+            state.runtime_env().cache_manager.get_file_metadata_cache();
+        let stats = DFParquetMetadata::new(store.as_ref(), &files[0])
+            .with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)))
+            .fetch_statistics(&schema)
+            .await?;
         assert_eq!(stats.num_rows, Precision::Exact(4));
 
         // column c_dic
```
```diff
@@ -716,16 +632,13 @@ mod tests {
         };
 
         // No increase in request count because cache is not empty
-        let pq_meta = fetch_parquet_metadata(
-            ObjectStoreFetch::new(store.as_ref(), &files[0]),
-            &files[0],
-            None,
-            None,
-            Some(state.runtime_env().cache_manager.get_file_metadata_cache()),
-        )
-        .await?;
+        let file_metadata_cache =
+            state.runtime_env().cache_manager.get_file_metadata_cache();
+        let stats = DFParquetMetadata::new(store.as_ref(), &files[0])
+            .with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)))
+            .fetch_statistics(&schema)
+            .await?;
         assert_eq!(store.request_count(), 6);
-        let stats = statistics_from_parquet_meta_calc(&pq_meta, schema.clone())?;
         assert_eq!(stats.num_rows, Precision::Exact(3));
         // column c1
         let c1_stats = &stats.column_statistics[0];
```
```diff
@@ -750,16 +663,11 @@ mod tests {
         assert_eq!(c2_stats.min_value, Precision::Exact(null_i64.clone()));
 
         // No increase in request count because cache is not empty
-        let pq_meta = fetch_parquet_metadata(
-            ObjectStoreFetch::new(store.as_ref(), &files[1]),
-            &files[1],
-            None,
-            None,
-            Some(state.runtime_env().cache_manager.get_file_metadata_cache()),
-        )
-        .await?;
+        let stats = DFParquetMetadata::new(store.as_ref(), &files[1])
+            .with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)))
+            .fetch_statistics(&schema)
+            .await?;
         assert_eq!(store.request_count(), 6);
-        let stats = statistics_from_parquet_meta_calc(&pq_meta, schema.clone())?;
         assert_eq!(stats.num_rows, Precision::Exact(3));
         // column c1: missing from the file so the table treats all 3 rows as null
         let c1_stats = &stats.column_statistics[0];
```
Review comment: this shows the key API difference -- instead of calling a bunch of free functions, you now construct a DFParquetMetadata and call methods on that struct instead.
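The difference in concrete terms, assembled from the hunks above (`store`, `meta`, `schema`, `size_hint`, and `ctx` are the surrounding test fixtures):

```rust
// Before: free functions with long positional argument lists
let pq_meta = fetch_parquet_metadata(
    ObjectStoreFetch::new(store.as_ref(), &meta[0]),
    &meta[0],
    Some(size_hint), // metadata size hint
    None,
    Some(ctx.runtime_env().cache_manager.get_file_metadata_cache()),
)
.await?;
let stats = statistics_from_parquet_meta_calc(&pq_meta, schema.clone())?;

// After: one builder-style struct carries the configuration
let file_metadata_cache =
    ctx.runtime_env().cache_manager.get_file_metadata_cache();
let stats = DFParquetMetadata::new(store.as_ref(), &meta[0])
    .with_metadata_size_hint(Some(size_hint))
    .with_file_metadata_cache(Some(Arc::clone(&file_metadata_cache)))
    .fetch_statistics(&schema)
    .await?;
```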
Reply: It looks way cleaner now.