
Commit 26f305f

ZENOTME committed
Add DataFileBuilder and remove **WriteResult trait
1 parent 982b54b

File tree

3 files changed: +24 additions, -32 deletions

crates/iceberg/src/spec/manifest.rs

Lines changed: 15 additions & 2 deletions
```diff
@@ -32,7 +32,6 @@ use serde_json::to_vec;
 use std::cmp::min;
 use std::collections::HashMap;
 use std::str::FromStr;
-
 /// A manifest contains metadata and a list of entries.
 #[derive(Debug, PartialEq, Eq, Clone)]
 pub struct Manifest {
@@ -851,7 +850,11 @@ impl TryFrom<i32> for ManifestStatus {
 }
 
 /// Data file carries data file path, partition tuple, metrics, …
-#[derive(Debug, PartialEq, Clone, Eq)]
+#[derive(Debug, PartialEq, Clone, Eq, Builder)]
+/// For optional fields, we use `#[builder(default)]` or `#[builder(setter(strip_option), default)]` so that
+/// the field is set to its `Default` value when it is not set explicitly.
+/// For required fields, the build will fail if they are not set.
+#[builder(name = "DataFileBuilder", setter(prefix = "with"))]
 pub struct DataFile {
     /// field id: 134
     ///
@@ -886,25 +889,29 @@ pub struct DataFile {
     /// Map from column id to the total size on disk of all regions that
     /// store the column. Does not include bytes necessary to read other
     /// columns, like footers. Leave null for row-oriented formats (Avro)
+    #[builder(default)]
     column_sizes: HashMap<i32, u64>,
     /// field id: 109
     /// key field id: 119
     /// value field id: 120
     ///
     /// Map from column id to number of values in the column (including null
     /// and NaN values)
+    #[builder(default)]
     value_counts: HashMap<i32, u64>,
     /// field id: 110
     /// key field id: 121
     /// value field id: 122
     ///
     /// Map from column id to number of null values in the column
+    #[builder(default)]
     null_value_counts: HashMap<i32, u64>,
     /// field id: 137
     /// key field id: 138
     /// value field id: 139
     ///
     /// Map from column id to number of NaN values in the column
+    #[builder(default)]
     nan_value_counts: HashMap<i32, u64>,
     /// field id: 125
     /// key field id: 126
@@ -917,6 +924,7 @@ pub struct DataFile {
     /// Reference:
     ///
     /// - [Binary single-value serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization)
+    #[builder(default)]
     lower_bounds: HashMap<i32, Literal>,
     /// field id: 128
     /// key field id: 129
@@ -929,16 +937,19 @@ pub struct DataFile {
     /// Reference:
     ///
     /// - [Binary single-value serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization)
+    #[builder(default)]
     upper_bounds: HashMap<i32, Literal>,
     /// field id: 131
     ///
     /// Implementation-specific key metadata for encryption
+    #[builder(default)]
     key_metadata: Vec<u8>,
     /// field id: 132
     /// element field id: 133
     ///
     /// Split offsets for the data file. For example, all row group offsets
     /// in a Parquet file. Must be sorted ascending
+    #[builder(default)]
     split_offsets: Vec<i64>,
     /// field id: 135
     /// element field id: 136
@@ -947,6 +958,7 @@ pub struct DataFile {
     /// Required when content is EqualityDeletes and should be null
     /// otherwise. Fields with ids listed in this column must be present
     /// in the delete file
+    #[builder(default)]
     equality_ids: Vec<i32>,
     /// field id: 140
     ///
@@ -958,6 +970,7 @@ pub struct DataFile {
     /// sorted by file and position, not a table order, and should set sort
     /// order id to null. Readers must ignore sort order id for position
     /// delete files.
+    #[builder(setter(strip_option), default)]
     sort_order_id: Option<i32>,
 }
```
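A minimal sketch of how the generated builder might be used. The `with_*` prefix and the `#[builder(default)]` / `strip_option` behavior come from the attributes above, but the required field names (`content`, `file_path`, `file_format`, `partition`, `record_count`, `file_size_in_bytes`) and the `DataFileFormat` enum belong to parts of `DataFile` not shown in this hunk, so treat them as assumptions:

```rust
use iceberg::spec::{DataContentType, DataFile, DataFileBuilder, DataFileFormat, Struct};

// Hypothetical usage sketch. Fields annotated with `#[builder(default)]`
// above (column_sizes, value_counts, bounds, ...) can simply be omitted.
fn build_data_file(partition: Struct) -> DataFile {
    DataFileBuilder::default()
        .with_content(DataContentType::Data)
        .with_file_path("s3://bucket/table/data/00000-0.parquet".to_string())
        .with_file_format(DataFileFormat::Parquet)
        .with_partition(partition)
        .with_record_count(1024)
        .with_file_size_in_bytes(4096)
        // `strip_option` lets callers pass the bare value for `Option` fields.
        .with_sort_order_id(0)
        .build()
        // With derive_builder, `build()` returns an error if any required
        // field was never set.
        .expect("all required fields set")
}
```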

crates/iceberg/src/writer/file_writer/mod.rs

Lines changed: 3 additions & 13 deletions
```diff
@@ -17,8 +17,8 @@
 
 //! Iceberg File Writer
 
-use super::{CurrentFileStatus, IcebergWriteResult};
-use crate::Result;
+use super::CurrentFileStatus;
+use crate::{spec::DataFileBuilder, Result};
 use arrow_array::RecordBatch;
 use arrow_schema::SchemaRef;
 
@@ -34,18 +34,8 @@ pub trait FileWriterBuilder: Send + Clone + 'static {
 /// File writer focuses on writing record batches to a physical file format (such as parquet, orc).
 #[async_trait::async_trait]
 pub trait FileWriter: Send + 'static + CurrentFileStatus {
-    /// The associated file write result type.
-    type R: FileWriteResult;
     /// Write record batch to file.
     async fn write(&mut self, batch: &RecordBatch) -> Result<()>;
     /// Close file writer.
-    async fn close(self) -> Result<Vec<Self::R>>;
-}
-
-/// File write result.
-pub trait FileWriteResult: Send + 'static {
-    /// The associated iceberg write result type.
-    type R: IcebergWriteResult;
-    /// Convert to iceberg write result.
-    fn to_iceberg_result(self) -> Self::R;
+    async fn close(self) -> Result<Vec<DataFileBuilder>>;
 }
```
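With the `FileWriteResult` indirection gone, `close` now hands back partially-filled `DataFileBuilder`s directly. A hedged sketch of what an implementor might do; `SketchFileWriter` and its fields are invented for illustration, and the `with_*` field names are assumptions, only the `close` signature comes from the diff:

```rust
use iceberg::spec::DataFileBuilder;
use iceberg::Result;

// Illustrative writer state; these field names are not part of the commit.
struct SketchFileWriter {
    out_path: String,
    written_rows: u64,
    written_bytes: u64,
}

impl SketchFileWriter {
    // The physical writer records only the file-level facts it knows (path,
    // row count, byte size) and leaves table-level fields such as content
    // type and partition for the iceberg-level writer to fill in later.
    async fn close(self) -> Result<Vec<DataFileBuilder>> {
        let mut builder = DataFileBuilder::default();
        builder
            .with_file_path(self.out_path)
            .with_record_count(self.written_rows)
            .with_file_size_in_bytes(self.written_bytes);
        Ok(vec![builder])
    }
}
```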

crates/iceberg/src/writer/mod.rs

Lines changed: 6 additions & 17 deletions
```diff
@@ -48,6 +48,8 @@
 //! iceberg_writer.write(input).await?;
 //!
 //! let write_result = iceberg_writer.flush().await?;
+//!
+//! let data_file = write_result.into_iter().map(|builder| builder.build()).collect::<Vec<_>>();
 //! ```
 //!
 //! # Complex Case 2: Create a fanout partition data file writer using parquet file format.
@@ -65,12 +67,11 @@
 //! iceberg_writer.write(input).await?;
 //!
 //! let write_result = iceberg_writer.flush().await?;
+//!
+//! let data_file = write_result.into_iter().map(|builder| builder.build()).collect::<Vec<_>>();
 //! ```
 
-use crate::{
-    spec::{DataContentType, Struct},
-    Result,
-};
+use crate::{spec::DataFileBuilder, Result};
 use arrow_array::RecordBatch;
 use arrow_schema::SchemaRef;
 
@@ -90,22 +91,10 @@ pub trait IcebergWriterBuilder<I = DefaultInput>: Send + Clone + 'static {
 /// The iceberg writer used to write data to iceberg table.
 #[async_trait::async_trait]
 pub trait IcebergWriter<I = DefaultInput>: Send + 'static {
-    /// The associated write result type.
-    type R: IcebergWriteResult;
     /// Write data to iceberg table.
     async fn write(&mut self, input: I) -> Result<()>;
     /// Flush the writer and return the write result.
-    async fn flush(&mut self) -> Result<Vec<Self::R>>;
-}
-
-/// The write result of iceberg writer.
-pub trait IcebergWriteResult: Send + Sync + 'static {
-    /// Set the content type of the write result.
-    fn set_content(&mut self, content: DataContentType) -> &mut Self;
-    /// Set the equality ids of the write result.
-    fn set_equality_ids(&mut self, equality_ids: Vec<i32>) -> &mut Self;
-    /// Set the partition of the write result.
-    fn set_partition(&mut self, partition_value: Struct) -> &mut Self;
+    async fn flush(&mut self) -> Result<Vec<DataFileBuilder>>;
 }
 
 /// The current file status of iceberg writer. It implement for the writer which write a single
```
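The removed `IcebergWriteResult` setters map directly onto the builder: `set_content`, `set_equality_ids`, and `set_partition` become `with_*` calls on `DataFileBuilder`. A hedged sketch of a caller completing the builders returned by `flush`, assuming an equality-delete writer and that `Struct` implements `Clone`; the `complete` helper and the `with_content` / `with_partition` setter names are invented for illustration (only `equality_ids` is visible in this diff):

```rust
use iceberg::spec::{DataContentType, DataFile, DataFileBuilder, Struct};

// Stamp the table-level fields onto each flushed builder, then build the
// final DataFile values. The file-level fields (path, metrics, ...) were
// already set by the file writer before `flush` handed the builders back.
fn complete(
    builders: Vec<DataFileBuilder>,
    partition: Struct,
    equality_ids: Vec<i32>,
) -> Vec<DataFile> {
    builders
        .into_iter()
        .map(|mut builder| {
            builder
                .with_content(DataContentType::EqualityDeletes)
                .with_equality_ids(equality_ids.clone())
                .with_partition(partition.clone())
                .build()
                .expect("file writer set the file-level fields")
        })
        .collect()
}
```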
