Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions crates/iceberg/src/writer/base_writer/data_file_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use arrow_array::RecordBatch;
use itertools::Itertools;

use crate::Result;
use crate::io::OutputFile;
use crate::spec::{DataContentType, DataFile, Struct};
use crate::writer::file_writer::{FileWriter, FileWriterBuilder};
use crate::writer::{CurrentFileStatus, IcebergWriter, IcebergWriterBuilder};
Expand Down Expand Up @@ -48,9 +49,9 @@ impl<B: FileWriterBuilder> DataFileWriterBuilder<B> {
impl<B: FileWriterBuilder> IcebergWriterBuilder for DataFileWriterBuilder<B> {
type R = DataFileWriter<B>;

async fn build(self) -> Result<Self::R> {
async fn build(self, output_file: OutputFile) -> Result<Self::R> {
Ok(DataFileWriter {
inner_writer: Some(self.inner.clone().build().await?),
inner_writer: Some(self.inner.clone().build(output_file).await?),
partition_value: self.partition_value.unwrap_or(Struct::empty()),
partition_spec_id: self.partition_spec_id,
})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ use parquet::arrow::PARQUET_FIELD_ID_META_KEY;

use crate::arrow::record_batch_projector::RecordBatchProjector;
use crate::arrow::schema_to_arrow_schema;
use crate::io::OutputFile;
use crate::spec::{DataFile, SchemaRef, Struct};
use crate::writer::file_writer::{FileWriter, FileWriterBuilder};
use crate::writer::{IcebergWriter, IcebergWriterBuilder};
Expand Down Expand Up @@ -113,9 +114,9 @@ impl EqualityDeleteWriterConfig {
impl<B: FileWriterBuilder> IcebergWriterBuilder for EqualityDeleteFileWriterBuilder<B> {
type R = EqualityDeleteFileWriter<B>;

async fn build(self) -> Result<Self::R> {
async fn build(self, output_file: OutputFile) -> Result<Self::R> {
Ok(EqualityDeleteFileWriter {
inner_writer: Some(self.inner.clone().build().await?),
inner_writer: Some(self.inner.clone().build(output_file).await?),
projector: self.config.projector,
equality_ids: self.config.equality_ids,
partition_value: self.config.partition_value,
Expand Down
5 changes: 3 additions & 2 deletions crates/iceberg/src/writer/file_writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use futures::Future;

use super::CurrentFileStatus;
use crate::Result;
use crate::io::OutputFile;
use crate::spec::DataFileBuilder;

mod parquet_writer;
Expand All @@ -37,8 +38,8 @@ type DefaultOutput = Vec<DataFileBuilder>;
pub trait FileWriterBuilder<O = DefaultOutput>: Send + Clone + 'static {
/// The associated file writer type.
type R: FileWriter<O>;
/// Build file writer.
fn build(self) -> impl Future<Output = Result<Self::R>> + Send;
/// Build file writer with the provided output file.
fn build(self, output_file: OutputFile) -> impl Future<Output = Result<Self::R>> + Send;
}

/// File writer focused on writing record batches to different physical file formats (such as Parquet or ORC).
Expand Down
68 changes: 16 additions & 52 deletions crates/iceberg/src/writer/file_writer/parquet_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ use parquet::format::FileMetaData;
use parquet::thrift::{TCompactOutputProtocol, TSerializable};
use thrift::protocol::TOutputProtocol;

use super::location_generator::{FileNameGenerator, LocationGenerator};
use super::{FileWriter, FileWriterBuilder};
use crate::arrow::{
ArrowFileReader, DEFAULT_MAP_FIELD_NAME, FieldMatchMode, NanValueCountVisitor,
Expand All @@ -43,87 +42,52 @@ use crate::arrow::{
use crate::io::{FileIO, FileWrite, OutputFile};
use crate::spec::{
DataContentType, DataFileBuilder, DataFileFormat, Datum, ListType, Literal, MapType,
NestedFieldRef, PartitionKey, PartitionSpec, PrimitiveType, Schema, SchemaRef, SchemaVisitor,
Struct, StructType, TableMetadata, Type, visit_schema,
NestedFieldRef, PartitionSpec, PrimitiveType, Schema, SchemaRef, SchemaVisitor, Struct,
StructType, TableMetadata, Type, visit_schema,
};
use crate::transform::create_transform_function;
use crate::writer::{CurrentFileStatus, DataFile};
use crate::{Error, ErrorKind, Result};

/// `ParquetWriterBuilder` is used to build a [`ParquetWriter`].
#[derive(Clone, Debug)]
pub struct ParquetWriterBuilder<T: LocationGenerator, F: FileNameGenerator> {
pub struct ParquetWriterBuilder {
props: WriterProperties,
schema: SchemaRef,
partition_key: Option<PartitionKey>,
match_mode: FieldMatchMode,

file_io: FileIO,
location_generator: T,
file_name_generator: F,
}

impl<T: LocationGenerator, F: FileNameGenerator> ParquetWriterBuilder<T, F> {
impl ParquetWriterBuilder {
/// Create a new `ParquetWriterBuilder`
/// To construct the write result, the schema should contain the `PARQUET_FIELD_ID_META_KEY` metadata for each field.
pub fn new(
props: WriterProperties,
schema: SchemaRef,
partition_key: Option<PartitionKey>,
file_io: FileIO,
location_generator: T,
file_name_generator: F,
) -> Self {
Self::new_with_match_mode(
props,
schema,
partition_key,
FieldMatchMode::Id,
file_io,
location_generator,
file_name_generator,
)
pub fn new(props: WriterProperties, schema: SchemaRef) -> Self {
Self::new_with_match_mode(props, schema, FieldMatchMode::Id)
}

/// Create a new `ParquetWriterBuilder` with custom match mode
pub fn new_with_match_mode(
props: WriterProperties,
schema: SchemaRef,
partition_key: Option<PartitionKey>,
match_mode: FieldMatchMode,
file_io: FileIO,
location_generator: T,
file_name_generator: F,
) -> Self {
Self {
props,
schema,
partition_key,
match_mode,
file_io,
location_generator,
file_name_generator,
}
}
}

impl<T: LocationGenerator, F: FileNameGenerator> FileWriterBuilder for ParquetWriterBuilder<T, F> {
impl FileWriterBuilder for ParquetWriterBuilder {
type R = ParquetWriter;

async fn build(self) -> Result<Self::R> {
let out_file = self
.file_io
.new_output(self.location_generator.generate_location(
self.partition_key.as_ref(),
&self.file_name_generator.generate_file_name(),
))?;

async fn build(self, output_file: OutputFile) -> Result<Self::R> {
Ok(ParquetWriter {
schema: self.schema.clone(),
inner_writer: None,
writer_properties: self.props,
current_row_num: 0,
out_file,
output_file,
nan_value_count_visitor: NanValueCountVisitor::new_with_match_mode(self.match_mode),
})
}
Expand Down Expand Up @@ -250,7 +214,7 @@ impl SchemaVisitor for IndexByParquetPathName {
/// `ParquetWriter` is used to write Arrow data into a Parquet file on storage.
pub struct ParquetWriter {
schema: SchemaRef,
out_file: OutputFile,
output_file: OutputFile,
inner_writer: Option<AsyncArrowWriter<AsyncFileWriter<Box<dyn FileWrite>>>>,
writer_properties: WriterProperties,
current_row_num: usize,
Expand Down Expand Up @@ -555,7 +519,7 @@ impl FileWriter for ParquetWriter {
writer
} else {
let arrow_schema: ArrowSchemaRef = Arc::new(self.schema.as_ref().try_into()?);
let inner_writer = self.out_file.writer().await?;
let inner_writer = self.output_file.writer().await?;
let async_writer = AsyncFileWriter::new(inner_writer);
let writer = AsyncArrowWriter::try_new(
async_writer,
Expand Down Expand Up @@ -594,7 +558,7 @@ impl FileWriter for ParquetWriter {
let written_size = writer.bytes_written();

if self.current_row_num == 0 {
self.out_file.delete().await.map_err(|err| {
self.output_file.delete().await.map_err(|err| {
Error::new(
ErrorKind::Unexpected,
"Failed to delete empty parquet file.",
Expand All @@ -616,7 +580,7 @@ impl FileWriter for ParquetWriter {
self.schema,
parquet_metadata,
written_size,
self.out_file.location().to_string(),
self.output_file.location().to_string(),
self.nan_value_count_visitor.nan_value_counts,
)?])
}
Expand All @@ -625,7 +589,7 @@ impl FileWriter for ParquetWriter {

impl CurrentFileStatus for ParquetWriter {
fn current_file_path(&self) -> String {
self.out_file.location().to_string()
self.output_file.location().to_string()
}

fn current_row_num(&self) -> usize {
Expand Down Expand Up @@ -1681,7 +1645,7 @@ mod tests {
.build()
.await?;
pw.write(&to_write).await?;
let file_path = pw.out_file.location().to_string();
let file_path = pw.output_file.location().to_string();
pw.close().await.unwrap();
assert!(file_io.exists(file_path).await.unwrap());

Expand All @@ -1698,7 +1662,7 @@ mod tests {
)
.build()
.await?;
let file_path = pw.out_file.location().to_string();
let file_path = pw.output_file.location().to_string();
pw.close().await.unwrap();
assert!(!file_io.exists(file_path).await.unwrap());

Expand Down
Loading
Loading