@@ -108,10 +108,13 @@ impl<B: FileWriterBuilder> CurrentFileStatus for DataFileWriter<B> {
 
 #[cfg(test)]
 mod test {
-    use std::{collections::HashMap, sync::Arc};
+    use std::sync::Arc;
 
-    use arrow_array::{types::Int64Type, ArrayRef, Int64Array, RecordBatch, StructArray};
-    use parquet::{arrow::PARQUET_FIELD_ID_META_KEY, file::properties::WriterProperties};
+    use crate::{
+        spec::{DataContentType, Schema, Struct},
+        Result,
+    };
+    use parquet::file::properties::WriterProperties;
     use tempfile::TempDir;
 
     use crate::{
@@ -123,195 +126,35 @@ mod test {
                 location_generator::{test::MockLocationGenerator, DefaultFileNameGenerator},
                 ParquetWriterBuilder,
             },
-            tests::check_parquet_data_file,
             IcebergWriter, IcebergWriterBuilder,
         },
     };
 
     #[tokio::test]
-    async fn test_data_file_writer() -> Result<(), anyhow::Error> {
+    async fn test_parquet_writer() -> Result<()> {
         let temp_dir = TempDir::new().unwrap();
         let file_io = FileIOBuilder::new_fs_io().build().unwrap();
         let location_gen =
             MockLocationGenerator::new(temp_dir.path().to_str().unwrap().to_string());
         let file_name_gen =
             DefaultFileNameGenerator::new("test".to_string(), None, DataFileFormat::Parquet);
 
-        // prepare data
-        // Int, Struct(Int), String, List(Int), Struct(Struct(Int))
-        let schema = {
-            let fields = vec![
-                arrow_schema::Field::new("col0", arrow_schema::DataType::Int64, true)
-                    .with_metadata(HashMap::from([(
-                        PARQUET_FIELD_ID_META_KEY.to_string(),
-                        "0".to_string(),
-                    )])),
-                arrow_schema::Field::new(
-                    "col1",
-                    arrow_schema::DataType::Struct(
-                        vec![arrow_schema::Field::new(
-                            "sub_col",
-                            arrow_schema::DataType::Int64,
-                            true,
-                        )
-                        .with_metadata(HashMap::from([(
-                            PARQUET_FIELD_ID_META_KEY.to_string(),
-                            "5".to_string(),
-                        )]))]
-                        .into(),
-                    ),
-                    true,
-                )
-                .with_metadata(HashMap::from([(
-                    PARQUET_FIELD_ID_META_KEY.to_string(),
-                    "1".to_string(),
-                )])),
-                arrow_schema::Field::new("col2", arrow_schema::DataType::Utf8, true).with_metadata(
-                    HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "2".to_string())]),
-                ),
-                arrow_schema::Field::new(
-                    "col3",
-                    arrow_schema::DataType::List(Arc::new(
-                        arrow_schema::Field::new("item", arrow_schema::DataType::Int64, true)
-                            .with_metadata(HashMap::from([(
-                                PARQUET_FIELD_ID_META_KEY.to_string(),
-                                "6".to_string(),
-                            )])),
-                    )),
-                    true,
-                )
-                .with_metadata(HashMap::from([(
-                    PARQUET_FIELD_ID_META_KEY.to_string(),
-                    "3".to_string(),
-                )])),
-                arrow_schema::Field::new(
-                    "col4",
-                    arrow_schema::DataType::Struct(
-                        vec![arrow_schema::Field::new(
-                            "sub_col",
-                            arrow_schema::DataType::Struct(
-                                vec![arrow_schema::Field::new(
-                                    "sub_sub_col",
-                                    arrow_schema::DataType::Int64,
-                                    true,
-                                )
-                                .with_metadata(HashMap::from([(
-                                    PARQUET_FIELD_ID_META_KEY.to_string(),
-                                    "7".to_string(),
-                                )]))]
-                                .into(),
-                            ),
-                            true,
-                        )
-                        .with_metadata(HashMap::from([(
-                            PARQUET_FIELD_ID_META_KEY.to_string(),
-                            "8".to_string(),
-                        )]))]
-                        .into(),
-                    ),
-                    true,
-                )
-                .with_metadata(HashMap::from([(
-                    PARQUET_FIELD_ID_META_KEY.to_string(),
-                    "4".to_string(),
-                )])),
-            ];
-            Arc::new(arrow_schema::Schema::new(fields))
-        };
-        let col0 = Arc::new(Int64Array::from_iter_values(vec![1; 1024])) as ArrayRef;
-        let col1 = Arc::new(StructArray::new(
-            vec![
-                arrow_schema::Field::new("sub_col", arrow_schema::DataType::Int64, true)
-                    .with_metadata(HashMap::from([(
-                        PARQUET_FIELD_ID_META_KEY.to_string(),
-                        "5".to_string(),
-                    )])),
-            ]
-            .into(),
-            vec![Arc::new(Int64Array::from_iter_values(vec![1; 1024]))],
-            None,
-        ));
-        let col2 = Arc::new(arrow_array::StringArray::from_iter_values(vec![
-            "test";
-            1024
-        ])) as ArrayRef;
-        let col3 = Arc::new({
-            let list_parts = arrow_array::ListArray::from_iter_primitive::<Int64Type, _, _>(vec![
-                Some(
-                    vec![Some(1),]
-                );
-                1024
-            ])
-            .into_parts();
-            arrow_array::ListArray::new(
-                Arc::new(list_parts.0.as_ref().clone().with_metadata(HashMap::from([(
-                    PARQUET_FIELD_ID_META_KEY.to_string(),
-                    "6".to_string(),
-                )]))),
-                list_parts.1,
-                list_parts.2,
-                list_parts.3,
-            )
-        }) as ArrayRef;
-        let col4 = Arc::new(StructArray::new(
-            vec![arrow_schema::Field::new(
-                "sub_col",
-                arrow_schema::DataType::Struct(
-                    vec![arrow_schema::Field::new(
-                        "sub_sub_col",
-                        arrow_schema::DataType::Int64,
-                        true,
-                    )
-                    .with_metadata(HashMap::from([(
-                        PARQUET_FIELD_ID_META_KEY.to_string(),
-                        "7".to_string(),
-                    )]))]
-                    .into(),
-                ),
-                true,
-            )
-            .with_metadata(HashMap::from([(
-                PARQUET_FIELD_ID_META_KEY.to_string(),
-                "8".to_string(),
-            )]))]
-            .into(),
-            vec![Arc::new(StructArray::new(
-                vec![
-                    arrow_schema::Field::new("sub_sub_col", arrow_schema::DataType::Int64, true)
-                        .with_metadata(HashMap::from([(
-                            PARQUET_FIELD_ID_META_KEY.to_string(),
-                            "7".to_string(),
-                        )])),
-                ]
-                .into(),
-                vec![Arc::new(Int64Array::from_iter_values(vec![1; 1024]))],
-                None,
-            ))],
-            None,
-        ));
-        let to_write =
-            RecordBatch::try_new(schema.clone(), vec![col0, col1, col2, col3, col4]).unwrap();
-
-        // prepare writer
-        let pb = ParquetWriterBuilder::new(
+        let pw = ParquetWriterBuilder::new(
             WriterProperties::builder().build(),
-            to_write.schema(),
+            Arc::new(Schema::builder().build().unwrap()),
             file_io.clone(),
             location_gen,
             file_name_gen,
         );
-        let mut data_file_writer = DataFileWriterBuilder::new(pb)
+        let mut data_file_writer = DataFileWriterBuilder::new(pw)
             .build(DataFileWriterConfig::new(None))
             .await?;
 
-        // write
-        data_file_writer.write(to_write.clone()).await?;
-        let res = data_file_writer.close().await?;
-        assert_eq!(res.len(), 1);
-        let data_file = res.into_iter().next().unwrap();
-
-        // check
-        check_parquet_data_file(&file_io, &data_file, &to_write).await;
+        let data_file = data_file_writer.close().await.unwrap();
+        assert_eq!(data_file.len(), 1);
+        assert_eq!(data_file[0].file_format, DataFileFormat::Parquet);
+        assert_eq!(data_file[0].content, DataContentType::Data);
+        assert_eq!(data_file[0].partition, Struct::empty());
 
         Ok(())
     }
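
For reference, below is a consolidated sketch of the rewritten test as it reads once the diff applies, assembled from the + and context lines above. The inline comments are added here for explanation and are not part of the change; the sketch assumes the imports from the first hunk (TempDir, FileIOBuilder, the spec and writer types) are in scope in the test module.

    #[tokio::test]
    async fn test_parquet_writer() -> Result<()> {
        // Write into a throwaway directory through the local-fs FileIO backend.
        let temp_dir = TempDir::new().unwrap();
        let file_io = FileIOBuilder::new_fs_io().build().unwrap();
        let location_gen =
            MockLocationGenerator::new(temp_dir.path().to_str().unwrap().to_string());
        let file_name_gen =
            DefaultFileNameGenerator::new("test".to_string(), None, DataFileFormat::Parquet);

        // An empty iceberg Schema suffices: the test only asserts file metadata.
        let pw = ParquetWriterBuilder::new(
            WriterProperties::builder().build(),
            Arc::new(Schema::builder().build().unwrap()),
            file_io.clone(),
            location_gen,
            file_name_gen,
        );
        let mut data_file_writer = DataFileWriterBuilder::new(pw)
            .build(DataFileWriterConfig::new(None))
            .await?;

        // Closing without writing a batch still yields exactly one (empty) data file,
        // whose metadata the assertions below inspect.
        let data_file = data_file_writer.close().await.unwrap();
        assert_eq!(data_file.len(), 1);
        assert_eq!(data_file[0].file_format, DataFileFormat::Parquet);
        assert_eq!(data_file[0].content, DataContentType::Data);
        assert_eq!(data_file[0].partition, Struct::empty());

        Ok(())
    }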