Commit 7123b9f: add more tests

Parent: 6cf617d

2 files changed: +31, -11 lines

tests/conftest.py

Lines changed: 1 addition & 0 deletions
@@ -1965,6 +1965,7 @@ def spark() -> SparkSession:
         .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
         .config("spark.sql.catalog.integration", "org.apache.iceberg.spark.SparkCatalog")
         .config("spark.sql.catalog.integration.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
+        .config("spark.sql.catalog.integration.cache-enabled", "false")
         .config("spark.sql.catalog.integration.uri", "http://localhost:8181")
         .config("spark.sql.catalog.integration.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
         .config("spark.sql.catalog.integration.warehouse", "s3://warehouse/wh/")

tests/integration/test_writes.py

Lines changed: 30 additions & 11 deletions
@@ -36,7 +36,7 @@
 from pyiceberg.catalog.sql import SqlCatalog
 from pyiceberg.exceptions import NoSuchTableError
 from pyiceberg.schema import Schema
-from pyiceberg.table import TableProperties, _dataframe_to_data_files
+from pyiceberg.table import SetPropertiesUpdate, TableProperties, _dataframe_to_data_files
 from pyiceberg.types import (
     BinaryType,
     BooleanType,
@@ -356,31 +356,50 @@ def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_w
 
 
 @pytest.mark.integration
-@pytest.mark.parametrize("format_version", [1, 2])
-def test_write_multiple_data_files(
-    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
-) -> None:
-    identifier = "default.write_multiple_arrow_data_files"
-    tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, [])
+def test_write_bin_pack_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
+    identifier = "default.write_bin_pack_data_files"
+    tbl = _create_table(session_catalog, identifier, {"format-version": "1"}, [])
 
     def get_data_files_count(identifier: str) -> int:
         return spark.sql(
             f"""
             SELECT *
-            FROM {identifier}.all_data_files
+            FROM {identifier}.files
             """
         ).count()
 
-    # writes to 1 data file since the table is small
+    def set_table_properties(tbl: Table, properties: Properties) -> Table:
+        with tbl.transaction() as transaction:
+            transaction._apply((SetPropertiesUpdate(updates=properties),))
+        return tbl
+
+    # writes 1 data file since the table is smaller than default target file size
+    assert arrow_table_with_null.nbytes < TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT
     tbl.overwrite(arrow_table_with_null)
     assert get_data_files_count(identifier) == 1
 
-    # writes to 1 data file as long as table is smaller than default target file size
+    # writes 1 data file as long as table is smaller than default target file size
     bigger_arrow_tbl = pa.concat_tables([arrow_table_with_null] * 10)
-    tbl.overwrite(bigger_arrow_tbl)
     assert bigger_arrow_tbl.nbytes < TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT
+    tbl.overwrite(bigger_arrow_tbl)
     assert get_data_files_count(identifier) == 1
 
+    # writes multiple data files once target file size is overridden
+    target_file_size = arrow_table_with_null.nbytes
+    tbl = set_table_properties(tbl, {TableProperties.WRITE_TARGET_FILE_SIZE_BYTES: str(target_file_size)})
+    assert str(target_file_size) == tbl.properties.get(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES)
+    assert target_file_size < bigger_arrow_tbl.nbytes
+    tbl.overwrite(bigger_arrow_tbl)
+    assert get_data_files_count(identifier) == 10
+
+    # writes half the number of data files when target file size doubles
+    target_file_size = arrow_table_with_null.nbytes * 2
+    tbl = set_table_properties(tbl, {TableProperties.WRITE_TARGET_FILE_SIZE_BYTES: str(target_file_size)})
+    assert str(target_file_size) == tbl.properties.get(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES)
+    assert target_file_size < bigger_arrow_tbl.nbytes
+    tbl.overwrite(bigger_arrow_tbl)
+    assert get_data_files_count(identifier) == 5
+
 
 @pytest.mark.integration
 @pytest.mark.parametrize("format_version", [1, 2])
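The new assertions encode simple bin-packing arithmetic: writing N total bytes against a target file size of T bytes should produce roughly ceil(N / T) data files, so the 10x table lands in 10 files when T equals one copy's size and in 5 files when T doubles. A standalone sanity check of that arithmetic, with an illustrative batch size standing in for arrow_table_with_null.nbytes:

    import math

    batch_bytes = 1_000             # stand-in for arrow_table_with_null.nbytes
    total_bytes = batch_bytes * 10  # bigger_arrow_tbl is 10 concatenated copies

    # target == one batch -> one file per copy
    assert math.ceil(total_bytes / batch_bytes) == 10
    # doubling the target halves the expected file count
    assert math.ceil(total_bytes / (batch_bytes * 2)) == 5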
