
Commit debda66

Fokko and sungwy authored
Allow setting write.parquet.row-group-limit (#1016)
* Allow setting `write.parquet.row-group-limit` and update the docs
* Add test
* Make ruff happy

Co-authored-by: Sung Yun <[email protected]>
1 parent 50077af

File tree: 4 files changed, +39 -3 lines


mkdocs/docs/configuration.md (1 addition, 0 deletions)

@@ -32,6 +32,7 @@ Iceberg tables support table properties to configure table behavior.
 | -------------------------------------- | --------------------------------- | ------- | ------------------------------------------------------------------------------------------- |
 | `write.parquet.compression-codec` | `{uncompressed,zstd,gzip,snappy}` | zstd | Sets the Parquet compression coddec. |
 | `write.parquet.compression-level` | Integer | null | Parquet compression level for the codec. If not set, it is up to PyIceberg |
+| `write.parquet.row-group-limit` | Number of rows | 1048576 | The upper bound of the number of entries within a single row group |
 | `write.parquet.page-size-bytes` | Size in bytes | 1MB | Set a target threshold for the approximate encoded size of data pages within a column chunk |
 | `write.parquet.page-row-limit` | Number of rows | 20000 | Set a target threshold for the approximate encoded size of data pages within a column chunk |
 | `write.parquet.dict-size-bytes` | Size in bytes | 2MB | Set the dictionary page size limit per row group |
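
For context, a minimal sketch of how the new property might be set from the Python API (the catalog name, table name, and schema below are illustrative and not part of the commit):

```python
from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField

# Assumes a catalog named "default" is already configured; names are illustrative.
catalog = load_catalog("default")

tbl = catalog.create_table(
    "default.events",
    Schema(NestedField(1, "id", LongType())),
    # Cap each Parquet row group at 100,000 rows instead of the 1,048,576-row default.
    properties={"write.parquet.row-group-limit": "100000"},
)
```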

pyiceberg/io/pyarrow.py (2 additions, 2 deletions)

@@ -2197,8 +2197,8 @@ def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteT
     parquet_writer_kwargs = _get_parquet_writer_kwargs(table_metadata.properties)
     row_group_size = property_as_int(
         properties=table_metadata.properties,
-        property_name=TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES,
-        default=TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT,
+        property_name=TableProperties.PARQUET_ROW_GROUP_LIMIT,
+        default=TableProperties.PARQUET_ROW_GROUP_LIMIT_DEFAULT,
     )

     def write_parquet(task: WriteTask) -> DataFile:
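
The change above swaps the property key and default that `write_file` resolves via `property_as_int`. As a rough sketch of the effect (not the library code itself; the helper name and file path below are made up for illustration), the resolved limit is what ends up as PyArrow's `row_group_size` when a data file is written:

```python
from typing import Dict, Optional

import pyarrow as pa
import pyarrow.parquet as pq


def resolve_row_group_limit(properties: Dict[str, str]) -> int:
    """Simplified stand-in for property_as_int with the new key and default."""
    value: Optional[str] = properties.get("write.parquet.row-group-limit")
    return int(value) if value is not None else 1048576


# Illustrative only: PyIceberg constructs the writer internally; this just shows
# where the resolved limit lands, namely PyArrow's row_group_size argument.
table = pa.table({"number": list(range(10))})
limit = resolve_row_group_limit({"write.parquet.row-group-limit": "1"})
with pq.ParquetWriter("/tmp/example.parquet", table.schema) as writer:
    writer.write_table(table, row_group_size=limit)  # 10 row groups of 1 row each
```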

pyiceberg/table/__init__.py (1 addition, 1 deletion)

@@ -176,7 +176,7 @@ class TableProperties:
     PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024  # 128 MB

     PARQUET_ROW_GROUP_LIMIT = "write.parquet.row-group-limit"
-    PARQUET_ROW_GROUP_LIMIT_DEFAULT = 128 * 1024 * 1024  # 128 MB
+    PARQUET_ROW_GROUP_LIMIT_DEFAULT = 1048576

     PARQUET_PAGE_SIZE_BYTES = "write.parquet.page-size-bytes"
     PARQUET_PAGE_SIZE_BYTES_DEFAULT = 1024 * 1024  # 1 MB
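
Since the key is exposed as a `TableProperties` constant, it can also be referenced when changing the limit on an existing table. A sketch, assuming the transaction API's `set_properties` accepts a dict of updates (catalog and table names are illustrative):

```python
from pyiceberg.catalog import load_catalog
from pyiceberg.table import TableProperties

catalog = load_catalog("default")  # illustrative catalog name
tbl = catalog.load_table("default.events")  # illustrative table name

with tbl.transaction() as tx:
    # Using the constant avoids typos in the raw "write.parquet.row-group-limit" key.
    tx.set_properties({TableProperties.PARQUET_ROW_GROUP_LIMIT: "500000"})
```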

tests/integration/test_reads.py (35 additions, 0 deletions)

@@ -50,6 +50,7 @@
     BinaryType,
     BooleanType,
     IntegerType,
+    LongType,
     NestedField,
     StringType,
     TimestampType,
@@ -670,6 +671,40 @@ def another_task() -> None:


 @pytest.mark.integration
+def test_configure_row_group_batch_size(session_catalog: Catalog) -> None:
+    from pyiceberg.table import TableProperties
+
+    table_name = "default.test_small_row_groups"
+    try:
+        session_catalog.drop_table(table_name)
+    except NoSuchTableError:
+        pass  # Just to make sure that the table doesn't exist
+
+    tbl = session_catalog.create_table(
+        table_name,
+        Schema(
+            NestedField(1, "number", LongType()),
+        ),
+        properties={TableProperties.PARQUET_ROW_GROUP_LIMIT: "1"},
+    )
+
+    # Write 10 row groups, that should end up as 10 batches
+    entries = 10
+    tbl.append(
+        pa.Table.from_pylist(
+            [
+                {
+                    "number": number,
+                }
+                for number in range(entries)
+            ],
+        )
+    )
+
+    batches = list(tbl.scan().to_arrow_batch_reader())
+    assert len(batches) == entries
+
+
 @pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
 def test_table_scan_default_to_large_types(catalog: Catalog) -> None:
     identifier = "default.test_table_scan_default_to_large_types"
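
The test checks the limit indirectly by counting Arrow batches on read. An alternative spot check, assuming the table's data files live on a filesystem PyArrow can open directly (for example a local warehouse path), would be to count row groups in the written Parquet files; this snippet is illustrative and not part of the commit:

```python
import pyarrow.parquet as pq

# Count row groups per data file in the table's current scan plan.
for task in tbl.scan().plan_files():
    parquet_file = pq.ParquetFile(task.file.file_path)
    print(task.file.file_path, parquet_file.num_row_groups)
```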
