From 0cad231cdeac7c9833e08289a2dca70a452fa2c8 Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Sat, 4 May 2024 02:20:39 +0000 Subject: [PATCH 01/80] checkpoint --- pyiceberg/transforms.py | 25 ++++++ .../test_writes/test_partitioned_writes.py | 89 +++++++++++++++++++ 2 files changed, 114 insertions(+) diff --git a/pyiceberg/transforms.py b/pyiceberg/transforms.py index 6dcae59e49..c75f7861c0 100644 --- a/pyiceberg/transforms.py +++ b/pyiceberg/transforms.py @@ -433,6 +433,31 @@ def __repr__(self) -> str: """Return the string representation of the MonthTransform class.""" return "MonthTransform()" + def pyarrow_transform(self, source: IcebergType) -> Callable: + import pyarrow as pa + import pyarrow.compute as pc + + if isinstance(source, DateType): + + def month_func(v: Any) -> int: + return pc.add( + pc.multiply(pc.years_between(pa.scalar(date(1970, 1, 1)), v), pa.scalar(12)), + pc.add(pc.month(v), pa.scalar(-1)), + ) + + elif isinstance(source, (TimestampType, TimestamptzType)): + + def month_func(v: Any) -> int: + return pc.add( + pc.multiply(pc.years_between(pa.scalar(datetime(1970, 1, 1)), pc.local_timestamp(v)), pa.scalar(12)), + pc.add(pc.month(v), pa.scalar(-1)), + ) + + else: + raise ValueError(f"Cannot apply month transform for type: {source}") + + return lambda v: month_func(v) if v is not None else None + class DayTransform(TimeTransform[S]): """Transforms a datetime value into a day value. diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index d84b9745a7..91856a4790 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -16,6 +16,8 @@ # under the License. 
# pylint:disable=redefined-outer-name +from datetime import date, datetime, timezone + import pyarrow as pa import pytest from pyspark.sql import SparkSession @@ -36,6 +38,54 @@ from utils import TABLE_SCHEMA, _create_table +@pytest.fixture(scope="session") +def arrow_table_dates() -> pa.Table: + """Pyarrow table with only null values.""" + TEST_DATES = [date(2023, 12, 31), date(2024, 1, 1), date(2024, 1, 31), date(2024, 2, 1)] + return pa.Table.from_pydict( + {"dates": TEST_DATES}, + schema=pa.schema([ + ("dates", pa.date32()), + ]), + ) + + +@pytest.fixture(scope="session") +def arrow_table_timestamp() -> pa.Table: + """Pyarrow table with only null values.""" + TEST_DATETIMES = [ + datetime(2023, 12, 31, 0, 0, 0), + datetime(2024, 1, 1, 0, 0, 0), + datetime(2024, 1, 31, 0, 0, 0), + datetime(2024, 2, 1, 0, 0, 0), + datetime(2024, 2, 1, 6, 0, 0), + ] + return pa.Table.from_pydict( + {"dates": TEST_DATETIMES}, + schema=pa.schema([ + ("timestamp", pa.timestamp(unit="us")), + ]), + ) + + +@pytest.fixture(scope="session") +def arrow_table_timestamptz() -> pa.Table: + """Pyarrow table with only null values.""" + TEST_DATETIMES_WITH_TZ = [ + datetime(2023, 12, 31, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 1, 31, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 2, 1, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 2, 1, 6, 0, 0, tzinfo=timezone.utc), + ] + return pa.Table.from_pydict( + {"dates": TEST_DATETIMES_WITH_TZ}, + schema=pa.schema([ + ("timestamptz", pa.timestamp(unit="us", tz="UTC")), + ]), + ) + + @pytest.mark.integration @pytest.mark.parametrize( "part_col", ['int', 'bool', 'string', "string_long", "long", "float", "double", "date", 'timestamp', 'timestamptz', 'binary'] @@ -384,3 +434,42 @@ def test_unsupported_transform( with pytest.raises(ValueError, match="All transforms are not supported.*"): tbl.append(arrow_table_with_null) + + +@pytest.mark.integration +@pytest.mark.parametrize( + "part_col", ['int', 'bool', 'string', "string_long", "long", "float", "double", "date", "timestamptz", "timestamp", "binary"] +) +@pytest.mark.parametrize("format_version", [1, 2]) +def test_append_time_transform_partitioned_table( + session_catalog: Catalog, spark: SparkSession, arrow_table_with_null: pa.Table, part_col: str, format_version: int +) -> None: + # Given + identifier = f"default.arrow_table_v{format_version}_appended_with_null_partitioned_on_col_{part_col}" + nested_field = TABLE_SCHEMA.find_field(part_col) + partition_spec = PartitionSpec( + PartitionField(source_id=nested_field.field_id, field_id=1001, transform=IdentityTransform(), name=part_col) + ) + + # When + tbl = _create_table( + session_catalog=session_catalog, + identifier=identifier, + properties={"format-version": str(format_version)}, + data=[], + partition_spec=partition_spec, + ) + # Append with arrow_table_1 with lines [A,B,C] and then arrow_table_2 with lines[A,B,C,A,B,C] + tbl.append(arrow_table_with_null) + tbl.append(pa.concat_tables([arrow_table_with_null, arrow_table_with_null])) + + # Then + assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}" + df = spark.table(identifier) + for col in TEST_DATA_WITH_NULL.keys(): + df = spark.table(identifier) + assert df.where(f"{col} is not null").count() == 6, f"Expected 6 non-null rows for {col}" + assert df.where(f"{col} is null").count() == 3, f"Expected 3 null rows for {col}" + # expecting 6 files: first append with [A], [B], [C], second append with [A, A], [B, B], 
[C, C] + rows = spark.sql(f"select partition from {identifier}.files").collect() + assert len(rows) == 6 From 96e55334d95f7f3d7aea6f6d8c220b2d1a7aa73d Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Sun, 5 May 2024 16:27:57 +0000 Subject: [PATCH 02/80] checkpoint2 --- pyiceberg/transforms.py | 78 ++++++++++++++----- .../test_writes/test_partitioned_writes.py | 76 +++--------------- tests/test_transforms.py | 71 ++++++++++++++++- 3 files changed, 141 insertions(+), 84 deletions(-) diff --git a/pyiceberg/transforms.py b/pyiceberg/transforms.py index c75f7861c0..0cf26fe2a2 100644 --- a/pyiceberg/transforms.py +++ b/pyiceberg/transforms.py @@ -20,7 +20,7 @@ from abc import ABC, abstractmethod from enum import IntEnum from functools import singledispatch -from typing import Any, Callable, Generic, Optional, TypeVar +from typing import TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar from typing import Literal as LiteralType from uuid import UUID @@ -82,6 +82,9 @@ from pyiceberg.utils.parsing import ParseNumberFromBrackets from pyiceberg.utils.singleton import Singleton +if TYPE_CHECKING: + import pyarrow as pa + S = TypeVar("S") T = TypeVar("T") @@ -391,6 +394,21 @@ def __repr__(self) -> str: """Return the string representation of the YearTransform class.""" return "YearTransform()" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + import pyarrow as pa + import pyarrow.compute as pc + + if isinstance(source, DateType): + epoch = datetime.EPOCH_DATE + elif isinstance(source, TimestampType): + epoch = datetime.EPOCH_TIMESTAMP + elif isinstance(source, TimestamptzType): + epoch = datetime.EPOCH_TIMESTAMPTZ + else: + raise ValueError(f"Cannot apply year transform for type: {source}") + + return lambda v: pc.years_between(pa.scalar(epoch), v) if v is not None else None + class MonthTransform(TimeTransform[S]): """Transforms a datetime value into a month value. 
@@ -433,29 +451,25 @@ def __repr__(self) -> str: """Return the string representation of the MonthTransform class.""" return "MonthTransform()" - def pyarrow_transform(self, source: IcebergType) -> Callable: + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": import pyarrow as pa import pyarrow.compute as pc - - if isinstance(source, DateType): - - def month_func(v: Any) -> int: - return pc.add( - pc.multiply(pc.years_between(pa.scalar(date(1970, 1, 1)), v), pa.scalar(12)), - pc.add(pc.month(v), pa.scalar(-1)), - ) - - elif isinstance(source, (TimestampType, TimestamptzType)): - - def month_func(v: Any) -> int: - return pc.add( - pc.multiply(pc.years_between(pa.scalar(datetime(1970, 1, 1)), pc.local_timestamp(v)), pa.scalar(12)), - pc.add(pc.month(v), pa.scalar(-1)), - ) + if isinstance(source, DateType): + epoch = datetime.EPOCH_DATE + elif isinstance(source, TimestampType): + epoch = datetime.EPOCH_TIMESTAMP + elif isinstance(source, TimestamptzType): + epoch = datetime.EPOCH_TIMESTAMPTZ else: raise ValueError(f"Cannot apply month transform for type: {source}") + def month_func(v: pa.Array) -> pa.Array: + return pc.add( + pc.multiply(pc.years_between(pa.scalar(epoch), v), pa.scalar(12)), + pc.add(pc.month(v), pa.scalar(-1)), + ) + return lambda v: month_func(v) if v is not None else None @@ -503,6 +517,21 @@ def __repr__(self) -> str: """Return the string representation of the DayTransform class.""" return "DayTransform()" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + import pyarrow as pa + import pyarrow.compute as pc + + if isinstance(source, DateType): + epoch = datetime.EPOCH_DATE + elif isinstance(source, TimestampType): + epoch = datetime.EPOCH_TIMESTAMP + elif isinstance(source, TimestamptzType): + epoch = datetime.EPOCH_TIMESTAMPTZ + else: + raise ValueError(f"Cannot apply day transform for type: {source}") + + return lambda v: pc.days_between(pa.scalar(epoch), v) if v is not None else None + class HourTransform(TimeTransform[S]): """Transforms a datetime value into a hour value. @@ -540,6 +569,19 @@ def __repr__(self) -> str: """Return the string representation of the HourTransform class.""" return "HourTransform()" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + import pyarrow as pa + import pyarrow.compute as pc + + if isinstance(source, TimestampType): + epoch = datetime.EPOCH_TIMESTAMP + elif isinstance(source, TimestamptzType): + epoch = datetime.EPOCH_TIMESTAMPTZ + else: + raise ValueError(f"Cannot apply month transform for type: {source}") + + return lambda v: pc.hours_between(pa.scalar(epoch), v) if v is not None else None + def _base64encode(buffer: bytes) -> str: """Convert bytes to base64 string.""" diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index 91856a4790..07c3c43a2c 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -16,11 +16,11 @@ # under the License. 
# pylint:disable=redefined-outer-name -from datetime import date, datetime, timezone import pyarrow as pa import pytest from pyspark.sql import SparkSession +from typing import Any from pyiceberg.catalog import Catalog from pyiceberg.exceptions import NoSuchTableError @@ -31,6 +31,7 @@ HourTransform, IdentityTransform, MonthTransform, + Transform, TruncateTransform, YearTransform, ) @@ -38,54 +39,6 @@ from utils import TABLE_SCHEMA, _create_table -@pytest.fixture(scope="session") -def arrow_table_dates() -> pa.Table: - """Pyarrow table with only null values.""" - TEST_DATES = [date(2023, 12, 31), date(2024, 1, 1), date(2024, 1, 31), date(2024, 2, 1)] - return pa.Table.from_pydict( - {"dates": TEST_DATES}, - schema=pa.schema([ - ("dates", pa.date32()), - ]), - ) - - -@pytest.fixture(scope="session") -def arrow_table_timestamp() -> pa.Table: - """Pyarrow table with only null values.""" - TEST_DATETIMES = [ - datetime(2023, 12, 31, 0, 0, 0), - datetime(2024, 1, 1, 0, 0, 0), - datetime(2024, 1, 31, 0, 0, 0), - datetime(2024, 2, 1, 0, 0, 0), - datetime(2024, 2, 1, 6, 0, 0), - ] - return pa.Table.from_pydict( - {"dates": TEST_DATETIMES}, - schema=pa.schema([ - ("timestamp", pa.timestamp(unit="us")), - ]), - ) - - -@pytest.fixture(scope="session") -def arrow_table_timestamptz() -> pa.Table: - """Pyarrow table with only null values.""" - TEST_DATETIMES_WITH_TZ = [ - datetime(2023, 12, 31, 0, 0, 0, tzinfo=timezone.utc), - datetime(2024, 1, 1, 0, 0, 0, tzinfo=timezone.utc), - datetime(2024, 1, 31, 0, 0, 0, tzinfo=timezone.utc), - datetime(2024, 2, 1, 0, 0, 0, tzinfo=timezone.utc), - datetime(2024, 2, 1, 6, 0, 0, tzinfo=timezone.utc), - ] - return pa.Table.from_pydict( - {"dates": TEST_DATETIMES_WITH_TZ}, - schema=pa.schema([ - ("timestamptz", pa.timestamp(unit="us", tz="UTC")), - ]), - ) - - @pytest.mark.integration @pytest.mark.parametrize( "part_col", ['int', 'bool', 'string', "string_long", "long", "float", "double", "date", 'timestamp', 'timestamptz', 'binary'] @@ -437,18 +390,19 @@ def test_unsupported_transform( @pytest.mark.integration +@pytest.mark.parametrize('transform', [YearTransform(), MonthTransform(), DayTransform()]) @pytest.mark.parametrize( - "part_col", ['int', 'bool', 'string', "string_long", "long", "float", "double", "date", "timestamptz", "timestamp", "binary"] + "part_col", ["date", "timestamp", "timestamptz"] ) @pytest.mark.parametrize("format_version", [1, 2]) -def test_append_time_transform_partitioned_table( - session_catalog: Catalog, spark: SparkSession, arrow_table_with_null: pa.Table, part_col: str, format_version: int +def test_append_ymd_transform_partitioned( + session_catalog: Catalog, spark: SparkSession, arrow_table_with_null: pa.Table, transform: Transform[Any, Any], part_col: str, format_version: int ) -> None: # Given - identifier = f"default.arrow_table_v{format_version}_appended_with_null_partitioned_on_col_{part_col}" + identifier = f"default.arrow_table_v{format_version}_with_ymd_transform_partitioned_on_col_{part_col}" nested_field = TABLE_SCHEMA.find_field(part_col) partition_spec = PartitionSpec( - PartitionField(source_id=nested_field.field_id, field_id=1001, transform=IdentityTransform(), name=part_col) + PartitionField(source_id=nested_field.field_id, field_id=1001, transform=transform, name=part_col) ) # When @@ -456,20 +410,14 @@ def test_append_time_transform_partitioned_table( session_catalog=session_catalog, identifier=identifier, properties={"format-version": str(format_version)}, - data=[], + data=[arrow_table_with_null], 
partition_spec=partition_spec, ) - # Append with arrow_table_1 with lines [A,B,C] and then arrow_table_2 with lines[A,B,C,A,B,C] - tbl.append(arrow_table_with_null) - tbl.append(pa.concat_tables([arrow_table_with_null, arrow_table_with_null])) # Then assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}" df = spark.table(identifier) + assert df.count() == 3, f"Expected 3 total rows for {identifier}" for col in TEST_DATA_WITH_NULL.keys(): - df = spark.table(identifier) - assert df.where(f"{col} is not null").count() == 6, f"Expected 6 non-null rows for {col}" - assert df.where(f"{col} is null").count() == 3, f"Expected 3 null rows for {col}" - # expecting 6 files: first append with [A], [B], [C], second append with [A, A], [B, B], [C, C] - rows = spark.sql(f"select partition from {identifier}.files").collect() - assert len(rows) == 6 + assert df.where(f"{col} is not null").count() == 2, f"Expected 2 non-null rows for {col}" + assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null" \ No newline at end of file diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 4dc3d9819f..1f3c47a8d9 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -15,9 +15,9 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=eval-used,protected-access,redefined-outer-name -from datetime import date +from datetime import date, datetime, timezone from decimal import Decimal -from typing import Any, Callable, Optional +from typing import TYPE_CHECKING, Any, Callable, Optional from uuid import UUID import mmh3 as mmh3 @@ -69,6 +69,7 @@ TimestampLiteral, literal, ) +from pyiceberg.partitioning import _to_partition_representation from pyiceberg.schema import Accessor from pyiceberg.transforms import ( BucketTransform, @@ -111,6 +112,9 @@ timestamptz_to_micros, ) +if TYPE_CHECKING: + import pyarrow as pa + @pytest.mark.parametrize( "test_input,test_type,expected", @@ -1808,3 +1812,66 @@ def test_strict_binary(bound_reference_binary: BoundReference[str]) -> None: _test_projection( lhs=transform.strict_project(name="name", pred=BoundIn(term=bound_reference_binary, literals=set_of_literals)), rhs=None ) + + +@pytest.fixture(scope="session") +def arrow_table_date_timestamps() -> "pa.Table": + """Pyarrow table with only date, timestamp and timestamptz values.""" + import pyarrow as pa + + return pa.Table.from_pydict( + { + "date": [date(2023, 12, 31), date(2024, 1, 1), date(2024, 1, 31), date(2024, 2, 1), date(2024, 2, 1), None], + "timestamp": [ + datetime(2023, 12, 31, 0, 0, 0), + datetime(2024, 1, 1, 0, 0, 0), + datetime(2024, 1, 31, 0, 0, 0), + datetime(2024, 2, 1, 0, 0, 0), + datetime(2024, 2, 1, 6, 0, 0), + None, + ], + "timestamptz": [ + datetime(2023, 12, 31, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 1, 31, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 2, 1, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 2, 1, 6, 0, 0, tzinfo=timezone.utc), + None, + ], + }, + schema=pa.schema([ + ("date", pa.date32()), + ("timestamp", pa.timestamp(unit="us")), + ("timestamptz", pa.timestamp(unit="us", tz="UTC")), + ]), + ) + + +@pytest.mark.parametrize('transform', [YearTransform(), MonthTransform(), DayTransform()]) +@pytest.mark.parametrize( + "source_col, source_type", [("date", DateType()), ("timestamp", TimestampType()), ("timestamptz", TimestamptzType())] +) +def test_ymd_pyarrow_transforms( + 
arrow_table_date_timestamps: "pa.Table", + source_col: str, + source_type: PrimitiveType, + transform: Transform[Any, Any], +) -> None: + assert transform.pyarrow_transform(source_type)(arrow_table_date_timestamps[source_col]).to_pylist() == [ + transform.transform(source_type)(_to_partition_representation(source_type, v)) + for v in arrow_table_date_timestamps[source_col].to_pylist() + ] + + +@pytest.mark.parametrize("source_col, source_type", [("timestamp", TimestampType()), ("timestamptz", TimestamptzType())]) +def test_hour_pyarrow_transforms(arrow_table_date_timestamps: "pa.Table", source_col: str, source_type: PrimitiveType) -> None: + assert HourTransform().pyarrow_transform(source_type)(arrow_table_date_timestamps[source_col]).to_pylist() == [ + HourTransform().transform(source_type)(_to_partition_representation(source_type, v)) + for v in arrow_table_date_timestamps[source_col].to_pylist() + ] + + +def test_hour_pyarrow_transforms_throws_with_dates(arrow_table_date_timestamps: "pa.Table") -> None: + # HourTransform is not supported for DateType + with pytest.raises(ValueError): + HourTransform().pyarrow_transform(DateType())(arrow_table_date_timestamps["date"]) From ddfa9ac2af145f2c5d7c4a3b841c220c10fc280e Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Sun, 5 May 2024 19:51:42 +0000 Subject: [PATCH 03/80] todo: sort with pyarrow_transform vals --- pyiceberg/table/__init__.py | 7 +++-- pyiceberg/transforms.py | 18 +++++++++++ .../test_writes/test_partitioned_writes.py | 31 ++++++++++++++++--- tests/test_transforms.py | 28 ++++++----------- 4 files changed, 57 insertions(+), 27 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 13186c42cc..85c3c3360b 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -381,10 +381,11 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT) if not isinstance(df, pa.Table): raise ValueError(f"Expected PyArrow table, got: {df}") - supported_transforms = {IdentityTransform} - if not all(type(field.transform) in supported_transforms for field in self.table_metadata.spec().fields): + if unsupported_partitions := [ + field for field in self.table_metadata.spec().fields if not field.transform.supports_pyarrow_transform + ]: raise ValueError( - f"All transforms are not supported, expected: {supported_transforms}, but get: {[str(field) for field in self.table_metadata.spec().fields if field.transform not in supported_transforms]}." + f"Not all partition types are supported for writes. Following partitions cannot be written using pyarrow: {unsupported_partitions}." ) _check_schema_compatible(self._table.schema(), other_schema=df.schema) diff --git a/pyiceberg/transforms.py b/pyiceberg/transforms.py index 0cf26fe2a2..c8af97c301 100644 --- a/pyiceberg/transforms.py +++ b/pyiceberg/transforms.py @@ -178,6 +178,10 @@ def __eq__(self, other: Any) -> bool: return self.root == other.root return False + @property + def supports_pyarrow_transform(self) -> bool: + return False + class BucketTransform(Transform[S, int]): """Base Transform class to transform a value into a bucket partition value. @@ -352,6 +356,13 @@ def dedup_name(self) -> str: def preserves_order(self) -> bool: return True + @abstractmethod + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": ... 
+ + @property + def supports_pyarrow_transform(self) -> bool: + return True + class YearTransform(TimeTransform[S]): """Transforms a datetime value into a year value. @@ -652,6 +663,13 @@ def __repr__(self) -> str: """Return the string representation of the IdentityTransform class.""" return "IdentityTransform()" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + return lambda v: v + + @property + def supports_pyarrow_transform(self) -> bool: + return True + class TruncateTransform(Transform[S, S]): """A transform for truncating a value to a specified width. diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index 07c3c43a2c..62c241b0eb 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -17,10 +17,11 @@ # pylint:disable=redefined-outer-name +from typing import Any + import pyarrow as pa import pytest from pyspark.sql import SparkSession -from typing import Any from pyiceberg.catalog import Catalog from pyiceberg.exceptions import NoSuchTableError @@ -390,13 +391,24 @@ def test_unsupported_transform( @pytest.mark.integration -@pytest.mark.parametrize('transform', [YearTransform(), MonthTransform(), DayTransform()]) @pytest.mark.parametrize( - "part_col", ["date", "timestamp", "timestamptz"] + "transform,expected_rows", + [ + pytest.param(YearTransform(), 2, id="year_transform"), + pytest.param(MonthTransform(), 3, id="month_transform"), + pytest.param(DayTransform(), 3, id="day_transform"), + ], ) +@pytest.mark.parametrize("part_col", ["date", "timestamp", "timestamptz"]) @pytest.mark.parametrize("format_version", [1, 2]) def test_append_ymd_transform_partitioned( - session_catalog: Catalog, spark: SparkSession, arrow_table_with_null: pa.Table, transform: Transform[Any, Any], part_col: str, format_version: int + session_catalog: Catalog, + spark: SparkSession, + arrow_table_with_null: pa.Table, + transform: Transform[Any, Any], + expected_rows: int, + part_col: str, + format_version: int, ) -> None: # Given identifier = f"default.arrow_table_v{format_version}_with_ymd_transform_partitioned_on_col_{part_col}" @@ -420,4 +432,13 @@ def test_append_ymd_transform_partitioned( assert df.count() == 3, f"Expected 3 total rows for {identifier}" for col in TEST_DATA_WITH_NULL.keys(): assert df.where(f"{col} is not null").count() == 2, f"Expected 2 non-null rows for {col}" - assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null" \ No newline at end of file + assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null" + + assert tbl.inspect.partitions().num_rows == expected_rows + files_df = spark.sql( + f""" + SELECT * + FROM {identifier}.files + """ + ) + assert files_df.count() == expected_rows diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 1f3c47a8d9..4f926e4fb4 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -1847,7 +1847,7 @@ def arrow_table_date_timestamps() -> "pa.Table": ) -@pytest.mark.parametrize('transform', [YearTransform(), MonthTransform(), DayTransform()]) +@pytest.mark.parametrize('transform', [YearTransform(), MonthTransform(), DayTransform(), HourTransform()]) @pytest.mark.parametrize( "source_col, source_type", [("date", DateType()), ("timestamp", TimestampType()), ("timestamptz", TimestamptzType())] ) @@ -1857,21 +1857,11 @@ def test_ymd_pyarrow_transforms( source_type: 
PrimitiveType, transform: Transform[Any, Any], ) -> None: - assert transform.pyarrow_transform(source_type)(arrow_table_date_timestamps[source_col]).to_pylist() == [ - transform.transform(source_type)(_to_partition_representation(source_type, v)) - for v in arrow_table_date_timestamps[source_col].to_pylist() - ] - - -@pytest.mark.parametrize("source_col, source_type", [("timestamp", TimestampType()), ("timestamptz", TimestamptzType())]) -def test_hour_pyarrow_transforms(arrow_table_date_timestamps: "pa.Table", source_col: str, source_type: PrimitiveType) -> None: - assert HourTransform().pyarrow_transform(source_type)(arrow_table_date_timestamps[source_col]).to_pylist() == [ - HourTransform().transform(source_type)(_to_partition_representation(source_type, v)) - for v in arrow_table_date_timestamps[source_col].to_pylist() - ] - - -def test_hour_pyarrow_transforms_throws_with_dates(arrow_table_date_timestamps: "pa.Table") -> None: - # HourTransform is not supported for DateType - with pytest.raises(ValueError): - HourTransform().pyarrow_transform(DateType())(arrow_table_date_timestamps["date"]) + if transform.can_transform(source_type): + assert transform.pyarrow_transform(source_type)(arrow_table_date_timestamps[source_col]).to_pylist() == [ + transform.transform(source_type)(_to_partition_representation(source_type, v)) + for v in arrow_table_date_timestamps[source_col].to_pylist() + ] + else: + with pytest.raises(ValueError): + transform.pyarrow_transform(DateType())(arrow_table_date_timestamps[source_col]) From 1a5327a5f6436d631fa21bcc9ec15d510274457e Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Mon, 6 May 2024 02:05:44 +0000 Subject: [PATCH 04/80] checkpoint --- pyiceberg/table/__init__.py | 59 +++++++++++++++---------------------- 1 file changed, 24 insertions(+), 35 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 85c3c3360b..a65fbaa5ca 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -3545,33 +3545,6 @@ class TablePartition: arrow_table_partition: pa.Table -def _get_partition_sort_order(partition_columns: list[str], reverse: bool = False) -> dict[str, Any]: - order = 'ascending' if not reverse else 'descending' - null_placement = 'at_start' if reverse else 'at_end' - return {'sort_keys': [(column_name, order) for column_name in partition_columns], 'null_placement': null_placement} - - -def group_by_partition_scheme(arrow_table: pa.Table, partition_columns: list[str]) -> pa.Table: - """Given a table, sort it by current partition scheme.""" - # only works for identity for now - sort_options = _get_partition_sort_order(partition_columns, reverse=False) - sorted_arrow_table = arrow_table.sort_by(sorting=sort_options['sort_keys'], null_placement=sort_options['null_placement']) - return sorted_arrow_table - - -def get_partition_columns( - spec: PartitionSpec, - schema: Schema, -) -> list[str]: - partition_cols = [] - for partition_field in spec.fields: - column_name = schema.find_column_name(partition_field.source_id) - if not column_name: - raise ValueError(f"{partition_field=} could not be found in {schema}.") - partition_cols.append(column_name) - return partition_cols - - def _get_table_partitions( arrow_table: pa.Table, partition_spec: PartitionSpec, @@ -3626,13 +3599,29 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T """ import pyarrow as pa - partition_columns = get_partition_columns(spec=spec, schema=schema) - arrow_table = 
group_by_partition_scheme(arrow_table, partition_columns) - - reversing_sort_order_options = _get_partition_sort_order(partition_columns, reverse=True) - reversed_indices = pa.compute.sort_indices(arrow_table, **reversing_sort_order_options).to_pylist() - - slice_instructions: list[dict[str, Any]] = [] + partition_columns: List[Tuple[PartitionField, NestedField]] = [ + (partition_field, schema.find_field(partition_field.source_id)) for partition_field in spec.fields + ] + partition_values_table = pa.table({ + str(partition.field_id): partition.pyarrow_transform(field.field_type)(arrow_table[field.name]) + for partition, field in partition_columns + }) + + # Sort by partitions + sort_indices = pa.compute.sort_indices( + partition_values_table, + sort_keys=[(col, "ascending") for col in partition_values_table.column_names], + null_placement="at_end", + ).to_pylist() + arrow_table = arrow_table.take(sort_indices) + + # Get slice_instructions to group by partitions + reversed_indices = pa.compute.sort_indices( + partition_values_table, + sort_keys=[(col, "descending") for col in partition_values_table.column_names], + null_placement="at_start", + ).to_pylist() + slice_instructions: List[Dict[str, Any]] = [] last = len(reversed_indices) reversed_indices_size = len(reversed_indices) ptr = 0 @@ -3643,6 +3632,6 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T last = reversed_indices[ptr] ptr = ptr + group_size - table_partitions: list[TablePartition] = _get_table_partitions(arrow_table, spec, schema, slice_instructions) + table_partitions: List[TablePartition] = _get_table_partitions(arrow_table, spec, schema, slice_instructions) return table_partitions From e067a28a286f11005710f42013f88c4e13c9dbeb Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Mon, 6 May 2024 02:09:32 +0000 Subject: [PATCH 05/80] checkpoint --- pyiceberg/table/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index a65fbaa5ca..e580eb5990 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -3603,7 +3603,7 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T (partition_field, schema.find_field(partition_field.source_id)) for partition_field in spec.fields ] partition_values_table = pa.table({ - str(partition.field_id): partition.pyarrow_transform(field.field_type)(arrow_table[field.name]) + str(partition.field_id): partition.transform.pyarrow_transform(field.field_type)(arrow_table[field.name]) for partition, field in partition_columns }) From 069f3bd9e7e1f8812ee67acc21547ca33a04f81a Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Mon, 6 May 2024 02:17:31 +0000 Subject: [PATCH 06/80] fix --- pyiceberg/table/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index e580eb5990..6937673f44 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -3616,6 +3616,7 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T arrow_table = arrow_table.take(sort_indices) # Get slice_instructions to group by partitions + partition_values_table = partition_values_table.take(sort_indices) reversed_indices = pa.compute.sort_indices( partition_values_table, sort_keys=[(col, "descending") for col in partition_values_table.column_names], From 
615d5e397b3a2fc4b99b69a2f8a0781f182b4d99 Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Mon, 6 May 2024 14:42:47 +0000 Subject: [PATCH 07/80] tests --- tests/conftest.py | 43 +++++++++++ .../test_writes/test_partitioned_writes.py | 76 +++++++++++++++---- tests/test_transforms.py | 45 +++-------- 3 files changed, 115 insertions(+), 49 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 6679543694..1afdcae4bc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2145,3 +2145,46 @@ def arrow_table_with_only_nulls(pa_schema: "pa.Schema") -> "pa.Table": import pyarrow as pa return pa.Table.from_pylist([{}, {}], schema=pa_schema) + + +@pytest.fixture(scope="session") +def arrow_table_date_timestamps() -> "pa.Table": + """Pyarrow table with only date, timestamp and timestamptz values.""" + import pyarrow as pa + + return pa.Table.from_pydict( + { + "date": [date(2023, 12, 31), date(2024, 1, 1), date(2024, 1, 31), date(2024, 2, 1), date(2024, 2, 1), None], + "timestamp": [ + datetime(2023, 12, 31, 0, 0, 0), + datetime(2024, 1, 1, 0, 0, 0), + datetime(2024, 1, 31, 0, 0, 0), + datetime(2024, 2, 1, 0, 0, 0), + datetime(2024, 2, 1, 6, 0, 0), + None, + ], + "timestamptz": [ + datetime(2023, 12, 31, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 1, 31, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 2, 1, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 2, 1, 6, 0, 0, tzinfo=timezone.utc), + None, + ], + }, + schema=pa.schema([ + ("date", pa.date32()), + ("timestamp", pa.timestamp(unit="us")), + ("timestamptz", pa.timestamp(unit="us", tz="UTC")), + ]), + ) + + +@pytest.fixture(scope="session") +def arrow_table_date_timestamps_schema() -> Schema: + """Pyarrow table Schema with only date, timestamp and timestamptz values.""" + return Schema( + NestedField(field_id=1, name="date", field_type=DateType(), required=False), + NestedField(field_id=2, name="timestamp", field_type=TimestampType(), required=False), + NestedField(field_id=3, name="timestamptz", field_type=TimestamptzType(), required=False), + ) diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index 62c241b0eb..acdb5cb7b6 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -26,6 +26,7 @@ from pyiceberg.catalog import Catalog from pyiceberg.exceptions import NoSuchTableError from pyiceberg.partitioning import PartitionField, PartitionSpec +from pyiceberg.schema import Schema from pyiceberg.transforms import ( BucketTransform, DayTransform, @@ -355,18 +356,6 @@ def test_invalid_arguments(spark: SparkSession, session_catalog: Catalog) -> Non (PartitionSpec(PartitionField(source_id=5, field_id=1001, transform=TruncateTransform(2), name="long_trunc"))), (PartitionSpec(PartitionField(source_id=2, field_id=1001, transform=TruncateTransform(2), name="string_trunc"))), (PartitionSpec(PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(2), name="binary_trunc"))), - (PartitionSpec(PartitionField(source_id=8, field_id=1001, transform=YearTransform(), name="timestamp_year"))), - (PartitionSpec(PartitionField(source_id=9, field_id=1001, transform=YearTransform(), name="timestamptz_year"))), - (PartitionSpec(PartitionField(source_id=10, field_id=1001, transform=YearTransform(), name="date_year"))), - (PartitionSpec(PartitionField(source_id=8, field_id=1001, 
transform=MonthTransform(), name="timestamp_month"))), - (PartitionSpec(PartitionField(source_id=9, field_id=1001, transform=MonthTransform(), name="timestamptz_month"))), - (PartitionSpec(PartitionField(source_id=10, field_id=1001, transform=MonthTransform(), name="date_month"))), - (PartitionSpec(PartitionField(source_id=8, field_id=1001, transform=DayTransform(), name="timestamp_day"))), - (PartitionSpec(PartitionField(source_id=9, field_id=1001, transform=DayTransform(), name="timestamptz_day"))), - (PartitionSpec(PartitionField(source_id=10, field_id=1001, transform=DayTransform(), name="date_day"))), - (PartitionSpec(PartitionField(source_id=8, field_id=1001, transform=HourTransform(), name="timestamp_hour"))), - (PartitionSpec(PartitionField(source_id=9, field_id=1001, transform=HourTransform(), name="timestamptz_hour"))), - (PartitionSpec(PartitionField(source_id=10, field_id=1001, transform=HourTransform(), name="date_hour"))), ], ) def test_unsupported_transform( @@ -386,7 +375,10 @@ def test_unsupported_transform( properties={'format-version': '1'}, ) - with pytest.raises(ValueError, match="All transforms are not supported.*"): + with pytest.raises( + ValueError, + match="Not all partition types are supported for writes. Following partitions cannot be written using pyarrow: *", + ): tbl.append(arrow_table_with_null) @@ -411,7 +403,7 @@ def test_append_ymd_transform_partitioned( format_version: int, ) -> None: # Given - identifier = f"default.arrow_table_v{format_version}_with_ymd_transform_partitioned_on_col_{part_col}" + identifier = f"default.arrow_table_v{format_version}_with_{str(transform)}_partition_on_col_{part_col}" nested_field = TABLE_SCHEMA.find_field(part_col) partition_spec = PartitionSpec( PartitionField(source_id=nested_field.field_id, field_id=1001, transform=transform, name=part_col) @@ -442,3 +434,59 @@ def test_append_ymd_transform_partitioned( """ ) assert files_df.count() == expected_rows + + +@pytest.mark.integration +@pytest.mark.parametrize( + "transform,expected_partitions", + [ + pytest.param(YearTransform(), 3, id="year_transform"), + pytest.param(MonthTransform(), 4, id="month_transform"), + pytest.param(DayTransform(), 5, id="day_transform"), + pytest.param(HourTransform(), 6, id="hour_transform"), + ], +) +@pytest.mark.parametrize("format_version", [1, 2]) +def test_append_transform_partition_verify_partitions_count( + session_catalog: Catalog, + spark: SparkSession, + arrow_table_date_timestamps: pa.Table, + arrow_table_date_timestamps_schema: Schema, + transform: Transform[Any, Any], + expected_partitions: int, + format_version: int, +) -> None: + # Given + part_col = "timestamptz" + identifier = f"default.arrow_table_v{format_version}_with_{str(transform)}_transform_partitioned_on_col_{part_col}" + nested_field = arrow_table_date_timestamps_schema.find_field(part_col) + partition_spec = PartitionSpec( + PartitionField(source_id=nested_field.field_id, field_id=1001, transform=transform, name=part_col) + ) + + # When + tbl = _create_table( + session_catalog=session_catalog, + identifier=identifier, + properties={"format-version": str(format_version)}, + data=[arrow_table_date_timestamps], + partition_spec=partition_spec, + schema=arrow_table_date_timestamps_schema, + ) + + # Then + assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}" + df = spark.table(identifier) + assert df.count() == 6, f"Expected 6 total rows for {identifier}" + for col in arrow_table_date_timestamps.column_names: + assert 
df.where(f"{col} is not null").count() == 5, f"Expected 2 non-null rows for {col}" + assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null" + + assert tbl.inspect.partitions().num_rows == expected_partitions + files_df = spark.sql( + f""" + SELECT * + FROM {identifier}.files + """ + ) + assert files_df.count() == expected_partitions diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 4f926e4fb4..d86817a310 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=eval-used,protected-access,redefined-outer-name -from datetime import date, datetime, timezone +from datetime import date from decimal import Decimal from typing import TYPE_CHECKING, Any, Callable, Optional from uuid import UUID @@ -1814,40 +1814,15 @@ def test_strict_binary(bound_reference_binary: BoundReference[str]) -> None: ) -@pytest.fixture(scope="session") -def arrow_table_date_timestamps() -> "pa.Table": - """Pyarrow table with only date, timestamp and timestamptz values.""" - import pyarrow as pa - - return pa.Table.from_pydict( - { - "date": [date(2023, 12, 31), date(2024, 1, 1), date(2024, 1, 31), date(2024, 2, 1), date(2024, 2, 1), None], - "timestamp": [ - datetime(2023, 12, 31, 0, 0, 0), - datetime(2024, 1, 1, 0, 0, 0), - datetime(2024, 1, 31, 0, 0, 0), - datetime(2024, 2, 1, 0, 0, 0), - datetime(2024, 2, 1, 6, 0, 0), - None, - ], - "timestamptz": [ - datetime(2023, 12, 31, 0, 0, 0, tzinfo=timezone.utc), - datetime(2024, 1, 1, 0, 0, 0, tzinfo=timezone.utc), - datetime(2024, 1, 31, 0, 0, 0, tzinfo=timezone.utc), - datetime(2024, 2, 1, 0, 0, 0, tzinfo=timezone.utc), - datetime(2024, 2, 1, 6, 0, 0, tzinfo=timezone.utc), - None, - ], - }, - schema=pa.schema([ - ("date", pa.date32()), - ("timestamp", pa.timestamp(unit="us")), - ("timestamptz", pa.timestamp(unit="us", tz="UTC")), - ]), - ) - - -@pytest.mark.parametrize('transform', [YearTransform(), MonthTransform(), DayTransform(), HourTransform()]) +@pytest.mark.parametrize( + 'transform', + [ + pytest.param(YearTransform(), id="year_transform"), + pytest.param(MonthTransform(), id="month_transform"), + pytest.param(DayTransform(), id="day_transform"), + pytest.param(HourTransform(), id="hour_transform"), + ], +) @pytest.mark.parametrize( "source_col, source_type", [("date", DateType()), ("timestamp", TimestampType()), ("timestamptz", TimestamptzType())] ) From c0a0f321793850f598bd221405bbbda0e3337610 Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Mon, 6 May 2024 16:20:19 +0000 Subject: [PATCH 08/80] more tests --- Makefile | 2 +- pyiceberg/partitioning.py | 2 +- .../test_writes/test_partitioned_writes.py | 87 +++++++++++++++++-- 3 files changed, 80 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 35051be9c1..de50374cfb 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ test-integration: sleep 10 docker compose -f dev/docker-compose-integration.yml cp ./dev/provision.py spark-iceberg:/opt/spark/provision.py docker compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py - poetry run pytest tests/ -v -m integration ${PYTEST_ARGS} + poetry run pytest tests/integration/test_writes/test_partitioned_writes.py -v -m integration ${PYTEST_ARGS} test-integration-rebuild: docker compose -f dev/docker-compose-integration.yml kill diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py index 
a3cf255341..a3b482181d 100644 --- a/pyiceberg/partitioning.py +++ b/pyiceberg/partitioning.py @@ -387,7 +387,7 @@ def partition(self) -> Record: # partition key transformed with iceberg interna for raw_partition_field_value in self.raw_partition_field_values: partition_fields = self.partition_spec.source_id_to_fields_map[raw_partition_field_value.field.source_id] if len(partition_fields) != 1: - raise ValueError("partition_fields must contain exactly one field.") + raise ValueError(f"Cannot have redundant partitions: {partition_fields}") partition_field = partition_fields[0] iceberg_typed_key_values[partition_field.name] = partition_record_value( partition_field=partition_field, diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index acdb5cb7b6..97960cd536 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -17,7 +17,8 @@ # pylint:disable=redefined-outer-name -from typing import Any +from datetime import date +from typing import Any, Set import pyarrow as pa import pytest @@ -440,10 +441,12 @@ def test_append_ymd_transform_partitioned( @pytest.mark.parametrize( "transform,expected_partitions", [ - pytest.param(YearTransform(), 3, id="year_transform"), - pytest.param(MonthTransform(), 4, id="month_transform"), - pytest.param(DayTransform(), 5, id="day_transform"), - pytest.param(HourTransform(), 6, id="hour_transform"), + pytest.param(YearTransform(), {53, 54, None}, id="year_transform"), + pytest.param(MonthTransform(), {647, 648, 649, None}, id="month_transform"), + pytest.param( + DayTransform(), {date(2023, 12, 31), date(2024, 1, 1), date(2024, 1, 31), date(2024, 2, 1), None}, id="day_transform" + ), + pytest.param(HourTransform(), {473328, 473352, 474072, 474096, 474102, None}, id="hour_transform"), ], ) @pytest.mark.parametrize("format_version", [1, 2]) @@ -453,7 +456,7 @@ def test_append_transform_partition_verify_partitions_count( arrow_table_date_timestamps: pa.Table, arrow_table_date_timestamps_schema: Schema, transform: Transform[Any, Any], - expected_partitions: int, + expected_partitions: Set[Any], format_version: int, ) -> None: # Given @@ -461,7 +464,7 @@ def test_append_transform_partition_verify_partitions_count( identifier = f"default.arrow_table_v{format_version}_with_{str(transform)}_transform_partitioned_on_col_{part_col}" nested_field = arrow_table_date_timestamps_schema.find_field(part_col) partition_spec = PartitionSpec( - PartitionField(source_id=nested_field.field_id, field_id=1001, transform=transform, name=part_col) + PartitionField(source_id=nested_field.field_id, field_id=1001, transform=transform, name=part_col), ) # When @@ -482,11 +485,77 @@ def test_append_transform_partition_verify_partitions_count( assert df.where(f"{col} is not null").count() == 5, f"Expected 2 non-null rows for {col}" assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null" - assert tbl.inspect.partitions().num_rows == expected_partitions + partitions_table = tbl.inspect.partitions() + assert partitions_table.num_rows == len(expected_partitions) + assert {part[part_col] for part in partitions_table['partition'].to_pylist()} == expected_partitions + files_df = spark.sql( + f""" + SELECT * + FROM {identifier}.files + """ + ) + assert files_df.count() == len(expected_partitions) + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_append_multiple_partitions( + 
session_catalog: Catalog, + spark: SparkSession, + arrow_table_date_timestamps: pa.Table, + arrow_table_date_timestamps_schema: Schema, + format_version: int, +) -> None: + # Given + identifier = f"default.arrow_table_v{format_version}_with_multiple_partitions" + partition_spec = PartitionSpec( + PartitionField( + source_id=arrow_table_date_timestamps_schema.find_field("date").field_id, + field_id=1001, + transform=YearTransform(), + name="date_year", + ), + PartitionField( + source_id=arrow_table_date_timestamps_schema.find_field("timestamptz").field_id, + field_id=1000, + transform=HourTransform(), + name="timestamptz_hour", + ), + ) + + # When + tbl = _create_table( + session_catalog=session_catalog, + identifier=identifier, + properties={"format-version": str(format_version)}, + data=[arrow_table_date_timestamps], + partition_spec=partition_spec, + schema=arrow_table_date_timestamps_schema, + ) + + # Then + assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}" + df = spark.table(identifier) + assert df.count() == 6, f"Expected 6 total rows for {identifier}" + for col in arrow_table_date_timestamps.column_names: + assert df.where(f"{col} is not null").count() == 5, f"Expected 2 non-null rows for {col}" + assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null" + + partitions_table = tbl.inspect.partitions() + assert partitions_table.num_rows == 6 + partitions = partitions_table['partition'].to_pylist() + assert {(part["date_year"], part["timestamptz_hour"]) for part in partitions} == { + (53, 473328), + (54, 473352), + (54, 474072), + (54, 474096), + (54, 474102), + (None, None), + } files_df = spark.sql( f""" SELECT * FROM {identifier}.files """ ) - assert files_df.count() == expected_partitions + assert files_df.count() == 6 From d872245f143b919b25fda90a7b7c7fb0729a402a Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Mon, 6 May 2024 19:32:09 +0200 Subject: [PATCH 09/80] Remove trailing slash from table location when creating a table (#702) --- pyiceberg/catalog/__init__.py | 2 +- pyiceberg/catalog/rest.py | 2 + tests/catalog/test_base.py | 14 +++ tests/catalog/test_dynamodb.py | 15 +++ tests/catalog/test_glue.py | 16 +++ tests/catalog/test_hive.py | 175 +++++++++++++++++++++++++++++++++ tests/catalog/test_rest.py | 25 +++++ tests/catalog/test_sql.py | 22 +++++ 8 files changed, 270 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 18d803fe1c..5bb9ec277a 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -779,7 +779,7 @@ def _get_updated_props_and_update_summary( def _resolve_table_location(self, location: Optional[str], database_name: str, table_name: str) -> str: if not location: return self._get_default_warehouse_location(database_name, table_name) - return location + return location.rstrip("/") def _get_default_warehouse_location(self, database_name: str, table_name: str) -> str: database_properties = self.load_namespace_properties(database_name) diff --git a/pyiceberg/catalog/rest.py b/pyiceberg/catalog/rest.py index 53e3f6a123..565d809194 100644 --- a/pyiceberg/catalog/rest.py +++ b/pyiceberg/catalog/rest.py @@ -519,6 +519,8 @@ def _create_table( fresh_sort_order = assign_fresh_sort_order_ids(sort_order, iceberg_schema, fresh_schema) namespace_and_table = self._split_identifier_for_path(identifier) + if location: + location = location.rstrip("/") request = CreateTableRequest( name=namespace_and_table["table"], 
location=location, diff --git a/tests/catalog/test_base.py b/tests/catalog/test_base.py index 7d5e0a973c..06e9a8a3aa 100644 --- a/tests/catalog/test_base.py +++ b/tests/catalog/test_base.py @@ -105,6 +105,7 @@ def create_table( if not location: location = f'{self._warehouse_location}/{"/".join(identifier)}' + location = location.rstrip("/") metadata_location = self._get_metadata_location(location=location) metadata = new_table_metadata( @@ -353,6 +354,19 @@ def test_create_table_location_override(catalog: InMemoryCatalog) -> None: assert table.location() == new_location +def test_create_table_removes_trailing_slash_from_location(catalog: InMemoryCatalog) -> None: + new_location = f"{catalog._warehouse_location}/new_location" + table = catalog.create_table( + identifier=TEST_TABLE_IDENTIFIER, + schema=TEST_TABLE_SCHEMA, + location=f"{new_location}/", + partition_spec=TEST_TABLE_PARTITION_SPEC, + properties=TEST_TABLE_PROPERTIES, + ) + assert catalog.load_table(TEST_TABLE_IDENTIFIER) == table + assert table.location() == new_location + + @pytest.mark.parametrize( "schema,expected", [ diff --git a/tests/catalog/test_dynamodb.py b/tests/catalog/test_dynamodb.py index 1c647cf828..f4b16d343b 100644 --- a/tests/catalog/test_dynamodb.py +++ b/tests/catalog/test_dynamodb.py @@ -117,6 +117,21 @@ def test_create_table_with_given_location( assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) +@mock_aws +def test_create_table_removes_trailing_slash_in_location( + _bucket_initialize: None, moto_endpoint_url: str, table_schema_nested: Schema, database_name: str, table_name: str +) -> None: + catalog_name = "test_ddb_catalog" + identifier = (database_name, table_name) + test_catalog = DynamoDbCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url}) + test_catalog.create_namespace(namespace=database_name) + location = f"s3://{BUCKET_NAME}/{database_name}.db/{table_name}" + table = test_catalog.create_table(identifier=identifier, schema=table_schema_nested, location=f"{location}/") + assert table.identifier == (catalog_name,) + identifier + assert table.location() == location + assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) + + @mock_aws def test_create_table_with_no_location( _bucket_initialize: None, table_schema_nested: Schema, database_name: str, table_name: str diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 5999b192a2..5b67b92c68 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -137,6 +137,22 @@ def test_create_table_with_given_location( assert test_catalog._parse_metadata_version(table.metadata_location) == 0 +@mock_aws +def test_create_table_removes_trailing_slash_in_location( + _bucket_initialize: None, moto_endpoint_url: str, table_schema_nested: Schema, database_name: str, table_name: str +) -> None: + catalog_name = "glue" + identifier = (database_name, table_name) + test_catalog = GlueCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url}) + test_catalog.create_namespace(namespace=database_name) + location = f"s3://{BUCKET_NAME}/{database_name}.db/{table_name}" + table = test_catalog.create_table(identifier=identifier, schema=table_schema_nested, location=f"{location}/") + assert table.identifier == (catalog_name,) + identifier + assert table.location() == location + assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location) + assert test_catalog._parse_metadata_version(table.metadata_location) == 0 + + @mock_aws def test_create_table_with_pyarrow_schema( _bucket_initialize: None, diff 
--git a/tests/catalog/test_hive.py b/tests/catalog/test_hive.py index 70927ea1bc..af3a380100 100644 --- a/tests/catalog/test_hive.py +++ b/tests/catalog/test_hive.py @@ -365,6 +365,181 @@ def test_create_table( assert metadata.model_dump() == expected.model_dump() +@pytest.mark.parametrize("hive2_compatible", [True, False]) +@patch("time.time", MagicMock(return_value=12345)) +def test_create_table_with_given_location_removes_trailing_slash( + table_schema_with_all_types: Schema, hive_database: HiveDatabase, hive_table: HiveTable, hive2_compatible: bool +) -> None: + catalog = HiveCatalog(HIVE_CATALOG_NAME, uri=HIVE_METASTORE_FAKE_URL) + if hive2_compatible: + catalog = HiveCatalog(HIVE_CATALOG_NAME, uri=HIVE_METASTORE_FAKE_URL, **{"hive.hive2-compatible": "true"}) + + location = f"{hive_database.locationUri}/table-given-location" + + catalog._client = MagicMock() + catalog._client.__enter__().create_table.return_value = None + catalog._client.__enter__().get_table.return_value = hive_table + catalog._client.__enter__().get_database.return_value = hive_database + catalog.create_table( + ("default", "table"), schema=table_schema_with_all_types, properties={"owner": "javaberg"}, location=f"{location}/" + ) + + called_hive_table: HiveTable = catalog._client.__enter__().create_table.call_args[0][0] + # This one is generated within the function itself, so we need to extract + # it to construct the assert_called_with + metadata_location: str = called_hive_table.parameters["metadata_location"] + assert metadata_location.endswith(".metadata.json") + assert "/database/table-given-location/metadata/" in metadata_location + catalog._client.__enter__().create_table.assert_called_with( + HiveTable( + tableName="table", + dbName="default", + owner="javaberg", + createTime=12345, + lastAccessTime=12345, + retention=None, + sd=StorageDescriptor( + cols=[ + FieldSchema(name='boolean', type='boolean', comment=None), + FieldSchema(name='integer', type='int', comment=None), + FieldSchema(name='long', type='bigint', comment=None), + FieldSchema(name='float', type='float', comment=None), + FieldSchema(name='double', type='double', comment=None), + FieldSchema(name='decimal', type='decimal(32,3)', comment=None), + FieldSchema(name='date', type='date', comment=None), + FieldSchema(name='time', type='string', comment=None), + FieldSchema(name='timestamp', type='timestamp', comment=None), + FieldSchema( + name='timestamptz', + type='timestamp' if hive2_compatible else 'timestamp with local time zone', + comment=None, + ), + FieldSchema(name='string', type='string', comment=None), + FieldSchema(name='uuid', type='string', comment=None), + FieldSchema(name='fixed', type='binary', comment=None), + FieldSchema(name='binary', type='binary', comment=None), + FieldSchema(name='list', type='array', comment=None), + FieldSchema(name='map', type='map', comment=None), + FieldSchema(name='struct', type='struct', comment=None), + ], + location=f"{hive_database.locationUri}/table-given-location", + inputFormat="org.apache.hadoop.mapred.FileInputFormat", + outputFormat="org.apache.hadoop.mapred.FileOutputFormat", + compressed=None, + numBuckets=None, + serdeInfo=SerDeInfo( + name=None, + serializationLib="org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", + parameters=None, + description=None, + serializerClass=None, + deserializerClass=None, + serdeType=None, + ), + bucketCols=None, + sortCols=None, + parameters=None, + skewedInfo=None, + storedAsSubDirectories=None, + ), + partitionKeys=None, + parameters={"EXTERNAL": "TRUE", 
"table_type": "ICEBERG", "metadata_location": metadata_location}, + viewOriginalText=None, + viewExpandedText=None, + tableType="EXTERNAL_TABLE", + privileges=None, + temporary=False, + rewriteEnabled=None, + creationMetadata=None, + catName=None, + ownerType=1, + writeId=-1, + isStatsCompliant=None, + colStats=None, + accessType=None, + requiredReadCapabilities=None, + requiredWriteCapabilities=None, + id=None, + fileMetadata=None, + dictionary=None, + txnId=None, + ) + ) + + with open(metadata_location, encoding=UTF8) as f: + payload = f.read() + + metadata = TableMetadataUtil.parse_raw(payload) + + assert "database/table-given-location" in metadata.location + + expected = TableMetadataV2( + location=metadata.location, + table_uuid=metadata.table_uuid, + last_updated_ms=metadata.last_updated_ms, + last_column_id=22, + schemas=[ + Schema( + NestedField(field_id=1, name='boolean', field_type=BooleanType(), required=True), + NestedField(field_id=2, name='integer', field_type=IntegerType(), required=True), + NestedField(field_id=3, name='long', field_type=LongType(), required=True), + NestedField(field_id=4, name='float', field_type=FloatType(), required=True), + NestedField(field_id=5, name='double', field_type=DoubleType(), required=True), + NestedField(field_id=6, name='decimal', field_type=DecimalType(precision=32, scale=3), required=True), + NestedField(field_id=7, name='date', field_type=DateType(), required=True), + NestedField(field_id=8, name='time', field_type=TimeType(), required=True), + NestedField(field_id=9, name='timestamp', field_type=TimestampType(), required=True), + NestedField(field_id=10, name='timestamptz', field_type=TimestamptzType(), required=True), + NestedField(field_id=11, name='string', field_type=StringType(), required=True), + NestedField(field_id=12, name='uuid', field_type=UUIDType(), required=True), + NestedField(field_id=13, name='fixed', field_type=FixedType(length=12), required=True), + NestedField(field_id=14, name='binary', field_type=BinaryType(), required=True), + NestedField( + field_id=15, + name='list', + field_type=ListType(type='list', element_id=18, element_type=StringType(), element_required=True), + required=True, + ), + NestedField( + field_id=16, + name='map', + field_type=MapType( + type='map', key_id=19, key_type=StringType(), value_id=20, value_type=IntegerType(), value_required=True + ), + required=True, + ), + NestedField( + field_id=17, + name='struct', + field_type=StructType( + NestedField(field_id=21, name='inner_string', field_type=StringType(), required=False), + NestedField(field_id=22, name='inner_int', field_type=IntegerType(), required=True), + ), + required=False, + ), + schema_id=0, + identifier_field_ids=[2], + ) + ], + current_schema_id=0, + last_partition_id=999, + properties={"owner": "javaberg", 'write.parquet.compression-codec': 'zstd'}, + partition_specs=[PartitionSpec()], + default_spec_id=0, + current_snapshot_id=None, + snapshots=[], + snapshot_log=[], + metadata_log=[], + sort_orders=[SortOrder(order_id=0)], + default_sort_order_id=0, + refs={}, + format_version=2, + last_sequence_number=0, + ) + + assert metadata.model_dump() == expected.model_dump() + + @patch("time.time", MagicMock(return_value=12345)) def test_create_v1_table(table_schema_simple: Schema, hive_database: HiveDatabase, hive_table: HiveTable) -> None: catalog = HiveCatalog(HIVE_CATALOG_NAME, uri=HIVE_METASTORE_FAKE_URL) diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index 15ddb01b25..b8410d6841 100644 --- 
a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -732,6 +732,31 @@ def test_create_table_200( assert actual == expected +def test_create_table_with_given_location_removes_trailing_slash_200( + rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_no_snapshot_v1_rest_json: Dict[str, Any] +) -> None: + rest_mock.post( + f"{TEST_URI}v1/namespaces/fokko/tables", + json=example_table_metadata_no_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) + location = "s3://warehouse/database/table-custom-location" + catalog.create_table( + identifier=("fokko", "fokko2"), + schema=table_schema_simple, + location=f"{location}/", + partition_spec=PartitionSpec( + PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=3), name="id"), spec_id=1 + ), + sort_order=SortOrder(SortField(source_id=2, transform=IdentityTransform())), + properties={"owner": "fokko"}, + ) + assert rest_mock.last_request + assert rest_mock.last_request.json()["location"] == location + + def test_create_table_409(rest_mock: Mocker, table_schema_simple: Schema) -> None: rest_mock.post( f"{TEST_URI}v1/namespaces/fokko/tables", diff --git a/tests/catalog/test_sql.py b/tests/catalog/test_sql.py index 40a1566e2f..9796526887 100644 --- a/tests/catalog/test_sql.py +++ b/tests/catalog/test_sql.py @@ -264,6 +264,28 @@ def test_create_table_with_default_warehouse_location( catalog.drop_table(random_identifier) +@pytest.mark.parametrize( + 'catalog', + [ + lazy_fixture('catalog_memory'), + lazy_fixture('catalog_sqlite'), + ], +) +def test_create_table_with_given_location_removes_trailing_slash( + warehouse: Path, catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier +) -> None: + database_name, table_name = random_identifier + location = f"file://{warehouse}/{database_name}.db/{table_name}-given" + catalog.create_namespace(database_name) + catalog.create_table(random_identifier, table_schema_nested, location=f"{location}/") + table = catalog.load_table(random_identifier) + assert table.identifier == (catalog.name,) + random_identifier + assert table.metadata_location.startswith(f"file://{warehouse}") + assert os.path.exists(table.metadata_location[len("file://") :]) + assert table.location() == location + catalog.drop_table(random_identifier) + + @pytest.mark.parametrize( 'catalog', [ From a1f4ba8e6c9d21744ab16a49d0ed76619fd0e5fe Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 6 May 2024 21:09:29 -0700 Subject: [PATCH 10/80] Build: Bump mkdocs-section-index from 0.3.8 to 0.3.9 (#696) Bumps [mkdocs-section-index](https://github.com/oprypin/mkdocs-section-index) from 0.3.8 to 0.3.9. - [Release notes](https://github.com/oprypin/mkdocs-section-index/releases) - [Commits](https://github.com/oprypin/mkdocs-section-index/compare/v0.3.8...v0.3.9) --- updated-dependencies: - dependency-name: mkdocs-section-index dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- mkdocs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt index 83a067ce8f..86bfe8e060 100644 --- a/mkdocs/requirements.txt +++ b/mkdocs/requirements.txt @@ -25,4 +25,4 @@ mkdocs-autorefs==1.0.1 mkdocs-gen-files==0.5.0 mkdocs-material==9.5.20 mkdocs-material-extensions==1.3.1 -mkdocs-section-index==0.3.8 +mkdocs-section-index==0.3.9 From e2f547d54af0334d5d91a63e63f23a3c55bbd941 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 6 May 2024 21:45:35 -0700 Subject: [PATCH 11/80] Build: Bump cython from 3.0.8 to 3.0.10 (#697) Bumps [cython](https://github.com/cython/cython) from 3.0.8 to 3.0.10. - [Release notes](https://github.com/cython/cython/releases) - [Changelog](https://github.com/cython/cython/blob/master/CHANGES.rst) - [Commits](https://github.com/cython/cython/compare/3.0.8...3.0.10) --- updated-dependencies: - dependency-name: cython dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 124 ++++++++++++++++++++++++------------------------- pyproject.toml | 2 +- 2 files changed, 63 insertions(+), 63 deletions(-) diff --git a/poetry.lock b/poetry.lock index 2821d1c687..fb93b1cb4c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -885,69 +885,69 @@ test-randomorder = ["pytest-randomly"] [[package]] name = "cython" -version = "3.0.8" +version = "3.0.10" description = "The Cython compiler for writing C extensions in the Python language." 
optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "Cython-3.0.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a846e0a38e2b24e9a5c5dc74b0e54c6e29420d88d1dafabc99e0fc0f3e338636"}, - {file = "Cython-3.0.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45523fdc2b78d79b32834cc1cc12dc2ca8967af87e22a3ee1bff20e77c7f5520"}, - {file = "Cython-3.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa0b7f3f841fe087410cab66778e2d3fb20ae2d2078a2be3dffe66c6574be39"}, - {file = "Cython-3.0.8-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e87294e33e40c289c77a135f491cd721bd089f193f956f7b8ed5aa2d0b8c558f"}, - {file = "Cython-3.0.8-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:a1df7a129344b1215c20096d33c00193437df1a8fcca25b71f17c23b1a44f782"}, - {file = "Cython-3.0.8-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:13c2a5e57a0358da467d97667297bf820b62a1a87ae47c5f87938b9bb593acbd"}, - {file = "Cython-3.0.8-cp310-cp310-win32.whl", hash = "sha256:96b028f044f5880e3cb18ecdcfc6c8d3ce9d0af28418d5ab464509f26d8adf12"}, - {file = "Cython-3.0.8-cp310-cp310-win_amd64.whl", hash = "sha256:8140597a8b5cc4f119a1190f5a2228a84f5ca6d8d9ec386cfce24663f48b2539"}, - {file = "Cython-3.0.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:aae26f9663e50caf9657148403d9874eea41770ecdd6caf381d177c2b1bb82ba"}, - {file = "Cython-3.0.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:547eb3cdb2f8c6f48e6865d5a741d9dd051c25b3ce076fbca571727977b28ac3"}, - {file = "Cython-3.0.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a567d4b9ba70b26db89d75b243529de9e649a2f56384287533cf91512705bee"}, - {file = "Cython-3.0.8-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:51d1426263b0e82fb22bda8ea60dc77a428581cc19e97741011b938445d383f1"}, - {file = "Cython-3.0.8-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c26daaeccda072459b48d211415fd1e5507c06bcd976fa0d5b8b9f1063467d7b"}, - {file = "Cython-3.0.8-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:289ce7838208211cd166e975865fd73b0649bf118170b6cebaedfbdaf4a37795"}, - {file = "Cython-3.0.8-cp311-cp311-win32.whl", hash = "sha256:c8aa05f5e17f8042a3be052c24f2edc013fb8af874b0bf76907d16c51b4e7871"}, - {file = "Cython-3.0.8-cp311-cp311-win_amd64.whl", hash = "sha256:000dc9e135d0eec6ecb2b40a5b02d0868a2f8d2e027a41b0fe16a908a9e6de02"}, - {file = "Cython-3.0.8-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:90d3fe31db55685d8cb97d43b0ec39ef614fcf660f83c77ed06aa670cb0e164f"}, - {file = "Cython-3.0.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e24791ddae2324e88e3c902a765595c738f19ae34ee66bfb1a6dac54b1833419"}, - {file = "Cython-3.0.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f020fa1c0552052e0660790b8153b79e3fc9a15dbd8f1d0b841fe5d204a6ae6"}, - {file = "Cython-3.0.8-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18bfa387d7a7f77d7b2526af69a65dbd0b731b8d941aaff5becff8e21f6d7717"}, - {file = "Cython-3.0.8-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:fe81b339cffd87c0069c6049b4d33e28bdd1874625ee515785bf42c9fdff3658"}, - {file = "Cython-3.0.8-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:80fd94c076e1e1b1ee40a309be03080b75f413e8997cddcf401a118879863388"}, - {file = 
"Cython-3.0.8-cp312-cp312-win32.whl", hash = "sha256:85077915a93e359a9b920280d214dc0cf8a62773e1f3d7d30fab8ea4daed670c"}, - {file = "Cython-3.0.8-cp312-cp312-win_amd64.whl", hash = "sha256:0cb2dcc565c7851f75d496f724a384a790fab12d1b82461b663e66605bec429a"}, - {file = "Cython-3.0.8-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:870d2a0a7e3cbd5efa65aecdb38d715ea337a904ea7bb22324036e78fb7068e7"}, - {file = "Cython-3.0.8-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7e8f2454128974905258d86534f4fd4f91d2f1343605657ecab779d80c9d6d5e"}, - {file = "Cython-3.0.8-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1949d6aa7bc792554bee2b67a9fe41008acbfe22f4f8df7b6ec7b799613a4b3"}, - {file = "Cython-3.0.8-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9f2c6e1b8f3bcd6cb230bac1843f85114780bb8be8614855b1628b36bb510e0"}, - {file = "Cython-3.0.8-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:05d7eddc668ae7993643f32c7661f25544e791edb745758672ea5b1a82ecffa6"}, - {file = "Cython-3.0.8-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:bfabe115deef4ada5d23c87bddb11289123336dcc14347011832c07db616dd93"}, - {file = "Cython-3.0.8-cp36-cp36m-win32.whl", hash = "sha256:0c38c9f0bcce2df0c3347285863621be904ac6b64c5792d871130569d893efd7"}, - {file = "Cython-3.0.8-cp36-cp36m-win_amd64.whl", hash = "sha256:6c46939c3983217d140999de7c238c3141f56b1ea349e47ca49cae899969aa2c"}, - {file = "Cython-3.0.8-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:115f0a50f752da6c99941b103b5cb090da63eb206abbc7c2ad33856ffc73f064"}, - {file = "Cython-3.0.8-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9c0f29246734561c90f36e70ed0506b61aa3d044e4cc4cba559065a2a741fae"}, - {file = "Cython-3.0.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ab75242869ff71e5665fe5c96f3378e79e792fa3c11762641b6c5afbbbbe026"}, - {file = "Cython-3.0.8-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6717c06e9cfc6c1df18543cd31a21f5d8e378a40f70c851fa2d34f0597037abc"}, - {file = "Cython-3.0.8-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:9d3f74388db378a3c6fd06e79a809ed98df3f56484d317b81ee762dbf3c263e0"}, - {file = "Cython-3.0.8-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ae7ac561fd8253a9ae96311e91d12af5f701383564edc11d6338a7b60b285a6f"}, - {file = "Cython-3.0.8-cp37-cp37m-win32.whl", hash = "sha256:97b2a45845b993304f1799664fa88da676ee19442b15fdcaa31f9da7e1acc434"}, - {file = "Cython-3.0.8-cp37-cp37m-win_amd64.whl", hash = "sha256:9e2be2b340fea46fb849d378f9b80d3c08ff2e81e2bfbcdb656e2e3cd8c6b2dc"}, - {file = "Cython-3.0.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2cde23c555470db3f149ede78b518e8274853745289c956a0e06ad8d982e4db9"}, - {file = "Cython-3.0.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7990ca127e1f1beedaf8fc8bf66541d066ef4723ad7d8d47a7cbf842e0f47580"}, - {file = "Cython-3.0.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b983c8e6803f016146c26854d9150ddad5662960c804ea7f0c752c9266752f0"}, - {file = "Cython-3.0.8-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a973268d7ca1a2bdf78575e459a94a78e1a0a9bb62a7db0c50041949a73b02ff"}, - {file = "Cython-3.0.8-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:61a237bc9dd23c7faef0fcfce88c11c65d0c9bb73c74ccfa408b3a012073c20e"}, - {file = 
"Cython-3.0.8-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:3a3d67f079598af49e90ff9655bf85bd358f093d727eb21ca2708f467c489cae"}, - {file = "Cython-3.0.8-cp38-cp38-win32.whl", hash = "sha256:17a642bb01a693e34c914106566f59844b4461665066613913463a719e0dd15d"}, - {file = "Cython-3.0.8-cp38-cp38-win_amd64.whl", hash = "sha256:2cdfc32252f3b6dc7c94032ab744dcedb45286733443c294d8f909a4854e7f83"}, - {file = "Cython-3.0.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa97893d99385386925d00074654aeae3a98867f298d1e12ceaf38a9054a9bae"}, - {file = "Cython-3.0.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f05c0bf9d085c031df8f583f0d506aa3be1692023de18c45d0aaf78685bbb944"}, - {file = "Cython-3.0.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de892422582f5758bd8de187e98ac829330ec1007bc42c661f687792999988a7"}, - {file = "Cython-3.0.8-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:314f2355a1f1d06e3c431eaad4708cf10037b5e91e4b231d89c913989d0bdafd"}, - {file = "Cython-3.0.8-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:78825a3774211e7d5089730f00cdf7f473042acc9ceb8b9eeebe13ed3a5541de"}, - {file = "Cython-3.0.8-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:df8093deabc55f37028190cf5e575c26aad23fc673f34b85d5f45076bc37ce39"}, - {file = "Cython-3.0.8-cp39-cp39-win32.whl", hash = "sha256:1aca1b97e0095b3a9a6c33eada3f661a4ed0d499067d121239b193e5ba3bb4f0"}, - {file = "Cython-3.0.8-cp39-cp39-win_amd64.whl", hash = "sha256:16873d78be63bd38ffb759da7ab82814b36f56c769ee02b1d5859560e4c3ac3c"}, - {file = "Cython-3.0.8-py2.py3-none-any.whl", hash = "sha256:171b27051253d3f9108e9759e504ba59ff06e7f7ba944457f94deaf9c21bf0b6"}, - {file = "Cython-3.0.8.tar.gz", hash = "sha256:8333423d8fd5765e7cceea3a9985dd1e0a5dfeb2734629e1a2ed2d6233d39de6"}, +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" +files = [ + {file = "Cython-3.0.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e876272548d73583e90babda94c1299537006cad7a34e515a06c51b41f8657aa"}, + {file = "Cython-3.0.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:adc377aa33c3309191e617bf675fdbb51ca727acb9dc1aa23fc698d8121f7e23"}, + {file = "Cython-3.0.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:401aba1869a57aba2922ccb656a6320447e55ace42709b504c2f8e8b166f46e1"}, + {file = "Cython-3.0.10-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:541fbe725d6534a90b93f8c577eb70924d664b227a4631b90a6e0506d1469591"}, + {file = "Cython-3.0.10-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:86998b01f6a6d48398df8467292c7637e57f7e3a2ca68655367f13f66fed7734"}, + {file = "Cython-3.0.10-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d092c0ddba7e9e530a5c5be4ac06db8360258acc27675d1fc86294a5dc8994c5"}, + {file = "Cython-3.0.10-cp310-cp310-win32.whl", hash = "sha256:3cffb666e649dba23810732497442fb339ee67ba4e0be1f0579991e83fcc2436"}, + {file = "Cython-3.0.10-cp310-cp310-win_amd64.whl", hash = "sha256:9ea31184c7b3a728ef1f81fccb161d8948c05aa86c79f63b74fb6f3ddec860ec"}, + {file = "Cython-3.0.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:051069638abfb076900b0c2bcb6facf545655b3f429e80dd14365192074af5a4"}, + {file = "Cython-3.0.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:712760879600907189c7d0d346851525545484e13cd8b787e94bfd293da8ccf0"}, + {file = 
"Cython-3.0.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38d40fa1324ac47c04483d151f5e092406a147eac88a18aec789cf01c089c3f2"}, + {file = "Cython-3.0.10-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5bd49a3a9fdff65446a3e1c2bfc0ec85c6ce4c3cad27cd4ad7ba150a62b7fb59"}, + {file = "Cython-3.0.10-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e8df79b596633b8295eaa48b1157d796775c2bb078f32267d32f3001b687f2fd"}, + {file = "Cython-3.0.10-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:bcc9795990e525c192bc5c0775e441d7d56d7a7d02210451e9e13c0448dba51b"}, + {file = "Cython-3.0.10-cp311-cp311-win32.whl", hash = "sha256:09f2000041db482cad3bfce94e1fa3a4c82b0e57390a164c02566cbbda8c4f12"}, + {file = "Cython-3.0.10-cp311-cp311-win_amd64.whl", hash = "sha256:3919a55ec9b6c7db6f68a004c21c05ed540c40dbe459ced5d801d5a1f326a053"}, + {file = "Cython-3.0.10-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8f2864ab5fcd27a346f0b50f901ebeb8f60b25a60a575ccfd982e7f3e9674914"}, + {file = "Cython-3.0.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:407840c56385b9c085826fe300213e0e76ba15d1d47daf4b58569078ecb94446"}, + {file = "Cython-3.0.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a036d00caa73550a3a976432ef21c1e3fa12637e1616aab32caded35331ae96"}, + {file = "Cython-3.0.10-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9cc6a0e7e23a96dec3f3c9d39690d4281beabd5297855140d0d30855f950275e"}, + {file = "Cython-3.0.10-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a5e14a8c6a8157d2b0cdc2e8e3444905d20a0e78e19d2a097e89fb8b04b51f6b"}, + {file = "Cython-3.0.10-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f8a2b8fa0fd8358bccb5f3304be563c4750aae175100463d212d5ea0ec74cbe0"}, + {file = "Cython-3.0.10-cp312-cp312-win32.whl", hash = "sha256:2d29e617fd23cf4b83afe8f93f2966566c9f565918ad1e86a4502fe825cc0a79"}, + {file = "Cython-3.0.10-cp312-cp312-win_amd64.whl", hash = "sha256:6c5af936940a38c300977b81598d9c0901158f220a58c177820e17e1774f1cf1"}, + {file = "Cython-3.0.10-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:5f465443917d5c0f69825fca3b52b64c74ac3de0143b1fff6db8ba5b48c9fb4a"}, + {file = "Cython-3.0.10-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fadb84193c25641973666e583df8df4e27c52cdc05ddce7c6f6510d690ba34a"}, + {file = "Cython-3.0.10-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fa9e7786083b6aa61594c16979d621b62e61fcd9c2edd4761641b95c7fb34b2"}, + {file = "Cython-3.0.10-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4780d0f98ce28191c4d841c4358b5d5e79d96520650910cd59904123821c52d"}, + {file = "Cython-3.0.10-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:32fbad02d1189be75eb96456d9c73f5548078e5338d8fa153ecb0115b6ee279f"}, + {file = "Cython-3.0.10-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:90e2f514fc753b55245351305a399463103ec18666150bb1c36779b9862388e9"}, + {file = "Cython-3.0.10-cp36-cp36m-win32.whl", hash = "sha256:a9c976e9ec429539a4367cb4b24d15a1e46b925976f4341143f49f5f161171f5"}, + {file = "Cython-3.0.10-cp36-cp36m-win_amd64.whl", hash = "sha256:a9bb402674788a7f4061aeef8057632ec440123e74ed0fb425308a59afdfa10e"}, + {file = "Cython-3.0.10-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:206e803598010ecc3813db8748ed685f7beeca6c413f982df9f8a505fce56563"}, 
+ {file = "Cython-3.0.10-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15b6d397f4ee5ad54e373589522af37935a32863f1b23fa8c6922adf833e28e2"}, + {file = "Cython-3.0.10-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a181144c2f893ed8e6a994d43d0b96300bc99873f21e3b7334ca26c61c37b680"}, + {file = "Cython-3.0.10-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b74b700d6a793113d03fb54b63bdbadba6365379424bac7c0470605672769260"}, + {file = "Cython-3.0.10-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:076e9fd4e0ca33c5fa00a7479180dbfb62f17fe928e2909f82da814536e96d2b"}, + {file = "Cython-3.0.10-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:269f06e6961e8591d56e30b46e1a51b6ccb42cab04c29fa3b30d3e8723485fb4"}, + {file = "Cython-3.0.10-cp37-cp37m-win32.whl", hash = "sha256:d4e83a8ceff7af60064da4ccfce0ac82372544dd5392f1b350c34f1b04d0fae6"}, + {file = "Cython-3.0.10-cp37-cp37m-win_amd64.whl", hash = "sha256:40fac59c3a7fbcd9c25aea64c342c890a5e2270ce64a1525e840807800167799"}, + {file = "Cython-3.0.10-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f43a58bf2434870d2fc42ac2e9ff8138c9e00c6251468de279d93fa279e9ba3b"}, + {file = "Cython-3.0.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e9a885ec63d3955a08cefc4eec39fefa9fe14989c6e5e2382bd4aeb6bdb9bc3"}, + {file = "Cython-3.0.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:acfbe0fff364d54906058fc61f2393f38cd7fa07d344d80923937b87e339adcf"}, + {file = "Cython-3.0.10-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8adcde00a8a88fab27509b558cd8c2959ab0c70c65d3814cfea8c68b83fa6dcd"}, + {file = "Cython-3.0.10-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2c9c1e3e78909488f3b16fabae02308423fa6369ed96ab1e250807d344cfffd7"}, + {file = "Cython-3.0.10-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fc6e0faf5b57523b073f0cdefadcaef3a51235d519a0594865925cadb3aeadf0"}, + {file = "Cython-3.0.10-cp38-cp38-win32.whl", hash = "sha256:35f6ede7c74024ed1982832ae61c9fad7cf60cc3f5b8c6a63bb34e38bc291936"}, + {file = "Cython-3.0.10-cp38-cp38-win_amd64.whl", hash = "sha256:950c0c7b770d2a7cec74fb6f5ccc321d0b51d151f48c075c0d0db635a60ba1b5"}, + {file = "Cython-3.0.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:077b61ee789e48700e25d4a16daa4258b8e65167136e457174df400cf9b4feab"}, + {file = "Cython-3.0.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64f1f8bba9d8f37c0cffc934792b4ac7c42d0891077127c11deebe9fa0a0f7e4"}, + {file = "Cython-3.0.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:651a15a8534ebfb9b58cb0b87c269c70984b6f9c88bfe65e4f635f0e3f07dfcd"}, + {file = "Cython-3.0.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d10fc9aa82e5e53a0b7fd118f9771199cddac8feb4a6d8350b7d4109085aa775"}, + {file = "Cython-3.0.10-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4f610964ab252a83e573a427e28b103e2f1dd3c23bee54f32319f9e73c3c5499"}, + {file = "Cython-3.0.10-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8c9c4c4f3ab8f8c02817b0e16e8fa7b8cc880f76e9b63fe9c010e60c1a6c2b13"}, + {file = "Cython-3.0.10-cp39-cp39-win32.whl", hash = "sha256:0bac3ccdd4e03924028220c62ae3529e17efa8ca7e9df9330de95de02f582b26"}, + {file = "Cython-3.0.10-cp39-cp39-win_amd64.whl", hash = 
"sha256:81f356c1c8c0885b8435bfc468025f545c5d764aa9c75ab662616dd1193c331e"}, + {file = "Cython-3.0.10-py2.py3-none-any.whl", hash = "sha256:fcbb679c0b43514d591577fd0d20021c55c240ca9ccafbdb82d3fb95e5edfee2"}, + {file = "Cython-3.0.10.tar.gz", hash = "sha256:dcc96739331fb854dcf503f94607576cfe8488066c61ca50dfd55836f132de99"}, ] [[package]] @@ -4461,4 +4461,4 @@ zstandard = ["zstandard"] [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "91de7f775ff1499d79db490197eee5aadc7078b5244d86e56d8626c2615645f6" +content-hash = "2c019a99dfec370111ef19bae1ca7e00f434cec159296f5fcf4aee1b4552ba06" diff --git a/pyproject.toml b/pyproject.toml index 2682e16173..fafa5231a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -85,7 +85,7 @@ moto = { version = "^5.0.2", extras = ["server"] } typing-extensions = "4.11.0" pytest-mock = "3.14.0" pyspark = "3.5.1" -cython = "3.0.8" +cython = "3.0.10" deptry = ">=0.14,<0.17" docutils = "!=0.21" From 29beaf89b1a30922b7f41efc833b2824a349d8e6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 6 May 2024 22:39:00 -0700 Subject: [PATCH 12/80] Build: Bump tqdm from 4.66.2 to 4.66.3 (#699) Bumps [tqdm](https://github.com/tqdm/tqdm) from 4.66.2 to 4.66.3. - [Release notes](https://github.com/tqdm/tqdm/releases) - [Commits](https://github.com/tqdm/tqdm/compare/v4.66.2...v4.66.3) --- updated-dependencies: - dependency-name: tqdm dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index fb93b1cb4c..727c639efb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4062,13 +4062,13 @@ files = [ [[package]] name = "tqdm" -version = "4.66.2" +version = "4.66.3" description = "Fast, Extensible Progress Meter" -optional = true +optional = false python-versions = ">=3.7" files = [ - {file = "tqdm-4.66.2-py3-none-any.whl", hash = "sha256:1ee4f8a893eb9bef51c6e35730cebf234d5d0b6bd112b0271e10ed7c24a02bd9"}, - {file = "tqdm-4.66.2.tar.gz", hash = "sha256:6cd52cdf0fef0e0f543299cfc96fec90d7b8a7e88745f411ec33eb44d5ed3531"}, + {file = "tqdm-4.66.3-py3-none-any.whl", hash = "sha256:4f41d54107ff9a223dca80b53efe4fb654c67efaba7f47bada3ee9d50e05bd53"}, + {file = "tqdm-4.66.3.tar.gz", hash = "sha256:23097a41eba115ba99ecae40d06444c15d1c0c698d527a01c6c8bd1c5d0647e5"}, ] [package.dependencies] From 70a45f638533d6b551625d1cf234abf7fcc375ac Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 6 May 2024 22:39:18 -0700 Subject: [PATCH 13/80] Build: Bump werkzeug from 3.0.1 to 3.0.3 (#706) Bumps [werkzeug](https://github.com/pallets/werkzeug) from 3.0.1 to 3.0.3. - [Release notes](https://github.com/pallets/werkzeug/releases) - [Changelog](https://github.com/pallets/werkzeug/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/werkzeug/compare/3.0.1...3.0.3) --- updated-dependencies: - dependency-name: werkzeug dependency-type: indirect ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 727c639efb..7cb8d11523 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4157,13 +4157,13 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess [[package]] name = "werkzeug" -version = "3.0.1" +version = "3.0.3" description = "The comprehensive WSGI web application library." optional = false python-versions = ">=3.8" files = [ - {file = "werkzeug-3.0.1-py3-none-any.whl", hash = "sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10"}, - {file = "werkzeug-3.0.1.tar.gz", hash = "sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc"}, + {file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"}, + {file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"}, ] [package.dependencies] From 0eb0c1ca932c727d9e378d59035b83cd56413ccf Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 6 May 2024 22:39:56 -0700 Subject: [PATCH 14/80] Build: Bump jinja2 from 3.1.3 to 3.1.4 in /mkdocs (#707) Bumps [jinja2](https://github.com/pallets/jinja) from 3.1.3 to 3.1.4. - [Release notes](https://github.com/pallets/jinja/releases) - [Changelog](https://github.com/pallets/jinja/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/jinja/compare/3.1.3...3.1.4) --- updated-dependencies: - dependency-name: jinja2 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- mkdocs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt index 86bfe8e060..549c92f238 100644 --- a/mkdocs/requirements.txt +++ b/mkdocs/requirements.txt @@ -17,7 +17,7 @@ mkdocs==1.6.0 griffe==0.44.0 -jinja2==3.1.3 +jinja2==3.1.4 mkdocstrings==0.25.0 mkdocstrings-python==1.10.0 mkdocs-literate-nav==0.6.1 From 6a39eda3ea0992b591757f857b57c117d0b36e0b Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Tue, 7 May 2024 13:24:08 +0000 Subject: [PATCH 15/80] adopt review feedback --- Makefile | 2 +- pyiceberg/transforms.py | 18 +++++++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index de50374cfb..35051be9c1 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ test-integration: sleep 10 docker compose -f dev/docker-compose-integration.yml cp ./dev/provision.py spark-iceberg:/opt/spark/provision.py docker compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py - poetry run pytest tests/integration/test_writes/test_partitioned_writes.py -v -m integration ${PYTEST_ARGS} + poetry run pytest tests/ -v -m integration ${PYTEST_ARGS} test-integration-rebuild: docker compose -f dev/docker-compose-integration.yml kill diff --git a/pyiceberg/transforms.py b/pyiceberg/transforms.py index c8af97c301..f4d0640d43 100644 --- a/pyiceberg/transforms.py +++ b/pyiceberg/transforms.py @@ -182,6 +182,9 @@ def __eq__(self, other: Any) -> bool: def supports_pyarrow_transform(self) -> bool: return False + @abstractmethod + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], 
pa.Array]": ... + class BucketTransform(Transform[S, int]): """Base Transform class to transform a value into a bucket partition value. @@ -297,6 +300,9 @@ def __repr__(self) -> str: """Return the string representation of the BucketTransform class.""" return f"BucketTransform(num_buckets={self._num_buckets})" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + raise NotImplementedError() + class TimeResolution(IntEnum): YEAR = 6 @@ -356,9 +362,6 @@ def dedup_name(self) -> str: def preserves_order(self) -> bool: return True - @abstractmethod - def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": ... - @property def supports_pyarrow_transform(self) -> bool: return True @@ -810,6 +813,9 @@ def __repr__(self) -> str: """Return the string representation of the TruncateTransform class.""" return f"TruncateTransform(width={self._width})" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + raise NotImplementedError() + @singledispatch def _human_string(value: Any, _type: IcebergType) -> str: @@ -892,6 +898,9 @@ def __repr__(self) -> str: """Return the string representation of the UnknownTransform class.""" return f"UnknownTransform(transform={repr(self._transform)})" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + raise NotImplementedError() + class VoidTransform(Transform[S, None], Singleton): """A transform that always returns None.""" @@ -920,6 +929,9 @@ def __repr__(self) -> str: """Return the string representation of the VoidTransform class.""" return "VoidTransform()" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + raise NotImplementedError() + def _truncate_number( name: str, pred: BoundLiteralPredicate[L], transform: Callable[[Optional[L]], Optional[L]] From 990ce80ed937fa1db092e1aac2b0e87aecf34d84 Mon Sep 17 00:00:00 2001 From: Maksym Shalenyi Date: Tue, 7 May 2024 09:46:02 -0700 Subject: [PATCH 16/80] Make `add_files` to support `snapshot_properties` argument (#695) --- pyiceberg/table/__init__.py | 8 +++--- tests/integration/test_add_files.py | 40 +++++++++++++++++++++++------ 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 13186c42cc..5b7d04b543 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -443,7 +443,7 @@ def overwrite( for data_file in data_files: update_snapshot.append_data_file(data_file) - def add_files(self, file_paths: List[str]) -> None: + def add_files(self, file_paths: List[str], snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None: """ Shorthand API for adding files as data files to the table transaction. 
@@ -455,7 +455,7 @@ def add_files(self, file_paths: List[str]) -> None: """ if self._table.name_mapping() is None: self.set_properties(**{TableProperties.DEFAULT_NAME_MAPPING: self._table.schema().name_mapping.model_dump_json()}) - with self.update_snapshot().fast_append() as update_snapshot: + with self.update_snapshot(snapshot_properties=snapshot_properties).fast_append() as update_snapshot: data_files = _parquet_files_to_data_files( table_metadata=self._table.metadata, file_paths=file_paths, io=self._table.io ) @@ -1341,7 +1341,7 @@ def overwrite( with self.transaction() as tx: tx.overwrite(df=df, overwrite_filter=overwrite_filter, snapshot_properties=snapshot_properties) - def add_files(self, file_paths: List[str]) -> None: + def add_files(self, file_paths: List[str], snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None: """ Shorthand API for adding files as data files to the table. @@ -1352,7 +1352,7 @@ def add_files(self, file_paths: List[str]) -> None: FileNotFoundError: If the file does not exist. """ with self.transaction() as tx: - tx.add_files(file_paths=file_paths) + tx.add_files(file_paths=file_paths, snapshot_properties=snapshot_properties) def update_spec(self, case_sensitive: bool = True) -> UpdateSpec: return UpdateSpec(Transaction(self, autocommit=True), case_sensitive=case_sensitive) diff --git a/tests/integration/test_add_files.py b/tests/integration/test_add_files.py index 0de5d5f4ce..94c73918c8 100644 --- a/tests/integration/test_add_files.py +++ b/tests/integration/test_add_files.py @@ -17,7 +17,7 @@ # pylint:disable=redefined-outer-name from datetime import date -from typing import Optional +from typing import Iterator, Optional import pyarrow as pa import pyarrow.parquet as pq @@ -122,8 +122,13 @@ def _create_table( return tbl +@pytest.fixture(name="format_version", params=[pytest.param(1, id="format_version=1"), pytest.param(2, id="format_version=2")]) +def format_version_fixure(request: pytest.FixtureRequest) -> Iterator[int]: + """Fixture to run tests with different table format versions.""" + yield request.param + + @pytest.mark.integration -@pytest.mark.parametrize("format_version", [1, 2]) def test_add_files_to_unpartitioned_table(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None: identifier = f"default.unpartitioned_table_v{format_version}" tbl = _create_table(session_catalog, identifier, format_version) @@ -163,7 +168,6 @@ def test_add_files_to_unpartitioned_table(spark: SparkSession, session_catalog: @pytest.mark.integration -@pytest.mark.parametrize("format_version", [1, 2]) def test_add_files_to_unpartitioned_table_raises_file_not_found( spark: SparkSession, session_catalog: Catalog, format_version: int ) -> None: @@ -184,7 +188,6 @@ def test_add_files_to_unpartitioned_table_raises_file_not_found( @pytest.mark.integration -@pytest.mark.parametrize("format_version", [1, 2]) def test_add_files_to_unpartitioned_table_raises_has_field_ids( spark: SparkSession, session_catalog: Catalog, format_version: int ) -> None: @@ -205,7 +208,6 @@ def test_add_files_to_unpartitioned_table_raises_has_field_ids( @pytest.mark.integration -@pytest.mark.parametrize("format_version", [1, 2]) def test_add_files_to_unpartitioned_table_with_schema_updates( spark: SparkSession, session_catalog: Catalog, format_version: int ) -> None: @@ -263,7 +265,6 @@ def test_add_files_to_unpartitioned_table_with_schema_updates( @pytest.mark.integration -@pytest.mark.parametrize("format_version", [1, 2]) def test_add_files_to_partitioned_table(spark: 
SparkSession, session_catalog: Catalog, format_version: int) -> None: identifier = f"default.partitioned_table_v{format_version}" @@ -335,7 +336,6 @@ def test_add_files_to_partitioned_table(spark: SparkSession, session_catalog: Ca @pytest.mark.integration -@pytest.mark.parametrize("format_version", [1, 2]) def test_add_files_to_bucket_partitioned_table_fails(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None: identifier = f"default.partitioned_table_bucket_fails_v{format_version}" @@ -378,7 +378,6 @@ def test_add_files_to_bucket_partitioned_table_fails(spark: SparkSession, sessio @pytest.mark.integration -@pytest.mark.parametrize("format_version", [1, 2]) def test_add_files_to_partitioned_table_fails_with_lower_and_upper_mismatch( spark: SparkSession, session_catalog: Catalog, format_version: int ) -> None: @@ -424,3 +423,28 @@ def test_add_files_to_partitioned_table_fails_with_lower_and_upper_mismatch( "Cannot infer partition value from parquet metadata as there are more than one partition values for Partition Field: baz. lower_value=123, upper_value=124" in str(exc_info.value) ) + + +@pytest.mark.integration +def test_add_files_snapshot_properties(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None: + identifier = f"default.unpartitioned_table_v{format_version}" + tbl = _create_table(session_catalog, identifier, format_version) + + file_paths = [f"s3://warehouse/default/unpartitioned/v{format_version}/test-{i}.parquet" for i in range(5)] + # write parquet files + for file_path in file_paths: + fo = tbl.io.new_output(file_path) + with fo.create(overwrite=True) as fos: + with pq.ParquetWriter(fos, schema=ARROW_SCHEMA) as writer: + writer.write_table(ARROW_TABLE) + + # add the parquet files as data files + tbl.add_files(file_paths=file_paths, snapshot_properties={"snapshot_prop_a": "test_prop_a"}) + + # NameMapping must have been set to enable reads + assert tbl.name_mapping() is not None + + summary = spark.sql(f"SELECT * FROM {identifier}.snapshots;").collect()[0].summary + + assert "snapshot_prop_a" in summary + assert summary["snapshot_prop_a"] == "test_prop_a" From 05086675d267ee1df271a8ec631e83091b7be114 Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Tue, 7 May 2024 16:18:48 -0400 Subject: [PATCH 17/80] Add support for categorical type (#693) --- pyiceberg/io/pyarrow.py | 10 ++++++++ tests/integration/test_writes/test_writes.py | 24 ++++++++++++++++++++ tests/io/test_pyarrow_visitor.py | 14 ++++++++++++ 3 files changed, 48 insertions(+) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 72b386d25a..9216c37f15 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -731,6 +731,16 @@ def _(obj: pa.MapType, visitor: PyArrowSchemaVisitor[T]) -> T: return visitor.map(obj, key_result, value_result) +@visit_pyarrow.register(pa.DictionaryType) +def _(obj: pa.DictionaryType, visitor: PyArrowSchemaVisitor[T]) -> T: + # Parquet has no dictionary type. dictionary-encoding is handled + # as an encoding detail, not as a separate type. + # We will follow this approach in determining the Iceberg Type, + # as we only support parquet in PyIceberg for now. + logger.warning(f"Iceberg does not have a dictionary type. 
{type(obj)} will be inferred as {obj.value_type} on read.") + return visit_pyarrow(obj.value_type, visitor) + + @visit_pyarrow.register(pa.DataType) def _(obj: pa.DataType, visitor: PyArrowSchemaVisitor[T]) -> T: if pa.types.is_nested(obj): diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py index 8bebc53d92..a4a93396cc 100644 --- a/tests/integration/test_writes/test_writes.py +++ b/tests/integration/test_writes/test_writes.py @@ -315,6 +315,30 @@ def test_python_writes_special_character_column_with_spark_reads( assert spark_df.equals(pyiceberg_df) +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_python_writes_dictionary_encoded_column_with_spark_reads( + spark: SparkSession, session_catalog: Catalog, format_version: int +) -> None: + identifier = "default.python_writes_dictionary_encoded_column_with_spark_reads" + TEST_DATA = { + 'id': [1, 2, 3, 1, 1], + 'name': ['AB', 'CD', 'EF', 'CD', 'EF'], + } + pa_schema = pa.schema([ + pa.field('id', pa.dictionary(pa.int32(), pa.int32(), False)), + pa.field('name', pa.dictionary(pa.int32(), pa.string(), False)), + ]) + arrow_table = pa.Table.from_pydict(TEST_DATA, schema=pa_schema) + + tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, schema=pa_schema) + + tbl.overwrite(arrow_table) + spark_df = spark.sql(f"SELECT * FROM {identifier}").toPandas() + pyiceberg_df = tbl.scan().to_pandas() + assert spark_df.equals(pyiceberg_df) + + @pytest.mark.integration def test_write_bin_pack_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None: identifier = "default.write_bin_pack_data_files" diff --git a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py index 5b55bd61b6..46ad331aa0 100644 --- a/tests/io/test_pyarrow_visitor.py +++ b/tests/io/test_pyarrow_visitor.py @@ -39,6 +39,7 @@ DoubleType, FixedType, FloatType, + IcebergType, IntegerType, ListType, LongType, @@ -280,6 +281,19 @@ def test_pyarrow_map_to_iceberg() -> None: assert visit_pyarrow(pyarrow_map, _ConvertToIceberg()) == expected +@pytest.mark.parametrize( + "value_type, expected_result", + [ + (pa.string(), StringType()), + (pa.int32(), IntegerType()), + (pa.float64(), DoubleType()), + ], +) +def test_pyarrow_dictionary_encoded_type_to_iceberg(value_type: pa.DataType, expected_result: IcebergType) -> None: + pyarrow_dict = pa.dictionary(pa.int32(), value_type) + assert visit_pyarrow(pyarrow_dict, _ConvertToIceberg()) == expected_result + + def test_round_schema_conversion_simple(table_schema_simple: Schema) -> None: actual = str(pyarrow_to_schema(schema_to_pyarrow(table_schema_simple))) expected = """table { From 1f39b59b05340705a4171ecb0aa64703920eb9b6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 8 May 2024 15:10:09 -0600 Subject: [PATCH 18/80] Build: Bump tenacity from 8.2.3 to 8.3.0 (#714) Bumps [tenacity](https://github.com/jd/tenacity) from 8.2.3 to 8.3.0. - [Release notes](https://github.com/jd/tenacity/releases) - [Commits](https://github.com/jd/tenacity/compare/8.2.3...8.3.0) --- updated-dependencies: - dependency-name: tenacity dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7cb8d11523..a5a7f3f2cb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4019,17 +4019,18 @@ mpmath = ">=0.19" [[package]] name = "tenacity" -version = "8.2.3" +version = "8.3.0" description = "Retry code until it succeeds" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "tenacity-8.2.3-py3-none-any.whl", hash = "sha256:ce510e327a630c9e1beaf17d42e6ffacc88185044ad85cf74c0a8887c6a0f88c"}, - {file = "tenacity-8.2.3.tar.gz", hash = "sha256:5398ef0d78e63f40007c1fb4c0bff96e1911394d2fa8d194f77619c05ff6cc8a"}, + {file = "tenacity-8.3.0-py3-none-any.whl", hash = "sha256:3649f6443dbc0d9b01b9d8020a9c4ec7a1ff5f6f3c6c8a036ef371f573fe9185"}, + {file = "tenacity-8.3.0.tar.gz", hash = "sha256:953d4e6ad24357bceffbc9707bc74349aca9d245f68eb65419cf0c249a1949a2"}, ] [package.extras] -doc = ["reno", "sphinx", "tornado (>=4.5)"] +doc = ["reno", "sphinx"] +test = ["pytest", "tornado (>=4.5)", "typeguard"] [[package]] name = "thrift" @@ -4064,7 +4065,7 @@ files = [ name = "tqdm" version = "4.66.3" description = "Fast, Extensible Progress Meter" -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "tqdm-4.66.3-py3-none-any.whl", hash = "sha256:4f41d54107ff9a223dca80b53efe4fb654c67efaba7f47bada3ee9d50e05bd53"}, From 50a65e58e7d0ec807b7cdaf5585cffddc206b782 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 8 May 2024 15:11:24 -0600 Subject: [PATCH 19/80] Build: Bump mkdocstrings from 0.25.0 to 0.25.1 (#715) Bumps [mkdocstrings](https://github.com/mkdocstrings/mkdocstrings) from 0.25.0 to 0.25.1. - [Release notes](https://github.com/mkdocstrings/mkdocstrings/releases) - [Changelog](https://github.com/mkdocstrings/mkdocstrings/blob/main/CHANGELOG.md) - [Commits](https://github.com/mkdocstrings/mkdocstrings/compare/0.25.0...0.25.1) --- updated-dependencies: - dependency-name: mkdocstrings dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- mkdocs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt index 549c92f238..3f6c097fb2 100644 --- a/mkdocs/requirements.txt +++ b/mkdocs/requirements.txt @@ -18,7 +18,7 @@ mkdocs==1.6.0 griffe==0.44.0 jinja2==3.1.4 -mkdocstrings==0.25.0 +mkdocstrings==0.25.1 mkdocstrings-python==1.10.0 mkdocs-literate-nav==0.6.1 mkdocs-autorefs==1.0.1 From 3461305b47be496b2c46bb836bee25ec83edff9c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 8 May 2024 15:11:33 -0600 Subject: [PATCH 20/80] Build: Bump coverage from 7.5.0 to 7.5.1 (#713) Bumps [coverage](https://github.com/nedbat/coveragepy) from 7.5.0 to 7.5.1. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.5.0...7.5.1) --- updated-dependencies: - dependency-name: coverage dependency-type: direct:development update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 106 ++++++++++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/poetry.lock b/poetry.lock index a5a7f3f2cb..304f9df6e5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -652,63 +652,63 @@ files = [ [[package]] name = "coverage" -version = "7.5.0" +version = "7.5.1" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.8" files = [ - {file = "coverage-7.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:432949a32c3e3f820af808db1833d6d1631664d53dd3ce487aa25d574e18ad1c"}, - {file = "coverage-7.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2bd7065249703cbeb6d4ce679c734bef0ee69baa7bff9724361ada04a15b7e3b"}, - {file = "coverage-7.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbfe6389c5522b99768a93d89aca52ef92310a96b99782973b9d11e80511f932"}, - {file = "coverage-7.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:39793731182c4be939b4be0cdecde074b833f6171313cf53481f869937129ed3"}, - {file = "coverage-7.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85a5dbe1ba1bf38d6c63b6d2c42132d45cbee6d9f0c51b52c59aa4afba057517"}, - {file = "coverage-7.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:357754dcdfd811462a725e7501a9b4556388e8ecf66e79df6f4b988fa3d0b39a"}, - {file = "coverage-7.5.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a81eb64feded34f40c8986869a2f764f0fe2db58c0530d3a4afbcde50f314880"}, - {file = "coverage-7.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:51431d0abbed3a868e967f8257c5faf283d41ec882f58413cf295a389bb22e58"}, - {file = "coverage-7.5.0-cp310-cp310-win32.whl", hash = "sha256:f609ebcb0242d84b7adeee2b06c11a2ddaec5464d21888b2c8255f5fd6a98ae4"}, - {file = "coverage-7.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:6782cd6216fab5a83216cc39f13ebe30adfac2fa72688c5a4d8d180cd52e8f6a"}, - {file = "coverage-7.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e768d870801f68c74c2b669fc909839660180c366501d4cc4b87efd6b0eee375"}, - {file = "coverage-7.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:84921b10aeb2dd453247fd10de22907984eaf80901b578a5cf0bb1e279a587cb"}, - {file = "coverage-7.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:710c62b6e35a9a766b99b15cdc56d5aeda0914edae8bb467e9c355f75d14ee95"}, - {file = "coverage-7.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c379cdd3efc0658e652a14112d51a7668f6bfca7445c5a10dee7eabecabba19d"}, - {file = "coverage-7.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fea9d3ca80bcf17edb2c08a4704259dadac196fe5e9274067e7a20511fad1743"}, - {file = "coverage-7.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:41327143c5b1d715f5f98a397608f90ab9ebba606ae4e6f3389c2145410c52b1"}, - {file = "coverage-7.5.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:565b2e82d0968c977e0b0f7cbf25fd06d78d4856289abc79694c8edcce6eb2de"}, - {file = "coverage-7.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cf3539007202ebfe03923128fedfdd245db5860a36810136ad95a564a2fdffff"}, - {file = "coverage-7.5.0-cp311-cp311-win32.whl", hash = 
"sha256:bf0b4b8d9caa8d64df838e0f8dcf68fb570c5733b726d1494b87f3da85db3a2d"}, - {file = "coverage-7.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:9c6384cc90e37cfb60435bbbe0488444e54b98700f727f16f64d8bfda0b84656"}, - {file = "coverage-7.5.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fed7a72d54bd52f4aeb6c6e951f363903bd7d70bc1cad64dd1f087980d309ab9"}, - {file = "coverage-7.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cbe6581fcff7c8e262eb574244f81f5faaea539e712a058e6707a9d272fe5b64"}, - {file = "coverage-7.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad97ec0da94b378e593ef532b980c15e377df9b9608c7c6da3506953182398af"}, - {file = "coverage-7.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bd4bacd62aa2f1a1627352fe68885d6ee694bdaebb16038b6e680f2924a9b2cc"}, - {file = "coverage-7.5.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adf032b6c105881f9d77fa17d9eebe0ad1f9bfb2ad25777811f97c5362aa07f2"}, - {file = "coverage-7.5.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4ba01d9ba112b55bfa4b24808ec431197bb34f09f66f7cb4fd0258ff9d3711b1"}, - {file = "coverage-7.5.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:f0bfe42523893c188e9616d853c47685e1c575fe25f737adf473d0405dcfa7eb"}, - {file = "coverage-7.5.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a9a7ef30a1b02547c1b23fa9a5564f03c9982fc71eb2ecb7f98c96d7a0db5cf2"}, - {file = "coverage-7.5.0-cp312-cp312-win32.whl", hash = "sha256:3c2b77f295edb9fcdb6a250f83e6481c679335ca7e6e4a955e4290350f2d22a4"}, - {file = "coverage-7.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:427e1e627b0963ac02d7c8730ca6d935df10280d230508c0ba059505e9233475"}, - {file = "coverage-7.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9dd88fce54abbdbf4c42fb1fea0e498973d07816f24c0e27a1ecaf91883ce69e"}, - {file = "coverage-7.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a898c11dca8f8c97b467138004a30133974aacd572818c383596f8d5b2eb04a9"}, - {file = "coverage-7.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:07dfdd492d645eea1bd70fb1d6febdcf47db178b0d99161d8e4eed18e7f62fe7"}, - {file = "coverage-7.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d3d117890b6eee85887b1eed41eefe2e598ad6e40523d9f94c4c4b213258e4a4"}, - {file = "coverage-7.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6afd2e84e7da40fe23ca588379f815fb6dbbb1b757c883935ed11647205111cb"}, - {file = "coverage-7.5.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:a9960dd1891b2ddf13a7fe45339cd59ecee3abb6b8326d8b932d0c5da208104f"}, - {file = "coverage-7.5.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ced268e82af993d7801a9db2dbc1d2322e786c5dc76295d8e89473d46c6b84d4"}, - {file = "coverage-7.5.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e7c211f25777746d468d76f11719e64acb40eed410d81c26cefac641975beb88"}, - {file = "coverage-7.5.0-cp38-cp38-win32.whl", hash = "sha256:262fffc1f6c1a26125d5d573e1ec379285a3723363f3bd9c83923c9593a2ac25"}, - {file = "coverage-7.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:eed462b4541c540d63ab57b3fc69e7d8c84d5957668854ee4e408b50e92ce26a"}, - {file = "coverage-7.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d0194d654e360b3e6cc9b774e83235bae6b9b2cac3be09040880bb0e8a88f4a1"}, - {file = 
"coverage-7.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:33c020d3322662e74bc507fb11488773a96894aa82a622c35a5a28673c0c26f5"}, - {file = "coverage-7.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbdf2cae14a06827bec50bd58e49249452d211d9caddd8bd80e35b53cb04631"}, - {file = "coverage-7.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3235d7c781232e525b0761730e052388a01548bd7f67d0067a253887c6e8df46"}, - {file = "coverage-7.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2de4e546f0ec4b2787d625e0b16b78e99c3e21bc1722b4977c0dddf11ca84e"}, - {file = "coverage-7.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4d0e206259b73af35c4ec1319fd04003776e11e859936658cb6ceffdeba0f5be"}, - {file = "coverage-7.5.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2055c4fb9a6ff624253d432aa471a37202cd8f458c033d6d989be4499aed037b"}, - {file = "coverage-7.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:075299460948cd12722a970c7eae43d25d37989da682997687b34ae6b87c0ef0"}, - {file = "coverage-7.5.0-cp39-cp39-win32.whl", hash = "sha256:280132aada3bc2f0fac939a5771db4fbb84f245cb35b94fae4994d4c1f80dae7"}, - {file = "coverage-7.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:c58536f6892559e030e6924896a44098bc1290663ea12532c78cef71d0df8493"}, - {file = "coverage-7.5.0-pp38.pp39.pp310-none-any.whl", hash = "sha256:2b57780b51084d5223eee7b59f0d4911c31c16ee5aa12737c7a02455829ff067"}, - {file = "coverage-7.5.0.tar.gz", hash = "sha256:cf62d17310f34084c59c01e027259076479128d11e4661bb6c9acb38c5e19bb8"}, + {file = "coverage-7.5.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0884920835a033b78d1c73b6d3bbcda8161a900f38a488829a83982925f6c2e"}, + {file = "coverage-7.5.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:39afcd3d4339329c5f58de48a52f6e4e50f6578dd6099961cf22228feb25f38f"}, + {file = "coverage-7.5.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a7b0ceee8147444347da6a66be737c9d78f3353b0681715b668b72e79203e4a"}, + {file = "coverage-7.5.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a9ca3f2fae0088c3c71d743d85404cec8df9be818a005ea065495bedc33da35"}, + {file = "coverage-7.5.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fd215c0c7d7aab005221608a3c2b46f58c0285a819565887ee0b718c052aa4e"}, + {file = "coverage-7.5.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4bf0655ab60d754491004a5efd7f9cccefcc1081a74c9ef2da4735d6ee4a6223"}, + {file = "coverage-7.5.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:61c4bf1ba021817de12b813338c9be9f0ad5b1e781b9b340a6d29fc13e7c1b5e"}, + {file = "coverage-7.5.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:db66fc317a046556a96b453a58eced5024af4582a8dbdc0c23ca4dbc0d5b3146"}, + {file = "coverage-7.5.1-cp310-cp310-win32.whl", hash = "sha256:b016ea6b959d3b9556cb401c55a37547135a587db0115635a443b2ce8f1c7228"}, + {file = "coverage-7.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:df4e745a81c110e7446b1cc8131bf986157770fa405fe90e15e850aaf7619bc8"}, + {file = "coverage-7.5.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:796a79f63eca8814ca3317a1ea443645c9ff0d18b188de470ed7ccd45ae79428"}, + {file = "coverage-7.5.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4fc84a37bfd98db31beae3c2748811a3fa72bf2007ff7902f68746d9757f3746"}, + {file 
= "coverage-7.5.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6175d1a0559986c6ee3f7fccfc4a90ecd12ba0a383dcc2da30c2b9918d67d8a3"}, + {file = "coverage-7.5.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fc81d5878cd6274ce971e0a3a18a8803c3fe25457165314271cf78e3aae3aa2"}, + {file = "coverage-7.5.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:556cf1a7cbc8028cb60e1ff0be806be2eded2daf8129b8811c63e2b9a6c43bca"}, + {file = "coverage-7.5.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:9981706d300c18d8b220995ad22627647be11a4276721c10911e0e9fa44c83e8"}, + {file = "coverage-7.5.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:d7fed867ee50edf1a0b4a11e8e5d0895150e572af1cd6d315d557758bfa9c057"}, + {file = "coverage-7.5.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ef48e2707fb320c8f139424a596f5b69955a85b178f15af261bab871873bb987"}, + {file = "coverage-7.5.1-cp311-cp311-win32.whl", hash = "sha256:9314d5678dcc665330df5b69c1e726a0e49b27df0461c08ca12674bcc19ef136"}, + {file = "coverage-7.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:5fa567e99765fe98f4e7d7394ce623e794d7cabb170f2ca2ac5a4174437e90dd"}, + {file = "coverage-7.5.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b6cf3764c030e5338e7f61f95bd21147963cf6aa16e09d2f74f1fa52013c1206"}, + {file = "coverage-7.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ec92012fefebee89a6b9c79bc39051a6cb3891d562b9270ab10ecfdadbc0c34"}, + {file = "coverage-7.5.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16db7f26000a07efcf6aea00316f6ac57e7d9a96501e990a36f40c965ec7a95d"}, + {file = "coverage-7.5.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:beccf7b8a10b09c4ae543582c1319c6df47d78fd732f854ac68d518ee1fb97fa"}, + {file = "coverage-7.5.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8748731ad392d736cc9ccac03c9845b13bb07d020a33423fa5b3a36521ac6e4e"}, + {file = "coverage-7.5.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7352b9161b33fd0b643ccd1f21f3a3908daaddf414f1c6cb9d3a2fd618bf2572"}, + {file = "coverage-7.5.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:7a588d39e0925f6a2bff87154752481273cdb1736270642aeb3635cb9b4cad07"}, + {file = "coverage-7.5.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:68f962d9b72ce69ea8621f57551b2fa9c70509af757ee3b8105d4f51b92b41a7"}, + {file = "coverage-7.5.1-cp312-cp312-win32.whl", hash = "sha256:f152cbf5b88aaeb836127d920dd0f5e7edff5a66f10c079157306c4343d86c19"}, + {file = "coverage-7.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:5a5740d1fb60ddf268a3811bcd353de34eb56dc24e8f52a7f05ee513b2d4f596"}, + {file = "coverage-7.5.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e2213def81a50519d7cc56ed643c9e93e0247f5bbe0d1247d15fa520814a7cd7"}, + {file = "coverage-7.5.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5037f8fcc2a95b1f0e80585bd9d1ec31068a9bcb157d9750a172836e98bc7a90"}, + {file = "coverage-7.5.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c3721c2c9e4c4953a41a26c14f4cef64330392a6d2d675c8b1db3b645e31f0e"}, + {file = "coverage-7.5.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca498687ca46a62ae590253fba634a1fe9836bc56f626852fb2720f334c9e4e5"}, + {file = 
"coverage-7.5.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cdcbc320b14c3e5877ee79e649677cb7d89ef588852e9583e6b24c2e5072661"}, + {file = "coverage-7.5.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:57e0204b5b745594e5bc14b9b50006da722827f0b8c776949f1135677e88d0b8"}, + {file = "coverage-7.5.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:8fe7502616b67b234482c3ce276ff26f39ffe88adca2acf0261df4b8454668b4"}, + {file = "coverage-7.5.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:9e78295f4144f9dacfed4f92935fbe1780021247c2fabf73a819b17f0ccfff8d"}, + {file = "coverage-7.5.1-cp38-cp38-win32.whl", hash = "sha256:1434e088b41594baa71188a17533083eabf5609e8e72f16ce8c186001e6b8c41"}, + {file = "coverage-7.5.1-cp38-cp38-win_amd64.whl", hash = "sha256:0646599e9b139988b63704d704af8e8df7fa4cbc4a1f33df69d97f36cb0a38de"}, + {file = "coverage-7.5.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4cc37def103a2725bc672f84bd939a6fe4522310503207aae4d56351644682f1"}, + {file = "coverage-7.5.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fc0b4d8bfeabd25ea75e94632f5b6e047eef8adaed0c2161ada1e922e7f7cece"}, + {file = "coverage-7.5.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d0a0f5e06881ecedfe6f3dd2f56dcb057b6dbeb3327fd32d4b12854df36bf26"}, + {file = "coverage-7.5.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9735317685ba6ec7e3754798c8871c2f49aa5e687cc794a0b1d284b2389d1bd5"}, + {file = "coverage-7.5.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d21918e9ef11edf36764b93101e2ae8cc82aa5efdc7c5a4e9c6c35a48496d601"}, + {file = "coverage-7.5.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c3e757949f268364b96ca894b4c342b41dc6f8f8b66c37878aacef5930db61be"}, + {file = "coverage-7.5.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:79afb6197e2f7f60c4824dd4b2d4c2ec5801ceb6ba9ce5d2c3080e5660d51a4f"}, + {file = "coverage-7.5.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d1d0d98d95dd18fe29dc66808e1accf59f037d5716f86a501fc0256455219668"}, + {file = "coverage-7.5.1-cp39-cp39-win32.whl", hash = "sha256:1cc0fe9b0b3a8364093c53b0b4c0c2dd4bb23acbec4c9240b5f284095ccf7981"}, + {file = "coverage-7.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:dde0070c40ea8bb3641e811c1cfbf18e265d024deff6de52c5950677a8fb1e0f"}, + {file = "coverage-7.5.1-pp38.pp39.pp310-none-any.whl", hash = "sha256:6537e7c10cc47c595828b8a8be04c72144725c383c4702703ff4e42e44577312"}, + {file = "coverage-7.5.1.tar.gz", hash = "sha256:54de9ef3a9da981f7af93eafde4ede199e0846cd819eb27c88e2b712aae9708c"}, ] [package.dependencies] From 399a9bef095faf0ba5cdee2339c0c6ed014116fa Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 8 May 2024 15:11:39 -0600 Subject: [PATCH 21/80] Build: Bump sqlalchemy from 2.0.29 to 2.0.30 (#712) Bumps [sqlalchemy](https://github.com/sqlalchemy/sqlalchemy) from 2.0.29 to 2.0.30. - [Release notes](https://github.com/sqlalchemy/sqlalchemy/releases) - [Changelog](https://github.com/sqlalchemy/sqlalchemy/blob/main/CHANGES.rst) - [Commits](https://github.com/sqlalchemy/sqlalchemy/commits) --- updated-dependencies: - dependency-name: sqlalchemy dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 100 ++++++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/poetry.lock b/poetry.lock index 304f9df6e5..ae86cf1ecf 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3904,60 +3904,60 @@ files = [ [[package]] name = "sqlalchemy" -version = "2.0.29" +version = "2.0.30" description = "Database Abstraction Library" optional = true python-versions = ">=3.7" files = [ - {file = "SQLAlchemy-2.0.29-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4c142852ae192e9fe5aad5c350ea6befe9db14370b34047e1f0f7cf99e63c63b"}, - {file = "SQLAlchemy-2.0.29-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:99a1e69d4e26f71e750e9ad6fdc8614fbddb67cfe2173a3628a2566034e223c7"}, - {file = "SQLAlchemy-2.0.29-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ef3fbccb4058355053c51b82fd3501a6e13dd808c8d8cd2561e610c5456013c"}, - {file = "SQLAlchemy-2.0.29-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d6753305936eddc8ed190e006b7bb33a8f50b9854823485eed3a886857ab8d1"}, - {file = "SQLAlchemy-2.0.29-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0f3ca96af060a5250a8ad5a63699180bc780c2edf8abf96c58af175921df847a"}, - {file = "SQLAlchemy-2.0.29-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c4520047006b1d3f0d89e0532978c0688219857eb2fee7c48052560ae76aca1e"}, - {file = "SQLAlchemy-2.0.29-cp310-cp310-win32.whl", hash = "sha256:b2a0e3cf0caac2085ff172c3faacd1e00c376e6884b5bc4dd5b6b84623e29e4f"}, - {file = "SQLAlchemy-2.0.29-cp310-cp310-win_amd64.whl", hash = "sha256:01d10638a37460616708062a40c7b55f73e4d35eaa146781c683e0fa7f6c43fb"}, - {file = "SQLAlchemy-2.0.29-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:308ef9cb41d099099fffc9d35781638986870b29f744382904bf9c7dadd08513"}, - {file = "SQLAlchemy-2.0.29-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:296195df68326a48385e7a96e877bc19aa210e485fa381c5246bc0234c36c78e"}, - {file = "SQLAlchemy-2.0.29-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a13b917b4ffe5a0a31b83d051d60477819ddf18276852ea68037a144a506efb9"}, - {file = "SQLAlchemy-2.0.29-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f6d971255d9ddbd3189e2e79d743ff4845c07f0633adfd1de3f63d930dbe673"}, - {file = "SQLAlchemy-2.0.29-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:61405ea2d563407d316c63a7b5271ae5d274a2a9fbcd01b0aa5503635699fa1e"}, - {file = "SQLAlchemy-2.0.29-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:de7202ffe4d4a8c1e3cde1c03e01c1a3772c92858837e8f3879b497158e4cb44"}, - {file = "SQLAlchemy-2.0.29-cp311-cp311-win32.whl", hash = "sha256:b5d7ed79df55a731749ce65ec20d666d82b185fa4898430b17cb90c892741520"}, - {file = "SQLAlchemy-2.0.29-cp311-cp311-win_amd64.whl", hash = "sha256:205f5a2b39d7c380cbc3b5dcc8f2762fb5bcb716838e2d26ccbc54330775b003"}, - {file = "SQLAlchemy-2.0.29-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d96710d834a6fb31e21381c6d7b76ec729bd08c75a25a5184b1089141356171f"}, - {file = "SQLAlchemy-2.0.29-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:52de4736404e53c5c6a91ef2698c01e52333988ebdc218f14c833237a0804f1b"}, - {file = "SQLAlchemy-2.0.29-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c7b02525ede2a164c5fa5014915ba3591730f2cc831f5be9ff3b7fd3e30958e"}, - {file = 
"SQLAlchemy-2.0.29-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dfefdb3e54cd15f5d56fd5ae32f1da2d95d78319c1f6dfb9bcd0eb15d603d5d"}, - {file = "SQLAlchemy-2.0.29-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a88913000da9205b13f6f195f0813b6ffd8a0c0c2bd58d499e00a30eb508870c"}, - {file = "SQLAlchemy-2.0.29-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:fecd5089c4be1bcc37c35e9aa678938d2888845a134dd016de457b942cf5a758"}, - {file = "SQLAlchemy-2.0.29-cp312-cp312-win32.whl", hash = "sha256:8197d6f7a3d2b468861ebb4c9f998b9df9e358d6e1cf9c2a01061cb9b6cf4e41"}, - {file = "SQLAlchemy-2.0.29-cp312-cp312-win_amd64.whl", hash = "sha256:9b19836ccca0d321e237560e475fd99c3d8655d03da80c845c4da20dda31b6e1"}, - {file = "SQLAlchemy-2.0.29-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:87a1d53a5382cdbbf4b7619f107cc862c1b0a4feb29000922db72e5a66a5ffc0"}, - {file = "SQLAlchemy-2.0.29-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a0732dffe32333211801b28339d2a0babc1971bc90a983e3035e7b0d6f06b93"}, - {file = "SQLAlchemy-2.0.29-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90453597a753322d6aa770c5935887ab1fc49cc4c4fdd436901308383d698b4b"}, - {file = "SQLAlchemy-2.0.29-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:ea311d4ee9a8fa67f139c088ae9f905fcf0277d6cd75c310a21a88bf85e130f5"}, - {file = "SQLAlchemy-2.0.29-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5f20cb0a63a3e0ec4e169aa8890e32b949c8145983afa13a708bc4b0a1f30e03"}, - {file = "SQLAlchemy-2.0.29-cp37-cp37m-win32.whl", hash = "sha256:e5bbe55e8552019c6463709b39634a5fc55e080d0827e2a3a11e18eb73f5cdbd"}, - {file = "SQLAlchemy-2.0.29-cp37-cp37m-win_amd64.whl", hash = "sha256:c2f9c762a2735600654c654bf48dad388b888f8ce387b095806480e6e4ff6907"}, - {file = "SQLAlchemy-2.0.29-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7e614d7a25a43a9f54fcce4675c12761b248547f3d41b195e8010ca7297c369c"}, - {file = "SQLAlchemy-2.0.29-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:471fcb39c6adf37f820350c28aac4a7df9d3940c6548b624a642852e727ea586"}, - {file = "SQLAlchemy-2.0.29-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:988569c8732f54ad3234cf9c561364221a9e943b78dc7a4aaf35ccc2265f1930"}, - {file = "SQLAlchemy-2.0.29-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dddaae9b81c88083e6437de95c41e86823d150f4ee94bf24e158a4526cbead01"}, - {file = "SQLAlchemy-2.0.29-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:334184d1ab8f4c87f9652b048af3f7abea1c809dfe526fb0435348a6fef3d380"}, - {file = "SQLAlchemy-2.0.29-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:38b624e5cf02a69b113c8047cf7f66b5dfe4a2ca07ff8b8716da4f1b3ae81567"}, - {file = "SQLAlchemy-2.0.29-cp38-cp38-win32.whl", hash = "sha256:bab41acf151cd68bc2b466deae5deeb9e8ae9c50ad113444151ad965d5bf685b"}, - {file = "SQLAlchemy-2.0.29-cp38-cp38-win_amd64.whl", hash = "sha256:52c8011088305476691b8750c60e03b87910a123cfd9ad48576d6414b6ec2a1d"}, - {file = "SQLAlchemy-2.0.29-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3071ad498896907a5ef756206b9dc750f8e57352113c19272bdfdc429c7bd7de"}, - {file = "SQLAlchemy-2.0.29-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dba622396a3170974f81bad49aacebd243455ec3cc70615aeaef9e9613b5bca5"}, - {file = "SQLAlchemy-2.0.29-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b184e3de58009cc0bf32e20f137f1ec75a32470f5fede06c58f6c355ed42a72"}, - {file = 
"SQLAlchemy-2.0.29-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c37f1050feb91f3d6c32f864d8e114ff5545a4a7afe56778d76a9aec62638ba"}, - {file = "SQLAlchemy-2.0.29-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bda7ce59b06d0f09afe22c56714c65c957b1068dee3d5e74d743edec7daba552"}, - {file = "SQLAlchemy-2.0.29-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:25664e18bef6dc45015b08f99c63952a53a0a61f61f2e48a9e70cec27e55f699"}, - {file = "SQLAlchemy-2.0.29-cp39-cp39-win32.whl", hash = "sha256:77d29cb6c34b14af8a484e831ab530c0f7188f8efed1c6a833a2c674bf3c26ec"}, - {file = "SQLAlchemy-2.0.29-cp39-cp39-win_amd64.whl", hash = "sha256:04c487305ab035a9548f573763915189fc0fe0824d9ba28433196f8436f1449c"}, - {file = "SQLAlchemy-2.0.29-py3-none-any.whl", hash = "sha256:dc4ee2d4ee43251905f88637d5281a8d52e916a021384ec10758826f5cbae305"}, - {file = "SQLAlchemy-2.0.29.tar.gz", hash = "sha256:bd9566b8e58cabd700bc367b60e90d9349cd16f0984973f98a9a09f9c64e86f0"}, + {file = "SQLAlchemy-2.0.30-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3b48154678e76445c7ded1896715ce05319f74b1e73cf82d4f8b59b46e9c0ddc"}, + {file = "SQLAlchemy-2.0.30-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2753743c2afd061bb95a61a51bbb6a1a11ac1c44292fad898f10c9839a7f75b2"}, + {file = "SQLAlchemy-2.0.30-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a7bfc726d167f425d4c16269a9a10fe8630ff6d14b683d588044dcef2d0f6be7"}, + {file = "SQLAlchemy-2.0.30-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4f61ada6979223013d9ab83a3ed003ded6959eae37d0d685db2c147e9143797"}, + {file = "SQLAlchemy-2.0.30-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3a365eda439b7a00732638f11072907c1bc8e351c7665e7e5da91b169af794af"}, + {file = "SQLAlchemy-2.0.30-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bba002a9447b291548e8d66fd8c96a6a7ed4f2def0bb155f4f0a1309fd2735d5"}, + {file = "SQLAlchemy-2.0.30-cp310-cp310-win32.whl", hash = "sha256:0138c5c16be3600923fa2169532205d18891b28afa817cb49b50e08f62198bb8"}, + {file = "SQLAlchemy-2.0.30-cp310-cp310-win_amd64.whl", hash = "sha256:99650e9f4cf3ad0d409fed3eec4f071fadd032e9a5edc7270cd646a26446feeb"}, + {file = "SQLAlchemy-2.0.30-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:955991a09f0992c68a499791a753523f50f71a6885531568404fa0f231832aa0"}, + {file = "SQLAlchemy-2.0.30-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f69e4c756ee2686767eb80f94c0125c8b0a0b87ede03eacc5c8ae3b54b99dc46"}, + {file = "SQLAlchemy-2.0.30-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69c9db1ce00e59e8dd09d7bae852a9add716efdc070a3e2068377e6ff0d6fdaa"}, + {file = "SQLAlchemy-2.0.30-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a1429a4b0f709f19ff3b0cf13675b2b9bfa8a7e79990003207a011c0db880a13"}, + {file = "SQLAlchemy-2.0.30-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:efedba7e13aa9a6c8407c48facfdfa108a5a4128e35f4c68f20c3407e4376aa9"}, + {file = "SQLAlchemy-2.0.30-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:16863e2b132b761891d6c49f0a0f70030e0bcac4fd208117f6b7e053e68668d0"}, + {file = "SQLAlchemy-2.0.30-cp311-cp311-win32.whl", hash = "sha256:2ecabd9ccaa6e914e3dbb2aa46b76dede7eadc8cbf1b8083c94d936bcd5ffb49"}, + {file = "SQLAlchemy-2.0.30-cp311-cp311-win_amd64.whl", hash = "sha256:0b3f4c438e37d22b83e640f825ef0f37b95db9aa2d68203f2c9549375d0b2260"}, + {file = "SQLAlchemy-2.0.30-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:5a79d65395ac5e6b0c2890935bad892eabb911c4aa8e8015067ddb37eea3d56c"}, + {file = "SQLAlchemy-2.0.30-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9a5baf9267b752390252889f0c802ea13b52dfee5e369527da229189b8bd592e"}, + {file = "SQLAlchemy-2.0.30-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cb5a646930c5123f8461f6468901573f334c2c63c795b9af350063a736d0134"}, + {file = "SQLAlchemy-2.0.30-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:296230899df0b77dec4eb799bcea6fbe39a43707ce7bb166519c97b583cfcab3"}, + {file = "SQLAlchemy-2.0.30-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c62d401223f468eb4da32627bffc0c78ed516b03bb8a34a58be54d618b74d472"}, + {file = "SQLAlchemy-2.0.30-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3b69e934f0f2b677ec111b4d83f92dc1a3210a779f69bf905273192cf4ed433e"}, + {file = "SQLAlchemy-2.0.30-cp312-cp312-win32.whl", hash = "sha256:77d2edb1f54aff37e3318f611637171e8ec71472f1fdc7348b41dcb226f93d90"}, + {file = "SQLAlchemy-2.0.30-cp312-cp312-win_amd64.whl", hash = "sha256:b6c7ec2b1f4969fc19b65b7059ed00497e25f54069407a8701091beb69e591a5"}, + {file = "SQLAlchemy-2.0.30-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5a8e3b0a7e09e94be7510d1661339d6b52daf202ed2f5b1f9f48ea34ee6f2d57"}, + {file = "SQLAlchemy-2.0.30-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b60203c63e8f984df92035610c5fb76d941254cf5d19751faab7d33b21e5ddc0"}, + {file = "SQLAlchemy-2.0.30-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1dc3eabd8c0232ee8387fbe03e0a62220a6f089e278b1f0aaf5e2d6210741ad"}, + {file = "SQLAlchemy-2.0.30-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:40ad017c672c00b9b663fcfcd5f0864a0a97828e2ee7ab0c140dc84058d194cf"}, + {file = "SQLAlchemy-2.0.30-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e42203d8d20dc704604862977b1470a122e4892791fe3ed165f041e4bf447a1b"}, + {file = "SQLAlchemy-2.0.30-cp37-cp37m-win32.whl", hash = "sha256:2a4f4da89c74435f2bc61878cd08f3646b699e7d2eba97144030d1be44e27584"}, + {file = "SQLAlchemy-2.0.30-cp37-cp37m-win_amd64.whl", hash = "sha256:b6bf767d14b77f6a18b6982cbbf29d71bede087edae495d11ab358280f304d8e"}, + {file = "SQLAlchemy-2.0.30-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bc0c53579650a891f9b83fa3cecd4e00218e071d0ba00c4890f5be0c34887ed3"}, + {file = "SQLAlchemy-2.0.30-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:311710f9a2ee235f1403537b10c7687214bb1f2b9ebb52702c5aa4a77f0b3af7"}, + {file = "SQLAlchemy-2.0.30-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:408f8b0e2c04677e9c93f40eef3ab22f550fecb3011b187f66a096395ff3d9fd"}, + {file = "SQLAlchemy-2.0.30-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37a4b4fb0dd4d2669070fb05b8b8824afd0af57587393015baee1cf9890242d9"}, + {file = "SQLAlchemy-2.0.30-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:a943d297126c9230719c27fcbbeab57ecd5d15b0bd6bfd26e91bfcfe64220621"}, + {file = "SQLAlchemy-2.0.30-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0a089e218654e740a41388893e090d2e2c22c29028c9d1353feb38638820bbeb"}, + {file = "SQLAlchemy-2.0.30-cp38-cp38-win32.whl", hash = "sha256:fa561138a64f949f3e889eb9ab8c58e1504ab351d6cf55259dc4c248eaa19da6"}, + {file = "SQLAlchemy-2.0.30-cp38-cp38-win_amd64.whl", hash = "sha256:7d74336c65705b986d12a7e337ba27ab2b9d819993851b140efdf029248e818e"}, + {file = "SQLAlchemy-2.0.30-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:ae8c62fe2480dd61c532ccafdbce9b29dacc126fe8be0d9a927ca3e699b9491a"}, + {file = "SQLAlchemy-2.0.30-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2383146973a15435e4717f94c7509982770e3e54974c71f76500a0136f22810b"}, + {file = "SQLAlchemy-2.0.30-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8409de825f2c3b62ab15788635ccaec0c881c3f12a8af2b12ae4910a0a9aeef6"}, + {file = "SQLAlchemy-2.0.30-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0094c5dc698a5f78d3d1539853e8ecec02516b62b8223c970c86d44e7a80f6c7"}, + {file = "SQLAlchemy-2.0.30-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:edc16a50f5e1b7a06a2dcc1f2205b0b961074c123ed17ebda726f376a5ab0953"}, + {file = "SQLAlchemy-2.0.30-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f7703c2010355dd28f53deb644a05fc30f796bd8598b43f0ba678878780b6e4c"}, + {file = "SQLAlchemy-2.0.30-cp39-cp39-win32.whl", hash = "sha256:1f9a727312ff6ad5248a4367358e2cf7e625e98b1028b1d7ab7b806b7d757513"}, + {file = "SQLAlchemy-2.0.30-cp39-cp39-win_amd64.whl", hash = "sha256:a0ef36b28534f2a5771191be6edb44cc2673c7b2edf6deac6562400288664221"}, + {file = "SQLAlchemy-2.0.30-py3-none-any.whl", hash = "sha256:7108d569d3990c71e26a42f60474b4c02c8586c4681af5fd67e51a044fdea86a"}, + {file = "SQLAlchemy-2.0.30.tar.gz", hash = "sha256:2b1708916730f4830bc69d6f49d37f7698b5bd7530aca7f04f785f8849e95255"}, ] [package.dependencies] From 6f72e30ee3d3e79d946646bdbda8942aaff00e76 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 8 May 2024 16:11:04 -0600 Subject: [PATCH 22/80] Build: Bump flask-cors from 4.0.0 to 4.0.1 (#718) Bumps [flask-cors](https://github.com/corydolphin/flask-cors) from 4.0.0 to 4.0.1. - [Release notes](https://github.com/corydolphin/flask-cors/releases) - [Changelog](https://github.com/corydolphin/flask-cors/blob/main/CHANGELOG.md) - [Commits](https://github.com/corydolphin/flask-cors/compare/4.0.0...4.0.1) --- updated-dependencies: - dependency-name: flask-cors dependency-type: indirect ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index ae86cf1ecf..dd3c271939 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1185,13 +1185,13 @@ dotenv = ["python-dotenv"] [[package]] name = "flask-cors" -version = "4.0.0" +version = "4.0.1" description = "A Flask extension adding a decorator for CORS support" optional = false python-versions = "*" files = [ - {file = "Flask-Cors-4.0.0.tar.gz", hash = "sha256:f268522fcb2f73e2ecdde1ef45e2fd5c71cc48fe03cffb4b441c6d1b40684eb0"}, - {file = "Flask_Cors-4.0.0-py2.py3-none-any.whl", hash = "sha256:bc3492bfd6368d27cfe79c7821df5a8a319e1a6d5eab277a3794be19bdc51783"}, + {file = "Flask_Cors-4.0.1-py2.py3-none-any.whl", hash = "sha256:f2a704e4458665580c074b714c4627dd5a306b333deb9074d0b1794dfa2fb677"}, + {file = "flask_cors-4.0.1.tar.gz", hash = "sha256:eeb69b342142fdbf4766ad99357a7f3876a2ceb77689dc10ff912aac06c389e4"}, ] [package.dependencies] From d14e137b5fabcb05de8b58de4f42591674d98e48 Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Wed, 8 May 2024 22:12:57 +0000 Subject: [PATCH 23/80] comment --- pyiceberg/transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/transforms.py b/pyiceberg/transforms.py index f4d0640d43..38cc6221a2 100644 --- a/pyiceberg/transforms.py +++ b/pyiceberg/transforms.py @@ -592,7 +592,7 @@ def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Arr elif isinstance(source, TimestamptzType): epoch = datetime.EPOCH_TIMESTAMPTZ else: - raise ValueError(f"Cannot apply month transform for type: {source}") + raise ValueError(f"Cannot apply hour transform for type: {source}") return lambda v: pc.hours_between(pa.scalar(epoch), v) if v is not None else None From 4de207de395f6e3c1ac3c26eeebb7055adac829e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 8 May 2024 18:13:43 -0600 Subject: [PATCH 24/80] Build: Bump mkdocs-material from 9.5.20 to 9.5.21 (#719) --- mkdocs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt index 3f6c097fb2..16f75e3ed6 100644 --- a/mkdocs/requirements.txt +++ b/mkdocs/requirements.txt @@ -23,6 +23,6 @@ mkdocstrings-python==1.10.0 mkdocs-literate-nav==0.6.1 mkdocs-autorefs==1.0.1 mkdocs-gen-files==0.5.0 -mkdocs-material==9.5.20 +mkdocs-material==9.5.21 mkdocs-material-extensions==1.3.1 mkdocs-section-index==0.3.9 From d02d7a1a7fefe4c2b68aef68c8029a2d5fc2b253 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 8 May 2024 18:14:20 -0600 Subject: [PATCH 25/80] Build: Bump getdaft from 0.2.23 to 0.2.24 (#721) --- poetry.lock | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/poetry.lock b/poetry.lock index dd3c271939..ae5a4ea267 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1344,17 +1344,17 @@ gcsfuse = ["fusepy"] [[package]] name = "getdaft" -version = "0.2.23" +version = "0.2.24" description = "Distributed Dataframes for Multimodal Data" optional = true -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "getdaft-0.2.23-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:a59f6084ca865528b26ed478d584f98c102500005314dbc7fc44b7c4b3e18d49"}, - {file = 
"getdaft-0.2.23-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:dfaf492bb453675999d70626a8fdb6d4ecaecafbf4a0548e68105757a7a4025a"}, - {file = "getdaft-0.2.23-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d6f4dbb7f3b5d62f8df1006bf55cc657148c2a3962766e62fbd3c2df337fa32"}, - {file = "getdaft-0.2.23-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9bfa567569a8b53e9b0a7ab3eb0044afe8d5499d995bfeb40bd867661bfa2aa7"}, - {file = "getdaft-0.2.23-cp37-abi3-win_amd64.whl", hash = "sha256:533b78abefa738cac97a6823ef2b8f2df3300bf2d4bda4e8336371fc2780bbb9"}, - {file = "getdaft-0.2.23.tar.gz", hash = "sha256:c2d66e6a4ce75aeb4cedbe2c04c18fa8f3f7dcfe2799f66211f36c7be2f835a5"}, + {file = "getdaft-0.2.24-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:6dbb2c25f14c008fe1323590dc86bbed9d0de8b470aa62c0844bb218864b42da"}, + {file = "getdaft-0.2.24-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:1c27ff4e3e00275db611c8fad5edefc1a24f8494093ce18f0b846b147b4d6cd6"}, + {file = "getdaft-0.2.24-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae0d0ae1238fa5eb2ddfbefbc52e47aa6f9d00e9621dde0ecbee70be43cee8e8"}, + {file = "getdaft-0.2.24-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:473881f9406d166dace7f12a3cb74915f8901b628f6d9f0900fdf69cf05b0031"}, + {file = "getdaft-0.2.24-cp38-abi3-win_amd64.whl", hash = "sha256:c77266e55245c95a5c972dd49a47a764cde1b2007cc30ab08c2f25f7a36d6697"}, + {file = "getdaft-0.2.24.tar.gz", hash = "sha256:1fa4eae81ab101bed544ee64e3128e2df4f267a87640cd1473e00f944c32a216"}, ] [package.dependencies] From aa361d1485f4a914bc0bbc2e574becaec9a773ac Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Thu, 9 May 2024 09:29:26 -0700 Subject: [PATCH 26/80] Test, write subset of schema (#704) --- tests/integration/test_writes/test_writes.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py index a4a93396cc..74b6857dce 100644 --- a/tests/integration/test_writes/test_writes.py +++ b/tests/integration/test_writes/test_writes.py @@ -857,3 +857,15 @@ def test_sanitize_character_partitioned(catalog: Catalog) -> None: ) assert len(tbl.scan().to_arrow()) == 22 + + +@pytest.mark.parametrize("format_version", [1, 2]) +def table_write_subset_of_schema(session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int) -> None: + identifier = "default.table_append_subset_of_schema" + tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, [arrow_table_with_null]) + arrow_table_without_some_columns = arrow_table_with_null.combine_chunks().drop(arrow_table_with_null.column_names[0]) + assert len(arrow_table_without_some_columns.columns) < len(arrow_table_with_null.columns) + tbl.overwrite(arrow_table_without_some_columns) + tbl.append(arrow_table_without_some_columns) + # overwrite and then append should produce twice the data + assert len(tbl.scan().to_arrow()) == len(arrow_table_without_some_columns) * 2 From b41c98cf4f5c4363640e34e635b0b56528bd7068 Mon Sep 17 00:00:00 2001 From: Andre Luis Anastacio Date: Mon, 13 May 2024 00:15:00 -0300 Subject: [PATCH 27/80] Remove pylintrc file (#724) --- pylintrc | 565 ------------------------------------------------------- 1 file changed, 565 deletions(-) delete mode 100644 pylintrc diff --git a/pylintrc b/pylintrc deleted file mode 100644 index 9835535209..0000000000 --- a/pylintrc +++ /dev/null @@ -1,565 +0,0 @@ -# Licensed under 
the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -[MASTER] - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code. -extension-pkg-allow-list= - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code. (This is an alternative name to extension-pkg-allow-list -# for backward compatibility.) -extension-pkg-whitelist= - -# Return non-zero exit code if any of these messages/categories are detected, -# even if score is above --fail-under value. Syntax same as enable. Messages -# specified are enabled, while categories only check already-enabled messages. -fail-on= - -# Specify a score threshold to be exceeded before program exits with error. -fail-under=10.0 - -# Files or directories to be skipped. They should be base names, not paths. -ignore=CVS - -# Add files or directories matching the regex patterns to the ignore-list. The -# regex matches against paths and can be in Posix or Windows format. -ignore-paths= - -# Files or directories matching the regex patterns are skipped. The regex -# matches against base names, not paths. The default value ignores emacs file -# locks -ignore-patterns=^\.# - -# Python code to execute, usually for sys.path manipulation such as -# pygtk.require(). -#init-hook= - -# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the -# number of processors available to use. -jobs=1 - -# Control the amount of potential inferred values when inferring a single -# object. This can help the performance when dealing with large functions or -# complex, nested conditions. -limit-inference-results=100 - -# List of plugins (as comma separated values of python module names) to load, -# usually to register additional checkers. -load-plugins= - -# Pickle collected data for later comparisons. -persistent=yes - -# Minimum Python version to use for version dependent checks. Will default to -# the version used to run pylint. -py-version=3.9 - -# Discover python modules and packages in the file system subtree. -recursive=no - -# When enabled, pylint would attempt to guess common misconfiguration and emit -# user-friendly hints instead of false-positive error messages. -suggestion-mode=yes - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE, -# UNDEFINED. -confidence= - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once). 
You can also use "--disable=all" to -# disable everything first and then re-enable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use "--disable=all --enable=classes -# --disable=W". -disable=all - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -enable=W - - -[REPORTS] - -# Python expression which should return a score less than or equal to 10. You -# have access to the variables 'fatal', 'error', 'warning', 'refactor', -# 'convention', and 'info' which contain the number of messages in each -# category, as well as 'statement' which is the total number of statements -# analyzed. This score is used by the global evaluation report (RP0004). -evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details. -#msg-template= - -# Set the output format. Available formats are text, parseable, colorized, json -# and msvs (visual studio). You can also give a reporter class, e.g. -# mypackage.mymodule.MyReporterClass. -output-format=text - -# Tells whether to display a full report or only the messages. -reports=no - -# Activate the evaluation score. -score=yes - - -[REFACTORING] - -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 - -# Complete name of functions that never returns. When checking for -# inconsistent-return-statements if a never returning function is called then -# it will be considered as an explicit return statement and no message will be -# printed. -never-returning-functions=sys.exit,argparse.parse_error - - -[LOGGING] - -# The type of string formatting that logging methods do. `old` means using % -# formatting, `new` is for `{}` formatting. -logging-format-style=old - -# Logging modules to check that the string format arguments are in logging -# function parameter format. -logging-modules=logging - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=FIXME, - XXX, - TODO - -# Regular expression of note tags to take in consideration. -#notes-rgx= - - -[TYPECHECK] - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. -contextmanager-decorators=contextlib.contextmanager - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members= - -# Tells whether missing members accessed in mixin class should be ignored. A -# class is considered mixin if its name matches the mixin-class-rgx option. -ignore-mixin-members=yes - -# Tells whether to warn about missing members when the owner of the attribute -# is inferred to be None. -ignore-none=yes - -# This flag controls whether pylint should warn about no-member and similar -# checks whenever an opaque object is returned when inferring. 
The inference -# can return multiple potential results while evaluating a Python object, but -# some branches might not be evaluated, which results in partial inference. In -# that case, it might be useful to still emit no-member and other checks for -# the rest of the inferred objects. -ignore-on-opaque-inference=yes - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis). It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= - -# Show a hint with possible names when a member name was not found. The aspect -# of finding the hint is based on edit distance. -missing-member-hint=yes - -# The minimum edit distance a name should have in order to be considered a -# similar match for a missing member name. -missing-member-hint-distance=1 - -# The total number of similar names that should be taken in consideration when -# showing a hint for a missing member. -missing-member-max-choices=1 - -# Regex pattern to define which classes are considered mixins ignore-mixin- -# members is set to 'yes' -mixin-class-rgx=.*[Mm]ixin - -# List of decorators that change the signature of a decorated function. -signature-mutators= - - -[VARIABLES] - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid defining new builtins when possible. -additional-builtins= - -# Tells whether unused global variables should be treated as a violation. -allow-global-unused-variables=yes - -# List of names allowed to shadow builtins -allowed-redefined-builtins= - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_, - _cb - -# A regular expression matching the name of dummy variables (i.e. expected to -# not be used). -dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ - -# Argument names that match this expression will be ignored. Default to name -# with leading underscore. -ignored-argument-names=_.*|^ignored_|^unused_ - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# List of qualified module names which can have objects that can redefine -# builtins. -redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io - - -[FORMAT] - -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' - -# Maximum number of characters on a single line. -max-line-length=130 - -# Maximum number of lines in a module. -max-module-lines=1000 - -# Allow the body of a class to be on the same line as the declaration if body -# contains single statement. -single-line-class-stmt=no - -# Allow the body of an if to be on the same line as the test if there is no -# else. 
-single-line-if-stmt=no - - -[SIMILARITIES] - -# Comments are removed from the similarity computation -ignore-comments=yes - -# Docstrings are removed from the similarity computation -ignore-docstrings=yes - -# Imports are removed from the similarity computation -ignore-imports=no - -# Signatures are removed from the similarity computation -ignore-signatures=no - -# Minimum lines number of a similarity. -min-similarity-lines=4 - - -[STRING] - -# This flag controls whether inconsistent-quotes generates a warning when the -# character used as a quote delimiter is used inconsistently within a module. -check-quote-consistency=no - -# This flag controls whether the implicit-str-concat should generate a warning -# on implicit string concatenation in sequences defined over several lines. -check-str-concat-over-line-jumps=no - - -[BASIC] - -# Naming style matching correct argument names. -argument-naming-style=snake_case - -# Regular expression matching correct argument names. Overrides argument- -# naming-style. If left empty, argument names will be checked with the set -# naming style. -#argument-rgx= - -# Naming style matching correct attribute names. -attr-naming-style=snake_case - -# Regular expression matching correct attribute names. Overrides attr-naming- -# style. If left empty, attribute names will be checked with the set naming -# style. -#attr-rgx= - -# Bad variable names which should always be refused, separated by a comma. -bad-names=foo, - bar, - baz, - toto, - tutu, - tata - -# Bad variable names regexes, separated by a comma. If names match any regex, -# they will always be refused -bad-names-rgxs= - -# Naming style matching correct class attribute names. -class-attribute-naming-style=any - -# Regular expression matching correct class attribute names. Overrides class- -# attribute-naming-style. If left empty, class attribute names will be checked -# with the set naming style. -#class-attribute-rgx= - -# Naming style matching correct class constant names. -class-const-naming-style=UPPER_CASE - -# Regular expression matching correct class constant names. Overrides class- -# const-naming-style. If left empty, class constant names will be checked with -# the set naming style. -#class-const-rgx= - -# Naming style matching correct class names. -class-naming-style=PascalCase - -# Regular expression matching correct class names. Overrides class-naming- -# style. If left empty, class names will be checked with the set naming style. -#class-rgx= - -# Naming style matching correct constant names. -const-naming-style=UPPER_CASE - -# Regular expression matching correct constant names. Overrides const-naming- -# style. If left empty, constant names will be checked with the set naming -# style. -#const-rgx= - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=-1 - -# Naming style matching correct function names. -function-naming-style=snake_case - -# Regular expression matching correct function names. Overrides function- -# naming-style. If left empty, function names will be checked with the set -# naming style. -#function-rgx= - -# Good variable names which should always be accepted, separated by a comma. -good-names=i, - j, - k, - ex, - Run, - _ - -# Good variable names regexes, separated by a comma. If names match any regex, -# they will always be accepted -good-names-rgxs= - -# Include a hint for the correct naming format with invalid-name. -include-naming-hint=no - -# Naming style matching correct inline iteration names. 
-inlinevar-naming-style=any - -# Regular expression matching correct inline iteration names. Overrides -# inlinevar-naming-style. If left empty, inline iteration names will be checked -# with the set naming style. -#inlinevar-rgx= - -# Naming style matching correct method names. -method-naming-style=snake_case - -# Regular expression matching correct method names. Overrides method-naming- -# style. If left empty, method names will be checked with the set naming style. -#method-rgx= - -# Naming style matching correct module names. -module-naming-style=snake_case - -# Regular expression matching correct module names. Overrides module-naming- -# style. If left empty, module names will be checked with the set naming style. -#module-rgx= - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Regular expression which should only match function or class names that do -# not require a docstring. -no-docstring-rgx=^_ - -# List of decorators that produce properties, such as abc.abstractproperty. Add -# to this list to register other decorators that produce valid properties. -# These decorators are taken in consideration only for invalid-name. -property-classes=abc.abstractproperty - -# Regular expression matching correct type variable names. If left empty, type -# variable names will be checked with the set naming style. -#typevar-rgx= - -# Naming style matching correct variable names. -variable-naming-style=snake_case - -# Regular expression matching correct variable names. Overrides variable- -# naming-style. If left empty, variable names will be checked with the set -# naming style. -#variable-rgx= - - -[CLASSES] - -# Warn about protected attribute access inside special methods -check-protected-access-in-special-methods=no - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__, - __new__, - setUp, - __post_init__ - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict, - _fields, - _replace, - _source, - _make - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=cls - - -[IMPORTS] - -# List of modules that can be imported at any level, not just the top level -# one. -allow-any-import-level= - -# Allow wildcard imports from modules that define __all__. -allow-wildcard-with-all=no - -# Analyse import fallback blocks. This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - -# Deprecated modules which should not be used, separated by a comma. -deprecated-modules= - -# Output a graph (.gv or any supported image format) of external dependencies -# to the given file (report RP0402 must not be disabled). -ext-import-graph= - -# Output a graph (.gv or any supported image format) of all (i.e. internal and -# external) dependencies to the given file (report RP0402 must not be -# disabled). -import-graph= - -# Output a graph (.gv or any supported image format) of internal dependencies -# to the given file (report RP0402 must not be disabled). -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. 
-known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - -# Couples of modules and preferred modules, separated by a comma. -preferred-modules= - - -[DESIGN] - -# List of regular expressions of class ancestor names to ignore when counting -# public methods (see R0903) -exclude-too-few-public-methods= - -# List of qualified class names to ignore when counting class parents (see -# R0901) -ignored-parents= - -# Maximum number of arguments for function / method. -max-args=5 - -# Maximum number of attributes for a class (see R0902). -max-attributes=7 - -# Maximum number of boolean expressions in an if statement (see R0916). -max-bool-expr=5 - -# Maximum number of branch for function / method body. -max-branches=12 - -# Maximum number of locals for function / method body. -max-locals=15 - -# Maximum number of parents for a class (see R0901). -max-parents=7 - -# Maximum number of public methods for a class (see R0904). -max-public-methods=20 - -# Maximum number of return / yield for function / method body. -max-returns=6 - -# Maximum number of statements in function / method body. -max-statements=50 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=2 - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "BaseException, Exception". -overgeneral-exceptions=BaseException, - Exception From 444dca70658217e8344eb9c4bd92c2c364d38e16 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Mon, 13 May 2024 19:48:33 +0200 Subject: [PATCH 28/80] Add kevinjqliu to collaborators (#729) --- .asf.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.asf.yaml b/.asf.yaml index b1f557e903..209b722893 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -45,6 +45,7 @@ github: collaborators: # Note: the number of collaborators is limited to 10 - ajantha-bhat - syun64 + - kevinjqliu ghp_branch: gh-pages ghp_path: / From 7904fe593703ac744c9e1311553afa86e6252244 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 14 May 2024 14:32:13 +0200 Subject: [PATCH 29/80] Build: Bump moto from 5.0.6 to 5.0.7 (#733) Bumps [moto](https://github.com/getmoto/moto) from 5.0.6 to 5.0.7. - [Release notes](https://github.com/getmoto/moto/releases) - [Changelog](https://github.com/getmoto/moto/blob/master/CHANGELOG.md) - [Commits](https://github.com/getmoto/moto/compare/5.0.6...5.0.7) --- updated-dependencies: - dependency-name: moto dependency-type: direct:development update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/poetry.lock b/poetry.lock index ae5a4ea267..5d5279d8bb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2212,13 +2212,13 @@ test = ["mypy (>=1.0)", "pytest (>=7.0.0)"] [[package]] name = "moto" -version = "5.0.6" +version = "5.0.7" description = "" optional = false python-versions = ">=3.8" files = [ - {file = "moto-5.0.6-py2.py3-none-any.whl", hash = "sha256:ca1e22831a741733b581ff2ef4d6ae2e1c6db1eab97af1b78b86ca2c6e88c609"}, - {file = "moto-5.0.6.tar.gz", hash = "sha256:ad8b23f2b555ad694da8b2432a42b6d96beaaf67a4e7d932196a72193a2eee2c"}, + {file = "moto-5.0.7-py2.py3-none-any.whl", hash = "sha256:c0214c1361fb1dc85f587d9ce17cd988c6f69ff0ed54d43789654022e0e744f2"}, + {file = "moto-5.0.7.tar.gz", hash = "sha256:f2cde691dc4bc675e318a65f018902ac7f89d61bf2646052f7df215d212f069e"}, ] [package.dependencies] @@ -2237,7 +2237,7 @@ joserfc = {version = ">=0.9.0", optional = true, markers = "extra == \"server\"" jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} jsonpath-ng = {version = "*", optional = true, markers = "extra == \"server\""} openapi-spec-validator = {version = ">=0.5.0", optional = true, markers = "extra == \"server\""} -py-partiql-parser = {version = "0.5.4", optional = true, markers = "extra == \"server\""} +py-partiql-parser = {version = "0.5.5", optional = true, markers = "extra == \"server\""} pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""} python-dateutil = ">=2.1,<3.0.0" PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} @@ -2248,23 +2248,23 @@ werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +all = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.5)", "pyparsing (>=3.0.7)", "setuptools"] apigateway = ["PyYAML (>=5.1)", "joserfc (>=0.9.0)", "openapi-spec-validator (>=0.5.0)"] apigatewayv2 = ["PyYAML (>=5.1)", "openapi-spec-validator (>=0.5.0)"] appsync = ["graphql-core"] awslambda = ["docker (>=3.0.0)"] batch = ["docker (>=3.0.0)"] -cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.5)", "pyparsing (>=3.0.7)", "setuptools"] cognitoidp = ["joserfc (>=0.9.0)"] -dynamodb = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"] -dynamodbstreams = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"] +dynamodb = ["docker (>=3.0.0)", "py-partiql-parser 
(==0.5.5)"] +dynamodbstreams = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.5)"] glue = ["pyparsing (>=3.0.7)"] iotdata = ["jsondiff (>=1.1.2)"] -proxy = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] -resourcegroupstaggingapi = ["PyYAML (>=5.1)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)"] -s3 = ["PyYAML (>=5.1)", "py-partiql-parser (==0.5.4)"] -s3crc32c = ["PyYAML (>=5.1)", "crc32c", "py-partiql-parser (==0.5.4)"] -server = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +proxy = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.5)", "pyparsing (>=3.0.7)", "setuptools"] +resourcegroupstaggingapi = ["PyYAML (>=5.1)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.5)", "pyparsing (>=3.0.7)"] +s3 = ["PyYAML (>=5.1)", "py-partiql-parser (==0.5.5)"] +s3crc32c = ["PyYAML (>=5.1)", "crc32c", "py-partiql-parser (==0.5.5)"] +server = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.5)", "pyparsing (>=3.0.7)", "setuptools"] ssm = ["PyYAML (>=5.1)"] stepfunctions = ["antlr4-python3-runtime", "jsonpath-ng"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] @@ -2915,13 +2915,13 @@ files = [ [[package]] name = "py-partiql-parser" -version = "0.5.4" +version = "0.5.5" description = "Pure Python PartiQL Parser" optional = false python-versions = "*" files = [ - {file = "py_partiql_parser-0.5.4-py2.py3-none-any.whl", hash = "sha256:3dc4295a47da9587681a96b35c6e151886fdbd0a4acbe0d97c4c68e5f689d315"}, - {file = "py_partiql_parser-0.5.4.tar.gz", hash = "sha256:72e043919538fa63edae72fb59afc7e3fd93adbde656718a7d2b4666f23dd114"}, + {file = "py_partiql_parser-0.5.5-py2.py3-none-any.whl", hash = "sha256:90d278818385bd60c602410c953ee78f04ece599d8cd21c656fc5e47399577a1"}, + {file = "py_partiql_parser-0.5.5.tar.gz", hash = "sha256:ed07f8edf4b55e295cab4f5fd3e2ba3196cee48a43fe210d53ddd6ffce1cf1ff"}, ] [package.extras] From 0d98ec82e01a56f589b7515b3d4a7c6d90612118 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 14 May 2024 14:32:27 +0200 Subject: [PATCH 30/80] Build: Bump mkdocs-material from 9.5.21 to 9.5.22 (#732) Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.5.21 to 9.5.22. 
- [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.5.21...9.5.22) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- mkdocs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt index 16f75e3ed6..6c98b8d9cf 100644 --- a/mkdocs/requirements.txt +++ b/mkdocs/requirements.txt @@ -23,6 +23,6 @@ mkdocstrings-python==1.10.0 mkdocs-literate-nav==0.6.1 mkdocs-autorefs==1.0.1 mkdocs-gen-files==0.5.0 -mkdocs-material==9.5.21 +mkdocs-material==9.5.22 mkdocs-material-extensions==1.3.1 mkdocs-section-index==0.3.9 From 6c2ba346d950b7784a7e083f6da5655850999bbd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 14 May 2024 14:32:44 +0200 Subject: [PATCH 31/80] Build: Bump griffe from 0.44.0 to 0.45.0 (#731) Bumps [griffe](https://github.com/mkdocstrings/griffe) from 0.44.0 to 0.45.0. - [Release notes](https://github.com/mkdocstrings/griffe/releases) - [Changelog](https://github.com/mkdocstrings/griffe/blob/main/CHANGELOG.md) - [Commits](https://github.com/mkdocstrings/griffe/compare/0.44.0...0.45.0) --- updated-dependencies: - dependency-name: griffe dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- mkdocs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt index 6c98b8d9cf..2710d51227 100644 --- a/mkdocs/requirements.txt +++ b/mkdocs/requirements.txt @@ -16,7 +16,7 @@ # under the License. mkdocs==1.6.0 -griffe==0.44.0 +griffe==0.45.0 jinja2==3.1.4 mkdocstrings==0.25.1 mkdocstrings-python==1.10.0 From 20b7b5339a1a0ab59d884c7d042c4bc96a166b11 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 14 May 2024 14:33:01 +0200 Subject: [PATCH 32/80] Build: Bump pypa/cibuildwheel from 2.17.0 to 2.18.0 (#730) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.17.0 to 2.18.0. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.17.0...v2.18.0) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/python-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-release.yml b/.github/workflows/python-release.yml index 54446049a4..37f28a76c5 100644 --- a/.github/workflows/python-release.yml +++ b/.github/workflows/python-release.yml @@ -59,7 +59,7 @@ jobs: if: startsWith(matrix.os, 'ubuntu') - name: Build wheels - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.18.0 with: output-dir: wheelhouse config-file: "pyproject.toml" From 6d52325b45ed5026c9aaad7556705a608891d8c0 Mon Sep 17 00:00:00 2001 From: frankliee Date: Wed, 15 May 2024 15:13:32 +0800 Subject: [PATCH 33/80] Hive catalog: Add retry logic for hive locking (#701) --- pyiceberg/catalog/hive.py | 61 +++++++++++++++++++++++++++++++-- pyiceberg/exceptions.py | 4 +++ pyiceberg/table/__init__.py | 10 ++++++ tests/catalog/test_hive.py | 39 ++++++++++++++++++++- tests/integration/test_reads.py | 31 +++++++++++++++++ 5 files changed, 142 insertions(+), 3 deletions(-) diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index 804b1105cc..708ae8c9d4 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import getpass +import logging import socket import time from types import TracebackType @@ -33,6 +34,7 @@ from hive_metastore.ThriftHiveMetastore import Client from hive_metastore.ttypes import ( AlreadyExistsException, + CheckLockRequest, FieldSchema, InvalidOperationException, LockComponent, @@ -49,6 +51,7 @@ ) from hive_metastore.ttypes import Database as HiveDatabase from hive_metastore.ttypes import Table as HiveTable +from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential from thrift.protocol import TBinaryProtocol from thrift.transport import TSocket, TTransport @@ -69,12 +72,20 @@ NoSuchNamespaceError, NoSuchTableError, TableAlreadyExistsError, + WaitingForLockException, ) from pyiceberg.io import FileIO, load_file_io from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema, SchemaVisitor, visit from pyiceberg.serializers import FromInputFile -from pyiceberg.table import CommitTableRequest, CommitTableResponse, PropertyUtil, Table, TableProperties, update_table_metadata +from pyiceberg.table import ( + CommitTableRequest, + CommitTableResponse, + PropertyUtil, + Table, + TableProperties, + update_table_metadata, +) from pyiceberg.table.metadata import new_table_metadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties @@ -111,6 +122,15 @@ HIVE2_COMPATIBLE = "hive.hive2-compatible" HIVE2_COMPATIBLE_DEFAULT = False +LOCK_CHECK_MIN_WAIT_TIME = "lock-check-min-wait-time" +LOCK_CHECK_MAX_WAIT_TIME = "lock-check-max-wait-time" +LOCK_CHECK_RETRIES = "lock-check-retries" +DEFAULT_LOCK_CHECK_MIN_WAIT_TIME = 0.1 # 100 milliseconds +DEFAULT_LOCK_CHECK_MAX_WAIT_TIME = 60 # 1 min +DEFAULT_LOCK_CHECK_RETRIES = 4 + +logger = logging.getLogger(__name__) + class _HiveClient: """Helper class to nicely open and close the transport.""" @@ -240,6 +260,18 @@ def __init__(self, name: str, **properties: str): super().__init__(name, **properties) self._client = _HiveClient(properties["uri"], properties.get("ugi")) + self._lock_check_min_wait_time = 
PropertyUtil.property_as_float( + properties, LOCK_CHECK_MIN_WAIT_TIME, DEFAULT_LOCK_CHECK_MIN_WAIT_TIME + ) + self._lock_check_max_wait_time = PropertyUtil.property_as_float( + properties, LOCK_CHECK_MAX_WAIT_TIME, DEFAULT_LOCK_CHECK_MAX_WAIT_TIME + ) + self._lock_check_retries = PropertyUtil.property_as_float( + properties, + LOCK_CHECK_RETRIES, + DEFAULT_LOCK_CHECK_RETRIES, + ) + def _convert_hive_into_iceberg(self, table: HiveTable, io: FileIO) -> Table: properties: Dict[str, str] = table.parameters if TABLE_TYPE not in properties: @@ -356,6 +388,26 @@ def _create_lock_request(self, database_name: str, table_name: str) -> LockReque return lock_request + def _wait_for_lock(self, database_name: str, table_name: str, lockid: int, open_client: Client) -> LockResponse: + @retry( + retry=retry_if_exception_type(WaitingForLockException), + wait=wait_exponential(multiplier=2, min=self._lock_check_min_wait_time, max=self._lock_check_max_wait_time), + stop=stop_after_attempt(self._lock_check_retries), + reraise=True, + ) + def _do_wait_for_lock() -> LockResponse: + response: LockResponse = open_client.check_lock(CheckLockRequest(lockid=lockid)) + if response.state == LockState.ACQUIRED: + return response + elif response.state == LockState.WAITING: + msg = f"Wait on lock for {database_name}.{table_name}" + logger.warning(msg) + raise WaitingForLockException(msg) + else: + raise CommitFailedException(f"Failed to check lock for {database_name}.{table_name}, state: {response.state}") + + return _do_wait_for_lock() + def _commit_table(self, table_request: CommitTableRequest) -> CommitTableResponse: """Update the table. @@ -380,7 +432,10 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons try: if lock.state != LockState.ACQUIRED: - raise CommitFailedException(f"Failed to acquire lock for {table_request.identifier}, state: {lock.state}") + if lock.state == LockState.WAITING: + self._wait_for_lock(database_name, table_name, lock.lockid, open_client) + else: + raise CommitFailedException(f"Failed to acquire lock for {table_request.identifier}, state: {lock.state}") hive_table = open_client.get_table(dbname=database_name, tbl_name=table_name) io = load_file_io({**self.properties, **hive_table.parameters}, hive_table.sd.location) @@ -406,6 +461,8 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons open_client.alter_table(dbname=database_name, tbl_name=table_name, new_tbl=hive_table) except NoSuchObjectException as e: raise NoSuchTableError(f"Table does not exist: {table_name}") from e + except WaitingForLockException as e: + raise CommitFailedException(f"Failed to acquire lock for {table_request.identifier}, state: {lock.state}") from e finally: open_client.unlock(UnlockRequest(lockid=lock.lockid)) diff --git a/pyiceberg/exceptions.py b/pyiceberg/exceptions.py index 64356b11a4..c7e37ba7ca 100644 --- a/pyiceberg/exceptions.py +++ b/pyiceberg/exceptions.py @@ -110,3 +110,7 @@ class CommitFailedException(Exception): class CommitStateUnknownException(RESTError): """Commit failed due to unknown reason.""" + + +class WaitingForLockException(Exception): + """Need to wait for a lock, try again.""" diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 5b7d04b543..c57f0d1297 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -251,6 +251,16 @@ def property_as_int(properties: Dict[str, str], property_name: str, default: Opt else: return default + @staticmethod + def property_as_float(properties: Dict[str, 
str], property_name: str, default: Optional[float] = None) -> Optional[float]: + if value := properties.get(property_name): + try: + return float(value) + except ValueError as e: + raise ValueError(f"Could not parse table property {property_name} to a float: {value}") from e + else: + return default + @staticmethod def property_as_bool(properties: Dict[str, str], property_name: str, default: bool) -> bool: if value := properties.get(property_name): diff --git a/tests/catalog/test_hive.py b/tests/catalog/test_hive.py index af3a380100..ef662b3aff 100644 --- a/tests/catalog/test_hive.py +++ b/tests/catalog/test_hive.py @@ -24,6 +24,8 @@ AlreadyExistsException, FieldSchema, InvalidOperationException, + LockResponse, + LockState, MetaException, NoSuchObjectException, SerDeInfo, @@ -34,12 +36,19 @@ from hive_metastore.ttypes import Table as HiveTable from pyiceberg.catalog import PropertiesUpdateSummary -from pyiceberg.catalog.hive import HiveCatalog, _construct_hive_storage_descriptor +from pyiceberg.catalog.hive import ( + LOCK_CHECK_MAX_WAIT_TIME, + LOCK_CHECK_MIN_WAIT_TIME, + LOCK_CHECK_RETRIES, + HiveCatalog, + _construct_hive_storage_descriptor, +) from pyiceberg.exceptions import ( NamespaceAlreadyExistsError, NamespaceNotEmptyError, NoSuchNamespaceError, NoSuchTableError, + WaitingForLockException, ) from pyiceberg.partitioning import PartitionField, PartitionSpec from pyiceberg.schema import Schema @@ -1158,3 +1167,31 @@ def test_resolve_table_location_warehouse(hive_database: HiveDatabase) -> None: location = catalog._resolve_table_location(None, "database", "table") assert location == "/tmp/warehouse/database.db/table" + + +def test_hive_wait_for_lock() -> None: + lockid = 12345 + acquired = LockResponse(lockid=lockid, state=LockState.ACQUIRED) + waiting = LockResponse(lockid=lockid, state=LockState.WAITING) + prop = { + "uri": HIVE_METASTORE_FAKE_URL, + LOCK_CHECK_MIN_WAIT_TIME: 0.1, + LOCK_CHECK_MAX_WAIT_TIME: 0.5, + LOCK_CHECK_RETRIES: 5, + } + catalog = HiveCatalog(HIVE_CATALOG_NAME, **prop) # type: ignore + catalog._client = MagicMock() + catalog._client.lock.return_value = LockResponse(lockid=lockid, state=LockState.WAITING) + + # lock will be acquired after 3 retries + catalog._client.check_lock.side_effect = [waiting if i < 2 else acquired for i in range(10)] + response: LockResponse = catalog._wait_for_lock("db", "tbl", lockid, catalog._client) + assert response.state == LockState.ACQUIRED + assert catalog._client.check_lock.call_count == 3 + + # lock wait should exit with WaitingForLockException finally after enough retries + catalog._client.check_lock.side_effect = [waiting for _ in range(10)] + catalog._client.check_lock.call_count = 0 + with pytest.raises(WaitingForLockException): + catalog._wait_for_lock("db", "tbl", lockid, catalog._client) + assert catalog._client.check_lock.call_count == 5 diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index ee9b17e438..2a10e37ba9 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -17,6 +17,7 @@ # pylint:disable=redefined-outer-name import math +import time import uuid from urllib.parse import urlparse @@ -48,6 +49,7 @@ StringType, TimestampType, ) +from pyiceberg.utils.concurrent import ExecutorFactory DEFAULT_PROPERTIES = {'write.parquet.compression-codec': 'zstd'} @@ -506,3 +508,32 @@ def test_hive_locking(session_catalog_hive: HiveCatalog) -> None: table.transaction().set_properties(lock="fail").commit_transaction() finally: 
open_client.unlock(UnlockRequest(lock.lockid)) + + +@pytest.mark.integration +def test_hive_locking_with_retry(session_catalog_hive: HiveCatalog) -> None: + table = create_table(session_catalog_hive) + database_name: str + table_name: str + _, database_name, table_name = table.identifier + session_catalog_hive._lock_check_min_wait_time = 0.1 + session_catalog_hive._lock_check_max_wait_time = 0.5 + session_catalog_hive._lock_check_retries = 5 + + hive_client: _HiveClient = _HiveClient(session_catalog_hive.properties["uri"]) + + executor = ExecutorFactory.get_or_create() + + with hive_client as open_client: + + def another_task() -> None: + lock: LockResponse = open_client.lock(session_catalog_hive._create_lock_request(database_name, table_name)) + time.sleep(1) + open_client.unlock(UnlockRequest(lock.lockid)) + + # test transaction commit with concurrent locking + executor.submit(another_task) + time.sleep(0.5) + + table.transaction().set_properties(lock="xxx").commit_transaction() + assert table.properties.get("lock") == "xxx" From a268e5bdd3f3b2eee36e00bc43578ef6c28c7e51 Mon Sep 17 00:00:00 2001 From: Andre Luis Anastacio Date: Wed, 15 May 2024 04:14:04 -0300 Subject: [PATCH 34/80] Add create_namespace_if_not_exists method (#725) --- pyiceberg/catalog/__init__.py | 20 +++++++++++++++++++- tests/catalog/integration_test_dynamodb.py | 6 ++++++ tests/catalog/integration_test_glue.py | 6 ++++++ tests/catalog/test_rest.py | 18 ++++++++++++++++++ tests/catalog/test_sql.py | 14 ++++++++++++++ 5 files changed, 63 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 5bb9ec277a..0b70fe32e1 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -36,7 +36,13 @@ cast, ) -from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchTableError, NotInstalledError, TableAlreadyExistsError +from pyiceberg.exceptions import ( + NamespaceAlreadyExistsError, + NoSuchNamespaceError, + NoSuchTableError, + NotInstalledError, + TableAlreadyExistsError, +) from pyiceberg.io import FileIO, load_file_io from pyiceberg.manifest import ManifestFile from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec @@ -477,6 +483,18 @@ def create_namespace(self, namespace: Union[str, Identifier], properties: Proper NamespaceAlreadyExistsError: If a namespace with the given name already exists. """ + def create_namespace_if_not_exists(self, namespace: Union[str, Identifier], properties: Properties = EMPTY_DICT) -> None: + """Create a namespace if it does not exist. + + Args: + namespace (str | Identifier): Namespace identifier. + properties (Properties): A string dictionary of properties for the given namespace. + """ + try: + self.create_namespace(namespace, properties) + except NamespaceAlreadyExistsError: + pass + @abstractmethod def drop_namespace(self, namespace: Union[str, Identifier]) -> None: """Drop a namespace. 
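Editorial note on the `create_namespace_if_not_exists` helper added above: it simply swallows `NamespaceAlreadyExistsError`, so it can be used as an idempotent setup step. A minimal usage sketch — the catalog and namespace names here are illustrative and assume a catalog already configured for `load_catalog`:

```python
from pyiceberg.catalog import load_catalog

# Illustrative names; any configured catalog behaves the same way.
catalog = load_catalog("default")

# Idempotent setup: the first call creates the namespace, later calls are
# no-ops instead of raising NamespaceAlreadyExistsError.
catalog.create_namespace_if_not_exists("examples", properties={"owner": "data-eng"})
```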
diff --git a/tests/catalog/integration_test_dynamodb.py b/tests/catalog/integration_test_dynamodb.py index 5b9584c69f..05d51bb0ef 100644 --- a/tests/catalog/integration_test_dynamodb.py +++ b/tests/catalog/integration_test_dynamodb.py @@ -184,6 +184,12 @@ def test_create_duplicate_namespace(test_catalog: Catalog, database_name: str) - test_catalog.create_namespace(database_name) +def test_create_namepsace_if_not_exists(test_catalog: Catalog, database_name: str) -> None: + test_catalog.create_namespace(database_name) + test_catalog.create_namespace_if_not_exists(database_name) + assert (database_name,) in test_catalog.list_namespaces() + + def test_create_namespace_with_comment_and_location(test_catalog: Catalog, database_name: str) -> None: test_location = get_s3_path(get_bucket_name(), database_name) test_properties = { diff --git a/tests/catalog/integration_test_glue.py b/tests/catalog/integration_test_glue.py index a2c430de5f..5b4aa58787 100644 --- a/tests/catalog/integration_test_glue.py +++ b/tests/catalog/integration_test_glue.py @@ -291,6 +291,12 @@ def test_create_duplicate_namespace(test_catalog: Catalog, database_name: str) - test_catalog.create_namespace(database_name) +def test_create_namespace_if_not_exists(test_catalog: Catalog, database_name: str) -> None: + test_catalog.create_namespace(database_name) + test_catalog.create_namespace_if_not_exists(database_name) + assert (database_name,) in test_catalog.list_namespaces() + + def test_create_namespace_with_comment_and_location(test_catalog: Catalog, database_name: str) -> None: test_location = get_s3_path(get_bucket_name(), database_name) test_properties = { diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index b8410d6841..ec5a6a22a4 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -500,6 +500,24 @@ def test_create_namespace_200(rest_mock: Mocker) -> None: RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN).create_namespace(namespace) +def test_create_namespace_if_exists_409(rest_mock: Mocker) -> None: + namespace = "examples" + rest_mock.post( + f"{TEST_URI}v1/namespaces", + json={ + "error": { + "message": "Namespace already exists: fokko in warehouse 8bcb0838-50fc-472d-9ddb-8feb89ef5f1e", + "type": "AlreadyExistsException", + "code": 409, + } + }, + status_code=409, + request_headers=TEST_HEADERS, + ) + + RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN).create_namespace_if_not_exists(namespace) + + def test_create_namespace_409(rest_mock: Mocker) -> None: namespace = "examples" rest_mock.post( diff --git a/tests/catalog/test_sql.py b/tests/catalog/test_sql.py index 9796526887..efa7b746a9 100644 --- a/tests/catalog/test_sql.py +++ b/tests/catalog/test_sql.py @@ -633,6 +633,20 @@ def test_create_namespace(catalog: SqlCatalog, database_name: str) -> None: assert (database_name,) in catalog.list_namespaces() +@pytest.mark.parametrize( + 'catalog', + [ + lazy_fixture('catalog_memory'), + lazy_fixture('catalog_sqlite'), + ], +) +def test_create_namespace_if_not_exists(catalog: SqlCatalog, database_name: str) -> None: + catalog.create_namespace(database_name) + assert (database_name,) in catalog.list_namespaces() + catalog.create_namespace_if_not_exists(database_name) + assert (database_name,) in catalog.list_namespaces() + + @pytest.mark.parametrize( 'catalog', [ From b40378b62410a249223a20bade27371695a8531c Mon Sep 17 00:00:00 2001 From: Andre Luis Anastacio Date: Wed, 15 May 2024 04:15:34 -0300 Subject: [PATCH 35/80] Remove NoSuchNamespaceError on namespace creation (#726) --- 
pyiceberg/catalog/rest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/catalog/rest.py b/pyiceberg/catalog/rest.py index 565d809194..7259f9fa38 100644 --- a/pyiceberg/catalog/rest.py +++ b/pyiceberg/catalog/rest.py @@ -715,7 +715,7 @@ def create_namespace(self, namespace: Union[str, Identifier], properties: Proper try: response.raise_for_status() except HTTPError as exc: - self._handle_non_200_response(exc, {404: NoSuchNamespaceError, 409: NamespaceAlreadyExistsError}) + self._handle_non_200_response(exc, {409: NamespaceAlreadyExistsError}) @retry(**_RETRY_ARGS) def drop_namespace(self, namespace: Union[str, Identifier]) -> None: From ac84bd5bdca072bd11894cda767eeada9f25f554 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 15 May 2024 00:36:34 -0700 Subject: [PATCH 36/80] Build: Bump pyarrow from 16.0.0 to 16.1.0 (#743) Bumps [pyarrow](https://github.com/apache/arrow) from 16.0.0 to 16.1.0. - [Commits](https://github.com/apache/arrow/compare/go/v16.0.0...go/v16.1.0) --- updated-dependencies: - dependency-name: pyarrow dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 74 ++++++++++++++++++++++++++--------------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/poetry.lock b/poetry.lock index 5d5279d8bb..312c3d4884 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2940,47 +2940,47 @@ files = [ [[package]] name = "pyarrow" -version = "16.0.0" +version = "16.1.0" description = "Python library for Apache Arrow" optional = true python-versions = ">=3.8" files = [ - {file = "pyarrow-16.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:22a1fdb1254e5095d629e29cd1ea98ed04b4bbfd8e42cc670a6b639ccc208b60"}, - {file = "pyarrow-16.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:574a00260a4ed9d118a14770edbd440b848fcae5a3024128be9d0274dbcaf858"}, - {file = "pyarrow-16.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c0815d0ddb733b8c1b53a05827a91f1b8bde6240f3b20bf9ba5d650eb9b89cdf"}, - {file = "pyarrow-16.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df0080339387b5d30de31e0a149c0c11a827a10c82f0c67d9afae3981d1aabb7"}, - {file = "pyarrow-16.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:edf38cce0bf0dcf726e074159c60516447e4474904c0033f018c1f33d7dac6c5"}, - {file = "pyarrow-16.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:91d28f9a40f1264eab2af7905a4d95320ac2f287891e9c8b0035f264fe3c3a4b"}, - {file = "pyarrow-16.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:99af421ee451a78884d7faea23816c429e263bd3618b22d38e7992c9ce2a7ad9"}, - {file = "pyarrow-16.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:d22d0941e6c7bafddf5f4c0662e46f2075850f1c044bf1a03150dd9e189427ce"}, - {file = "pyarrow-16.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:266ddb7e823f03733c15adc8b5078db2df6980f9aa93d6bb57ece615df4e0ba7"}, - {file = "pyarrow-16.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cc23090224b6594f5a92d26ad47465af47c1d9c079dd4a0061ae39551889efe"}, - {file = "pyarrow-16.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56850a0afe9ef37249d5387355449c0f94d12ff7994af88f16803a26d38f2016"}, - {file = "pyarrow-16.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = 
"sha256:705db70d3e2293c2f6f8e84874b5b775f690465798f66e94bb2c07bab0a6bb55"}, - {file = "pyarrow-16.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:5448564754c154997bc09e95a44b81b9e31ae918a86c0fcb35c4aa4922756f55"}, - {file = "pyarrow-16.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:729f7b262aa620c9df8b9967db96c1575e4cfc8c25d078a06968e527b8d6ec05"}, - {file = "pyarrow-16.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:fb8065dbc0d051bf2ae2453af0484d99a43135cadabacf0af588a3be81fbbb9b"}, - {file = "pyarrow-16.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:20ce707d9aa390593ea93218b19d0eadab56390311cb87aad32c9a869b0e958c"}, - {file = "pyarrow-16.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5823275c8addbbb50cd4e6a6839952682a33255b447277e37a6f518d6972f4e1"}, - {file = "pyarrow-16.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ab8b9050752b16a8b53fcd9853bf07d8daf19093533e990085168f40c64d978"}, - {file = "pyarrow-16.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:42e56557bc7c5c10d3e42c3b32f6cff649a29d637e8f4e8b311d334cc4326730"}, - {file = "pyarrow-16.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:2a7abdee4a4a7cfa239e2e8d721224c4b34ffe69a0ca7981354fe03c1328789b"}, - {file = "pyarrow-16.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:ef2f309b68396bcc5a354106741d333494d6a0d3e1951271849787109f0229a6"}, - {file = "pyarrow-16.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:ed66e5217b4526fa3585b5e39b0b82f501b88a10d36bd0d2a4d8aa7b5a48e2df"}, - {file = "pyarrow-16.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cc8814310486f2a73c661ba8354540f17eef51e1b6dd090b93e3419d3a097b3a"}, - {file = "pyarrow-16.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c2f5e239db7ed43e0ad2baf46a6465f89c824cc703f38ef0fde927d8e0955f7"}, - {file = "pyarrow-16.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f293e92d1db251447cb028ae12f7bc47526e4649c3a9924c8376cab4ad6b98bd"}, - {file = "pyarrow-16.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:dd9334a07b6dc21afe0857aa31842365a62eca664e415a3f9536e3a8bb832c07"}, - {file = "pyarrow-16.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d91073d1e2fef2c121154680e2ba7e35ecf8d4969cc0af1fa6f14a8675858159"}, - {file = "pyarrow-16.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:71d52561cd7aefd22cf52538f262850b0cc9e4ec50af2aaa601da3a16ef48877"}, - {file = "pyarrow-16.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:b93c9a50b965ee0bf4fef65e53b758a7e8dcc0c2d86cebcc037aaaf1b306ecc0"}, - {file = "pyarrow-16.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d831690844706e374c455fba2fb8cfcb7b797bfe53ceda4b54334316e1ac4fa4"}, - {file = "pyarrow-16.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35692ce8ad0b8c666aa60f83950957096d92f2a9d8d7deda93fb835e6053307e"}, - {file = "pyarrow-16.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9dd3151d098e56f16a8389c1247137f9e4c22720b01c6f3aa6dec29a99b74d80"}, - {file = "pyarrow-16.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:bd40467bdb3cbaf2044ed7a6f7f251c8f941c8b31275aaaf88e746c4f3ca4a7a"}, - {file = "pyarrow-16.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:00a1dcb22ad4ceb8af87f7bd30cc3354788776c417f493089e0a0af981bc8d80"}, - {file = "pyarrow-16.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:fda9a7cebd1b1d46c97b511f60f73a5b766a6de4c5236f144f41a5d5afec1f35"}, - {file = 
"pyarrow-16.0.0.tar.gz", hash = "sha256:59bb1f1edbbf4114c72415f039f1359f1a57d166a331c3229788ccbfbb31689a"}, + {file = "pyarrow-16.1.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9"}, + {file = "pyarrow-16.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd"}, + {file = "pyarrow-16.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff"}, + {file = "pyarrow-16.1.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c"}, + {file = "pyarrow-16.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b"}, + {file = "pyarrow-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b"}, + {file = "pyarrow-16.1.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f"}, + {file = "pyarrow-16.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7"}, + {file = "pyarrow-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444"}, + {file = "pyarrow-16.1.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f"}, + {file = "pyarrow-16.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed"}, + {file = "pyarrow-16.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710"}, + {file = "pyarrow-16.1.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55"}, + {file = "pyarrow-16.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3"}, + {file = "pyarrow-16.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a"}, + {file = "pyarrow-16.1.0.tar.gz", hash = "sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315"}, ] [package.dependencies] From 20c273104257f0a1ccd74a09f6d4601643115ffd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 15 May 2024 00:36:41 -0700 Subject: [PATCH 37/80] Build: Bump mkdocstrings-python from 1.10.0 to 1.10.1 (#744) Bumps [mkdocstrings-python](https://github.com/mkdocstrings/python) from 1.10.0 to 1.10.1. - [Release notes](https://github.com/mkdocstrings/python/releases) - [Changelog](https://github.com/mkdocstrings/python/blob/main/CHANGELOG.md) - [Commits](https://github.com/mkdocstrings/python/compare/1.10.0...1.10.1) --- updated-dependencies: - dependency-name: mkdocstrings-python dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- mkdocs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt index 2710d51227..ecd695472d 100644 --- a/mkdocs/requirements.txt +++ b/mkdocs/requirements.txt @@ -19,7 +19,7 @@ mkdocs==1.6.0 griffe==0.45.0 jinja2==3.1.4 mkdocstrings==0.25.1 -mkdocstrings-python==1.10.0 +mkdocstrings-python==1.10.1 mkdocs-literate-nav==0.6.1 mkdocs-autorefs==1.0.1 mkdocs-gen-files==0.5.0 From 4fddcbe9b14cd73ac2672ea341c5d165efd2a64a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 21 May 2024 09:50:30 -0700 Subject: [PATCH 38/80] Build: Bump mkdocstrings-python from 1.10.1 to 1.10.2 (#746) Bumps [mkdocstrings-python](https://github.com/mkdocstrings/python) from 1.10.1 to 1.10.2. - [Release notes](https://github.com/mkdocstrings/python/releases) - [Changelog](https://github.com/mkdocstrings/python/blob/main/CHANGELOG.md) - [Commits](https://github.com/mkdocstrings/python/compare/1.10.1...1.10.2) --- updated-dependencies: - dependency-name: mkdocstrings-python dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- mkdocs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt index ecd695472d..a8d1239015 100644 --- a/mkdocs/requirements.txt +++ b/mkdocs/requirements.txt @@ -19,7 +19,7 @@ mkdocs==1.6.0 griffe==0.45.0 jinja2==3.1.4 mkdocstrings==0.25.1 -mkdocstrings-python==1.10.1 +mkdocstrings-python==1.10.2 mkdocs-literate-nav==0.6.1 mkdocs-autorefs==1.0.1 mkdocs-gen-files==0.5.0 From 0a58636d6320d5b517800be9dd5e46e6f82ee4a9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 21 May 2024 09:50:58 -0700 Subject: [PATCH 39/80] Build: Bump boto3 from 1.34.69 to 1.34.106 (#749) Bumps [boto3](https://github.com/boto/boto3) from 1.34.69 to 1.34.106. - [Release notes](https://github.com/boto/boto3/releases) - [Changelog](https://github.com/boto/boto3/blob/develop/CHANGELOG.rst) - [Commits](https://github.com/boto/boto3/compare/1.34.69...1.34.106) --- updated-dependencies: - dependency-name: boto3 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/poetry.lock b/poetry.lock index 312c3d4884..3de12c85c3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -25,24 +25,24 @@ tests = ["arrow", "dask[dataframe]", "docker", "pytest", "pytest-mock"] [[package]] name = "aiobotocore" -version = "2.12.3" +version = "2.13.0" description = "Async client for aws services using botocore and aiohttp" optional = true python-versions = ">=3.8" files = [ - {file = "aiobotocore-2.12.3-py3-none-any.whl", hash = "sha256:86737685f4625e8f05c4e7a608a07cc97607263279f66cf6b02b640c4eafd324"}, - {file = "aiobotocore-2.12.3.tar.gz", hash = "sha256:e2a2929207bc5d62eb556106c2224c1fd106d5c65be2eb69f15cc8c34c44c236"}, + {file = "aiobotocore-2.13.0-py3-none-any.whl", hash = "sha256:f812afc678d71b0038fd1ce712ff111ab7f47bab81ce5b4c7d222d4b83bc0cb2"}, + {file = "aiobotocore-2.13.0.tar.gz", hash = "sha256:4badf5cab6ad400216319d14278e2c99ad9b708e28a0f231605a412e632de401"}, ] [package.dependencies] -aiohttp = ">=3.7.4.post0,<4.0.0" +aiohttp = ">=3.9.2,<4.0.0" aioitertools = ">=0.5.1,<1.0.0" -botocore = ">=1.34.41,<1.34.70" +botocore = ">=1.34.70,<1.34.107" wrapt = ">=1.10.10,<2.0.0" [package.extras] -awscli = ["awscli (>=1.32.41,<1.32.70)"] -boto3 = ["boto3 (>=1.34.41,<1.34.70)"] +awscli = ["awscli (>=1.32.70,<1.32.107)"] +boto3 = ["boto3 (>=1.34.70,<1.34.107)"] [[package]] name = "aiohttp" @@ -343,17 +343,17 @@ files = [ [[package]] name = "boto3" -version = "1.34.69" +version = "1.34.106" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" files = [ - {file = "boto3-1.34.69-py3-none-any.whl", hash = "sha256:2e25ef6bd325217c2da329829478be063155897d8d3b29f31f7f23ab548519b1"}, - {file = "boto3-1.34.69.tar.gz", hash = "sha256:898a5fed26b1351352703421d1a8b886ef2a74be6c97d5ecc92432ae01fda203"}, + {file = "boto3-1.34.106-py3-none-any.whl", hash = "sha256:d3be4e1dd5d546a001cd4da805816934cbde9d395316546e9411fec341ade5cf"}, + {file = "boto3-1.34.106.tar.gz", hash = "sha256:6165b8cf1c7e625628ab28b32f9027064c8f5e5fca1c38d7fc228cd22069a19f"}, ] [package.dependencies] -botocore = ">=1.34.69,<1.35.0" +botocore = ">=1.34.106,<1.35.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.10.0,<0.11.0" @@ -362,13 +362,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.34.69" +version = "1.34.106" description = "Low-level, data-driven core of boto 3." 
optional = false python-versions = ">=3.8" files = [ - {file = "botocore-1.34.69-py3-none-any.whl", hash = "sha256:d3802d076d4d507bf506f9845a6970ce43adc3d819dd57c2791f5c19ed6e5950"}, - {file = "botocore-1.34.69.tar.gz", hash = "sha256:d1ab2bff3c2fd51719c2021d9fa2f30fbb9ed0a308f69e9a774ac92c8091380a"}, + {file = "botocore-1.34.106-py3-none-any.whl", hash = "sha256:4baf0e27c2dfc4f4d0dee7c217c716e0782f9b30e8e1fff983fce237d88f73ae"}, + {file = "botocore-1.34.106.tar.gz", hash = "sha256:921fa5202f88c3e58fdcb4b3acffd56d65b24bca47092ee4b27aa988556c0be6"}, ] [package.dependencies] @@ -380,7 +380,7 @@ urllib3 = [ ] [package.extras] -crt = ["awscrt (==0.19.19)"] +crt = ["awscrt (==0.20.9)"] [[package]] name = "build" From c764d6a4cc79f901c6dcbde033b12e6e332a0e9f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 21 May 2024 09:51:47 -0700 Subject: [PATCH 40/80] --- (#754) updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- mkdocs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt index a8d1239015..ccf518ba9b 100644 --- a/mkdocs/requirements.txt +++ b/mkdocs/requirements.txt @@ -23,6 +23,6 @@ mkdocstrings-python==1.10.2 mkdocs-literate-nav==0.6.1 mkdocs-autorefs==1.0.1 mkdocs-gen-files==0.5.0 -mkdocs-material==9.5.22 +mkdocs-material==9.5.24 mkdocs-material-extensions==1.3.1 mkdocs-section-index==0.3.9 From 245ab876d5e8711607fb726d85637ffc361387a0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 21 May 2024 09:52:13 -0700 Subject: [PATCH 41/80] --- (#755) updated-dependencies: - dependency-name: requests dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 3de12c85c3..3c075152af 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3584,13 +3584,13 @@ files = [ [[package]] name = "requests" -version = "2.31.0" +version = "2.32.1" description = "Python HTTP for Humans." optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, - {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, + {file = "requests-2.32.1-py3-none-any.whl", hash = "sha256:21ac9465cdf8c1650fe1ecde8a71669a93d4e6f147550483a2967d08396a56a5"}, + {file = "requests-2.32.1.tar.gz", hash = "sha256:eb97e87e64c79e64e5b8ac75cee9dd1f97f49e289b083ee6be96268930725685"}, ] [package.dependencies] From 82df57ea8accd855ba95f4d601b3c0c11f6f7021 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 21 May 2024 09:52:35 -0700 Subject: [PATCH 42/80] --- (#756) updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/python-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-release.yml b/.github/workflows/python-release.yml index 37f28a76c5..b8d9b5dae3 100644 --- a/.github/workflows/python-release.yml +++ b/.github/workflows/python-release.yml @@ -59,7 +59,7 @@ jobs: if: startsWith(matrix.os, 'ubuntu') - name: Build wheels - uses: pypa/cibuildwheel@v2.18.0 + uses: pypa/cibuildwheel@v2.18.1 with: output-dir: wheelhouse config-file: "pyproject.toml" From aa5a1366ec5ba4ef27cf2547cb90b6cc1dddf4df Mon Sep 17 00:00:00 2001 From: Mehul Batra <66407733+MehulBatra@users.noreply.github.com> Date: Thu, 23 May 2024 02:54:13 +0530 Subject: [PATCH 43/80] [FEAT]register table using iceberg metadata file via pyiceberg (#711) --- pyiceberg/catalog/glue.py | 9 ++++++++- tests/catalog/integration_test_glue.py | 16 ++++++++++++++++ tests/catalog/test_glue.py | 14 ++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index 275cda7ed0..8819c2e266 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -417,7 +417,14 @@ def register_table(self, identifier: Union[str, Identifier], metadata_location: Raises: TableAlreadyExistsError: If the table already exists """ - raise NotImplementedError + database_name, table_name = self.identifier_to_database_and_table(identifier) + properties = EMPTY_DICT + io = self._load_file_io(location=metadata_location) + file = io.new_input(metadata_location) + metadata = FromInputFile.table_metadata(file) + table_input = _construct_table_input(table_name, metadata_location, properties, metadata) + self._create_glue_table(database_name=database_name, table_name=table_name, table_input=table_input) + return self.load_table(identifier=identifier) def _commit_table(self, table_request: CommitTableRequest) -> CommitTableResponse: """Update the table. 
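Editorial note on the Glue `register_table` implementation above: it reads the given metadata file and creates a Glue table entry pointing at it, without rewriting any data. A minimal sketch of the intended call pattern — the catalog properties, namespace, table name, and metadata path are placeholders, and in practice the catalog would usually come from `load_catalog`:

```python
from pyiceberg.catalog.glue import GlueCatalog

# Placeholder configuration; credentials come from the usual boto3 resolution chain.
catalog = GlueCatalog("glue", **{"warehouse": "s3://my-bucket"})

# Point a new Glue table entry at an existing Iceberg metadata file.
table = catalog.register_table(
    ("analytics", "events"),
    "s3://my-bucket/analytics.db/events/metadata/00001-abc.metadata.json",
)
print(table.metadata_location)
```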
diff --git a/tests/catalog/integration_test_glue.py b/tests/catalog/integration_test_glue.py index 5b4aa58787..ee43779073 100644 --- a/tests/catalog/integration_test_glue.py +++ b/tests/catalog/integration_test_glue.py @@ -570,3 +570,19 @@ def test_table_exists(test_catalog: Catalog, table_schema_nested: Schema, table_ test_catalog.create_namespace(database_name) test_catalog.create_table((database_name, table_name), table_schema_nested) assert test_catalog.table_exists((database_name, table_name)) is True + + +def test_register_table_with_given_location( + test_catalog: Catalog, table_schema_nested: Schema, table_name: str, database_name: str +) -> None: + identifier = (database_name, table_name) + new_identifier = (database_name, f"new_{table_name}") + test_catalog.create_namespace(database_name) + tbl = test_catalog.create_table(identifier, table_schema_nested) + location = tbl.metadata_location + test_catalog.drop_table(identifier) # drops the table but keeps the metadata file + assert not test_catalog.table_exists(identifier) + table = test_catalog.register_table(new_identifier, location) + assert table.identifier == (CATALOG_NAME,) + new_identifier + assert table.metadata_location == location + assert test_catalog.table_exists(new_identifier) diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 5b67b92c68..1aea46d6ef 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -848,3 +848,17 @@ def test_table_exists( assert test_catalog.table_exists(identifier) is True # Act and Assert for a non-existing table assert test_catalog.table_exists(('non', 'exist')) is False + + +@mock_aws +def test_register_table_with_given_location( + _bucket_initialize: None, moto_endpoint_url: str, metadata_location: str, database_name: str, table_name: str +) -> None: + catalog_name = "glue" + identifier = (database_name, table_name) + location = metadata_location + test_catalog = GlueCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}"}) + test_catalog.create_namespace(namespace=database_name, properties={"location": f"s3://{BUCKET_NAME}/{database_name}.db"}) + table = test_catalog.register_table(identifier, location) + assert table.identifier == (catalog_name,) + identifier + assert test_catalog.table_exists(identifier) is True From 5537cb4394b580b0f8eb78c3c7c549fb863b0e99 Mon Sep 17 00:00:00 2001 From: SeungyeopShin <109323024+SeungyeopShin@users.noreply.github.com> Date: Thu, 23 May 2024 14:20:42 +0900 Subject: [PATCH 44/80] modify doc(backward compatibility) typo (#757) --- mkdocs/docs/configuration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 1ca071f009..c0879b1d28 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -298,4 +298,4 @@ PyIceberg uses multiple threads to parallelize operations. The number of workers # Backward Compatibility -Previous versions of Java (`<1.4.0`) implementations incorrectly assume the optional attribute `current-snapshot-id` to be a required attribute in TableMetadata. This means that if `current-snapshot-id` is missing in the metadata file (e.g. on table creation), the application will throw an exception without being able to load the table. This assumption has been corrected in more recent Iceberg versions. However, it is possible to force PyIceberg to create a table with a metadata file that will be compatible with previous versions. 
This can be configured by setting the `legacy-current-snapshot-id` entry as "True" in the configuration file, or by setting the `LEGACY_CURRENT_SNAPSHOT_ID` environment variable. Refer to the [PR discussion](https://github.com/apache/iceberg-python/pull/473) for more details on the issue +Previous versions of Java (`<1.4.0`) implementations incorrectly assume the optional attribute `current-snapshot-id` to be a required attribute in TableMetadata. This means that if `current-snapshot-id` is missing in the metadata file (e.g. on table creation), the application will throw an exception without being able to load the table. This assumption has been corrected in more recent Iceberg versions. However, it is possible to force PyIceberg to create a table with a metadata file that will be compatible with previous versions. This can be configured by setting the `legacy-current-snapshot-id` entry as "True" in the configuration file, or by setting the `PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID` environment variable. Refer to the [PR discussion](https://github.com/apache/iceberg-python/pull/473) for more details on the issue From e91766062cdff3ccbce068de115e6a301bae6730 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 23 May 2024 09:19:39 +0200 Subject: [PATCH 45/80] Bump requests from 2.32.1 to 2.32.2 (#759) updated-dependencies: - dependency-name: requests dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 3c075152af..c480c4a0d8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3584,13 +3584,13 @@ files = [ [[package]] name = "requests" -version = "2.32.1" +version = "2.32.2" description = "Python HTTP for Humans." optional = false python-versions = ">=3.8" files = [ - {file = "requests-2.32.1-py3-none-any.whl", hash = "sha256:21ac9465cdf8c1650fe1ecde8a71669a93d4e6f147550483a2967d08396a56a5"}, - {file = "requests-2.32.1.tar.gz", hash = "sha256:eb97e87e64c79e64e5b8ac75cee9dd1f97f49e289b083ee6be96268930725685"}, + {file = "requests-2.32.2-py3-none-any.whl", hash = "sha256:fc06670dd0ed212426dfeb94fc1b983d917c4f9847c863f313c9dfaaffb7c23c"}, + {file = "requests-2.32.2.tar.gz", hash = "sha256:dd951ff5ecf3e3b3aa26b40703ba77495dab41da839ae72ef3c8e5d8e2433289"}, ] [package.dependencies] From 7083b2e01d1259e75f77a8efd466d1291f5a352f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 23 May 2024 09:19:57 +0200 Subject: [PATCH 46/80] Bump griffe from 0.45.0 to 0.45.1 (#760) updated-dependencies: - dependency-name: griffe dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- mkdocs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt index ccf518ba9b..386f5b2301 100644 --- a/mkdocs/requirements.txt +++ b/mkdocs/requirements.txt @@ -16,7 +16,7 @@ # under the License. 
mkdocs==1.6.0 -griffe==0.45.0 +griffe==0.45.1 jinja2==3.1.4 mkdocstrings==0.25.1 mkdocstrings-python==1.10.2 From 03a0d65ac05d556d0815e61a016effc2b8993702 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 23 May 2024 09:20:14 +0200 Subject: [PATCH 47/80] Bump mypy-boto3-glue from 1.34.88 to 1.34.110 (#761) updated-dependencies: - dependency-name: mypy-boto3-glue dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index c480c4a0d8..d19640868c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2490,13 +2490,13 @@ files = [ [[package]] name = "mypy-boto3-glue" -version = "1.34.88" -description = "Type annotations for boto3.Glue 1.34.88 service generated with mypy-boto3-builder 7.23.2" +version = "1.34.110" +description = "Type annotations for boto3.Glue 1.34.110 service generated with mypy-boto3-builder 7.24.0" optional = true python-versions = ">=3.8" files = [ - {file = "mypy_boto3_glue-1.34.88-py3-none-any.whl", hash = "sha256:bb5c4ac3ac4806fb19ff3bebe2400635cf0d959e4a086a3de36b0eccbf04febc"}, - {file = "mypy_boto3_glue-1.34.88.tar.gz", hash = "sha256:7626368b66c92236f57008bf56303f3eda1ef2705ffe0d2cd845b1b877eb0596"}, + {file = "mypy_boto3_glue-1.34.110-py3-none-any.whl", hash = "sha256:795eca329426bf1ae3dc95090cccafcd7b3d91c4c594dac4db1fd9d6c72390c9"}, + {file = "mypy_boto3_glue-1.34.110.tar.gz", hash = "sha256:80d39849ac10ad9d57d85b94016fce8caba2cb70a3544b5b8b9bf0713ab3a041"}, ] [package.dependencies] From 996afd0c44717d6ac345b8419bf01b25be2d6051 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 23 May 2024 10:22:54 +0200 Subject: [PATCH 48/80] Bump mkdocstrings-python from 1.10.2 to 1.10.3 (#762) Bumps [mkdocstrings-python](https://github.com/mkdocstrings/python) from 1.10.2 to 1.10.3. - [Release notes](https://github.com/mkdocstrings/python/releases) - [Changelog](https://github.com/mkdocstrings/python/blob/main/CHANGELOG.md) - [Commits](https://github.com/mkdocstrings/python/compare/1.10.2...1.10.3) --- updated-dependencies: - dependency-name: mkdocstrings-python dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- mkdocs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt index 386f5b2301..5d375f50f3 100644 --- a/mkdocs/requirements.txt +++ b/mkdocs/requirements.txt @@ -19,7 +19,7 @@ mkdocs==1.6.0 griffe==0.45.1 jinja2==3.1.4 mkdocstrings==0.25.1 -mkdocstrings-python==1.10.2 +mkdocstrings-python==1.10.3 mkdocs-literate-nav==0.6.1 mkdocs-autorefs==1.0.1 mkdocs-gen-files==0.5.0 From eba4beeff046dd92d234fe7779fdbe76d61bd1bf Mon Sep 17 00:00:00 2001 From: Drew Gallardo Date: Thu, 23 May 2024 02:39:26 -0700 Subject: [PATCH 49/80] Initial implementation of the manifest table (#717) --- mkdocs/docs/api.md | 50 ++++++++++++++ pyiceberg/table/__init__.py | 89 +++++++++++++++++++++++++ tests/integration/test_inspect_table.py | 83 +++++++++++++++++++++++ 3 files changed, 222 insertions(+) diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md index 0bc23fb0dc..70b5fd62eb 100644 --- a/mkdocs/docs/api.md +++ b/mkdocs/docs/api.md @@ -606,6 +606,56 @@ min_snapshots_to_keep: [[null,10]] max_snapshot_age_in_ms: [[null,604800000]] ``` +### Manifests + +To show a table's current file manifests: + +```python +table.inspect.manifests() +``` + +``` +pyarrow.Table +content: int8 not null +path: string not null +length: int64 not null +partition_spec_id: int32 not null +added_snapshot_id: int64 not null +added_data_files_count: int32 not null +existing_data_files_count: int32 not null +deleted_data_files_count: int32 not null +added_delete_files_count: int32 not null +existing_delete_files_count: int32 not null +deleted_delete_files_count: int32 not null +partition_summaries: list> not null + child 0, item: struct + child 0, contains_null: bool not null + child 1, contains_nan: bool + child 2, lower_bound: string + child 3, upper_bound: string +---- +content: [[0]] +path: [["s3://warehouse/default/table_metadata_manifests/metadata/3bf5b4c6-a7a4-4b43-a6ce-ca2b4887945a-m0.avro"]] +length: [[6886]] +partition_spec_id: [[0]] +added_snapshot_id: [[3815834705531553721]] +added_data_files_count: [[1]] +existing_data_files_count: [[0]] +deleted_data_files_count: [[0]] +added_delete_files_count: [[0]] +existing_delete_files_count: [[0]] +deleted_delete_files_count: [[0]] +partition_summaries: [[ -- is_valid: all not null + -- child 0 type: bool +[false] + -- child 1 type: bool +[false] + -- child 2 type: string +["test"] + -- child 3 type: string +["test"]]] +``` + ## Add Files Expert Iceberg users may choose to commit existing parquet files to the Iceberg table as data files, without rewriting them. 
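Editorial note on the `inspect.manifests()` documentation above: because the metadata table is returned as a `pyarrow.Table`, it can be handed straight to pandas (or DuckDB) for ad-hoc checks. A small sketch, assuming `table` is an already-loaded Iceberg table and pandas is installed:

```python
# Summarize manifest sizes and added-file counts with pandas.
manifests = table.inspect.manifests().to_pandas()
print(manifests[["path", "length", "added_data_files_count"]])
```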
diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index c57f0d1297..74b0225dbe 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -71,6 +71,7 @@ ManifestEntry, ManifestEntryStatus, ManifestFile, + PartitionFieldSummary, write_manifest, write_manifest_list, ) @@ -3547,6 +3548,94 @@ def update_partitions_map( schema=table_schema, ) + def manifests(self) -> "pa.Table": + import pyarrow as pa + + from pyiceberg.conversions import from_bytes + + partition_summary_schema = pa.struct([ + pa.field("contains_null", pa.bool_(), nullable=False), + pa.field("contains_nan", pa.bool_(), nullable=True), + pa.field("lower_bound", pa.string(), nullable=True), + pa.field("upper_bound", pa.string(), nullable=True), + ]) + + manifest_schema = pa.schema([ + pa.field('content', pa.int8(), nullable=False), + pa.field('path', pa.string(), nullable=False), + pa.field('length', pa.int64(), nullable=False), + pa.field('partition_spec_id', pa.int32(), nullable=False), + pa.field('added_snapshot_id', pa.int64(), nullable=False), + pa.field('added_data_files_count', pa.int32(), nullable=False), + pa.field('existing_data_files_count', pa.int32(), nullable=False), + pa.field('deleted_data_files_count', pa.int32(), nullable=False), + pa.field('added_delete_files_count', pa.int32(), nullable=False), + pa.field('existing_delete_files_count', pa.int32(), nullable=False), + pa.field('deleted_delete_files_count', pa.int32(), nullable=False), + pa.field('partition_summaries', pa.list_(partition_summary_schema), nullable=False), + ]) + + def _partition_summaries_to_rows( + spec: PartitionSpec, partition_summaries: List[PartitionFieldSummary] + ) -> List[Dict[str, Any]]: + rows = [] + for i, field_summary in enumerate(partition_summaries): + field = spec.fields[i] + partition_field_type = spec.partition_type(self.tbl.schema()).fields[i].field_type + lower_bound = ( + ( + field.transform.to_human_string( + partition_field_type, from_bytes(partition_field_type, field_summary.lower_bound) + ) + ) + if field_summary.lower_bound + else None + ) + upper_bound = ( + ( + field.transform.to_human_string( + partition_field_type, from_bytes(partition_field_type, field_summary.upper_bound) + ) + ) + if field_summary.upper_bound + else None + ) + rows.append({ + 'contains_null': field_summary.contains_null, + 'contains_nan': field_summary.contains_nan, + 'lower_bound': lower_bound, + 'upper_bound': upper_bound, + }) + return rows + + specs = self.tbl.metadata.specs() + manifests = [] + if snapshot := self.tbl.metadata.current_snapshot(): + for manifest in snapshot.manifests(self.tbl.io): + is_data_file = manifest.content == ManifestContent.DATA + is_delete_file = manifest.content == ManifestContent.DELETES + manifests.append({ + 'content': manifest.content, + 'path': manifest.manifest_path, + 'length': manifest.manifest_length, + 'partition_spec_id': manifest.partition_spec_id, + 'added_snapshot_id': manifest.added_snapshot_id, + 'added_data_files_count': manifest.added_files_count if is_data_file else 0, + 'existing_data_files_count': manifest.existing_files_count if is_data_file else 0, + 'deleted_data_files_count': manifest.deleted_files_count if is_data_file else 0, + 'added_delete_files_count': manifest.added_files_count if is_delete_file else 0, + 'existing_delete_files_count': manifest.existing_files_count if is_delete_file else 0, + 'deleted_delete_files_count': manifest.deleted_files_count if is_delete_file else 0, + 'partition_summaries': 
_partition_summaries_to_rows(specs[manifest.partition_spec_id], manifest.partitions) + if manifest.partitions + else [], + }) + + return pa.Table.from_pylist( + manifests, + schema=manifest_schema, + ) + @dataclass(frozen=True) class TablePartition: diff --git a/tests/integration/test_inspect_table.py b/tests/integration/test_inspect_table.py index a884f9d4c0..8665435e43 100644 --- a/tests/integration/test_inspect_table.py +++ b/tests/integration/test_inspect_table.py @@ -445,3 +445,86 @@ def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> Non df = tbl.inspect.partitions(snapshot_id=snapshot.snapshot_id) spark_df = spark.sql(f"SELECT * FROM {identifier}.partitions VERSION AS OF {snapshot.snapshot_id}") check_pyiceberg_df_equals_spark_df(df, spark_df) + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_inspect_manifests(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None: + identifier = "default.table_metadata_manifests" + try: + session_catalog.drop_table(identifier=identifier) + except NoSuchTableError: + pass + + spark.sql( + f""" + CREATE TABLE {identifier} ( + id int, + data string + ) + PARTITIONED BY (data) + """ + ) + + spark.sql( + f""" + INSERT INTO {identifier} VALUES (1, "a") + """ + ) + + spark.sql( + f""" + INSERT INTO {identifier} VALUES (2, "b") + """ + ) + + df = session_catalog.load_table(identifier).inspect.manifests() + + assert df.column_names == [ + 'content', + 'path', + 'length', + 'partition_spec_id', + 'added_snapshot_id', + 'added_data_files_count', + 'existing_data_files_count', + 'deleted_data_files_count', + 'added_delete_files_count', + 'existing_delete_files_count', + 'deleted_delete_files_count', + 'partition_summaries', + ] + + int_cols = [ + 'content', + 'length', + 'partition_spec_id', + 'added_snapshot_id', + 'added_data_files_count', + 'existing_data_files_count', + 'deleted_data_files_count', + 'added_delete_files_count', + 'existing_delete_files_count', + 'deleted_delete_files_count', + ] + + for column in int_cols: + for value in df[column]: + assert isinstance(value.as_py(), int) + + for value in df["path"]: + assert isinstance(value.as_py(), str) + + for value in df["partition_summaries"]: + assert isinstance(value.as_py(), list) + for row in value: + assert isinstance(row["contains_null"].as_py(), bool) + assert isinstance(row["contains_nan"].as_py(), (bool, type(None))) + assert isinstance(row["lower_bound"].as_py(), (str, type(None))) + assert isinstance(row["upper_bound"].as_py(), (str, type(None))) + + lhs = spark.table(f"{identifier}.manifests").toPandas() + rhs = df.to_pandas() + for column in df.column_names: + for left, right in zip(lhs[column].to_list(), rhs[column].to_list()): + assert left == right, f"Difference in column {column}: {left} != {right}" From 42afc439d362ef1b3dcff03a1ffd959bc0a399ca Mon Sep 17 00:00:00 2001 From: Christian Date: Thu, 23 May 2024 11:41:10 +0200 Subject: [PATCH 50/80] Fix: Table-Exists if Server returns 204 (#739) * Fix: Table-Exists if Server returns 204 * Add test for table exist 204 return code --- pyiceberg/catalog/rest.py | 2 +- tests/catalog/test_rest.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/rest.py b/pyiceberg/catalog/rest.py index 7259f9fa38..afd5818662 100644 --- a/pyiceberg/catalog/rest.py +++ b/pyiceberg/catalog/rest.py @@ -790,4 +790,4 @@ def table_exists(self, identifier: Union[str, Identifier]) -> bool: response = self._session.head( 
self.url(Endpoints.load_table, prefixed=True, **self._split_identifier_for_path(identifier_tuple)) ) - return response.status_code == 200 + return response.status_code in (200, 204) diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index ec5a6a22a4..b5c626d6f0 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -691,6 +691,16 @@ def test_table_exist_200(rest_mock: Mocker) -> None: assert catalog.table_exists(("fokko", "table")) +def test_table_exist_204(rest_mock: Mocker) -> None: + rest_mock.head( + f"{TEST_URI}v1/namespaces/fokko/tables/table", + status_code=204, + request_headers=TEST_HEADERS, + ) + catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) + assert catalog.table_exists(("fokko", "table")) + + def test_table_exist_500(rest_mock: Mocker) -> None: rest_mock.head( f"{TEST_URI}v1/namespaces/fokko/tables/table", From 959718a5ede2bcfae5ccf7e54857bab5736f5aea Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 24 May 2024 23:24:23 -0700 Subject: [PATCH 51/80] Bump duckdb from 0.10.2 to 0.10.3 (#764) Bumps [duckdb](https://github.com/duckdb/duckdb) from 0.10.2 to 0.10.3. - [Release notes](https://github.com/duckdb/duckdb/releases) - [Changelog](https://github.com/duckdb/duckdb/blob/main/tools/release-pip.py) - [Commits](https://github.com/duckdb/duckdb/compare/v0.10.2...v0.10.3) --- updated-dependencies: - dependency-name: duckdb dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 96 ++++++++++++++++++++++++++--------------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/poetry.lock b/poetry.lock index d19640868c..218913620b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1030,58 +1030,58 @@ files = [ [[package]] name = "duckdb" -version = "0.10.2" +version = "0.10.3" description = "DuckDB in-process database" optional = true python-versions = ">=3.7.0" files = [ - {file = "duckdb-0.10.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3891d3ac03e12a3e5c43afa3020fe701f64060f52d25f429a1ed7b5d914368d3"}, - {file = "duckdb-0.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4f63877651f1fb940e049dc53038eb763856616319acf4f892b1c3ed074f5ab0"}, - {file = "duckdb-0.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:06e3a36f04f4d98d2c0bbdd63e517cfbe114a795306e26ec855e62e076af5043"}, - {file = "duckdb-0.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf5f95ad5b75c8e65c6508b4df02043dd0b9d97712b9a33236ad77c388ce7861"}, - {file = "duckdb-0.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ff62bc98278c98fecbd6eecec5d698ad41ebd654110feaadbf8ac8bb59b1ecf"}, - {file = "duckdb-0.10.2-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cceede13fde095c23cf9a53adf7c414c7bfb21b9a7aa6a4836014fdbecbfca70"}, - {file = "duckdb-0.10.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:acdfff60b7efccd7f731213a9795851256249dfacf80367074b2b2e144f716dd"}, - {file = "duckdb-0.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:4a5d5655cf0bdaf664a6f332afe465e02b08cef715548a0983bb7aef48da06a6"}, - {file = "duckdb-0.10.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a9d15842876d18763e085648656cccc7660a215d16254906db5c4471be2c7732"}, - {file = "duckdb-0.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:c88cdcdc8452c910e4298223e7d9fca291534ff5aa36090aa49c9e6557550b13"}, - {file = "duckdb-0.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:364cd6f5dc8a1010d144d08c410ba9a74c521336ee5bda84fabc6616216a6d6a"}, - {file = "duckdb-0.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4c57c11d1060296f5e9ebfb5bb7e5521e0d77912e8f9ff43c90240c3311e9de9"}, - {file = "duckdb-0.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:186d86b8dda8e1076170eb770bb2bb73ea88ca907d92885c9695d6515207b205"}, - {file = "duckdb-0.10.2-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f65b62f31c6bff21afc0261cfe28d238b8f34ec78f339546b12f4740c39552a"}, - {file = "duckdb-0.10.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a860d7466a5c93714cdd94559ce9e1db2ab91914f0941c25e5e93d4ebe36a5fa"}, - {file = "duckdb-0.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:33308190e9c7f05a3a0a2d46008a043effd4eae77011869d7c18fb37acdd9215"}, - {file = "duckdb-0.10.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:3a8b2f1229b4aecb79cd28ffdb99032b1497f0a805d0da1136a9b6115e1afc70"}, - {file = "duckdb-0.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d23a6dea61963733a0f45a0d0bbb1361fb2a47410ed5ff308b4a1f869d4eeb6f"}, - {file = "duckdb-0.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:20ee0aa27e688aa52a40b434ec41a50431d0b06edeab88edc2feaca18d82c62c"}, - {file = "duckdb-0.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:80a6d43d9044f0997a15a92e0c0ff3afd21151a1e572a92f439cc4f56b7090e1"}, - {file = "duckdb-0.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6934758cacd06029a5c9f54556a43bd277a86757e22bf8d0dd11ca15c1813d1c"}, - {file = "duckdb-0.10.2-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7a11e2d68bd79044eea5486b1cddb5b915115f537e5c74eeb94c768ce30f9f4b"}, - {file = "duckdb-0.10.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0bf58385c43b8e448a2fea7e8729054934bf73ea616d1d7ef8184eda07f975e2"}, - {file = "duckdb-0.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:eae75c7014597ded6e7f6dc51e32d48362a31608acd73e9f795748ee94335a54"}, - {file = "duckdb-0.10.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:62e89deff778a7a86f651802b947a3466425f6cce41e9d7d412d39e492932943"}, - {file = "duckdb-0.10.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f87e555fd36ec6da316b727a39fb24c53124a797dfa9b451bdea87b2f20a351f"}, - {file = "duckdb-0.10.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41e8b34b1a944590ebcf82f8cc59d67b084fe99479f048892d60da6c1402c386"}, - {file = "duckdb-0.10.2-cp37-cp37m-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2c68c6dde2773774cf2371522a3959ea2716fc2b3a4891d4066f0e426455fe19"}, - {file = "duckdb-0.10.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ff6a8a0980d0f9398fa461deffa59465dac190d707468478011ea8a5fe1f2c81"}, - {file = "duckdb-0.10.2-cp37-cp37m-win_amd64.whl", hash = "sha256:728dd4ff0efda387a424754e5508d4f8c72a272c2d3ccb036a83286f60b46002"}, - {file = "duckdb-0.10.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c461d6b4619e80170044a9eb999bbf4097e330d3a4974ced0a7eaeb79c7c39f6"}, - {file = "duckdb-0.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:909351ff72eb3b50b89761251148d8a186594d8a438e12dcf5494794caff6693"}, - {file = "duckdb-0.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:d9eeb8393d69abafd355b869669957eb85b89e4df677e420b9ef0693b7aa6cb4"}, - {file = "duckdb-0.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3102bcf5011e8f82ea3c2bde43108774fe5a283a410d292c0843610ea13e2237"}, - {file = "duckdb-0.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d64d443613e5f16caf7d67102733538c90f7715867c1a98597efd3babca068e3"}, - {file = "duckdb-0.10.2-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cb31398826d1b7473344e5ee8e0f826370c9752549469ba1327042ace9041f80"}, - {file = "duckdb-0.10.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d09dcec467cd6127d5cc1fb0ce4efbd77e761882d9d772b0f64fc2f79a2a1cde"}, - {file = "duckdb-0.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:82fab1a24faf7c33d8a7afed08b57ee36e8821a3a68a2f1574cd238ea440bba0"}, - {file = "duckdb-0.10.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:38607e6e6618e8ea28c8d9b67aa9e22cfd6d6d673f2e8ab328bd6e867b697f69"}, - {file = "duckdb-0.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fb0c23bc8c09615bff38aebcf8e92e6ae74959c67b3c9e5b00edddc730bf22be"}, - {file = "duckdb-0.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:00576c11c78c83830ab483bad968e07cd9b5f730e7ffaf5aa5fadee5ac4f71e9"}, - {file = "duckdb-0.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:077db692cdda50c4684ef87dc2a68507665804caa90e539dbe819116bda722ad"}, - {file = "duckdb-0.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca25984ad9f9a04e46e8359f852668c11569534e3bb8424b80be711303ad2314"}, - {file = "duckdb-0.10.2-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6a72cc40982c7b92cf555e574618fc711033b013bf258b611ba18d7654c89d8c"}, - {file = "duckdb-0.10.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d27b9efd6e788eb561535fdc0cbc7c74aca1ff39f748b7cfc27aa49b00e22da1"}, - {file = "duckdb-0.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:4800469489bc262dda61a7f1d40acedf67cf2454874e9d8bbf07920dc2b147e6"}, - {file = "duckdb-0.10.2.tar.gz", hash = "sha256:0f609c9d5f941f1ecde810f010dd9321cd406a552c1df20318a13fa64247f67f"}, + {file = "duckdb-0.10.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd25cc8d001c09a19340739ba59d33e12a81ab285b7a6bed37169655e1cefb31"}, + {file = "duckdb-0.10.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2f9259c637b917ca0f4c63887e8d9b35ec248f5d987c886dfc4229d66a791009"}, + {file = "duckdb-0.10.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b48f5f1542f1e4b184e6b4fc188f497be8b9c48127867e7d9a5f4a3e334f88b0"}, + {file = "duckdb-0.10.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e327f7a3951ea154bb56e3fef7da889e790bd9a67ca3c36afc1beb17d3feb6d6"}, + {file = "duckdb-0.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d8b20ed67da004b4481973f4254fd79a0e5af957d2382eac8624b5c527ec48c"}, + {file = "duckdb-0.10.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d37680b8d7be04e4709db3a66c8b3eb7ceba2a5276574903528632f2b2cc2e60"}, + {file = "duckdb-0.10.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3d34b86d6a2a6dfe8bb757f90bfe7101a3bd9e3022bf19dbddfa4b32680d26a9"}, + {file = "duckdb-0.10.3-cp310-cp310-win_amd64.whl", hash = "sha256:73b1cb283ca0f6576dc18183fd315b4e487a545667ffebbf50b08eb4e8cdc143"}, + {file = "duckdb-0.10.3-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:d917dde19fcec8cadcbef1f23946e85dee626ddc133e1e3f6551f15a61a03c61"}, + {file = "duckdb-0.10.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:46757e0cf5f44b4cb820c48a34f339a9ccf83b43d525d44947273a585a4ed822"}, + {file = "duckdb-0.10.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:338c14d8ac53ac4aa9ec03b6f1325ecfe609ceeb72565124d489cb07f8a1e4eb"}, + {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:651fcb429602b79a3cf76b662a39e93e9c3e6650f7018258f4af344c816dab72"}, + {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3ae3c73b98b6215dab93cc9bc936b94aed55b53c34ba01dec863c5cab9f8e25"}, + {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56429b2cfe70e367fb818c2be19f59ce2f6b080c8382c4d10b4f90ba81f774e9"}, + {file = "duckdb-0.10.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b46c02c2e39e3676b1bb0dc7720b8aa953734de4fd1b762e6d7375fbeb1b63af"}, + {file = "duckdb-0.10.3-cp311-cp311-win_amd64.whl", hash = "sha256:bcd460feef56575af2c2443d7394d405a164c409e9794a4d94cb5fdaa24a0ba4"}, + {file = "duckdb-0.10.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:e229a7c6361afbb0d0ab29b1b398c10921263c52957aefe3ace99b0426fdb91e"}, + {file = "duckdb-0.10.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:732b1d3b6b17bf2f32ea696b9afc9e033493c5a3b783c292ca4b0ee7cc7b0e66"}, + {file = "duckdb-0.10.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f5380d4db11fec5021389fb85d614680dc12757ef7c5881262742250e0b58c75"}, + {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:468a4e0c0b13c55f84972b1110060d1b0f854ffeb5900a178a775259ec1562db"}, + {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fa1e7ff8d18d71defa84e79f5c86aa25d3be80d7cb7bc259a322de6d7cc72da"}, + {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ed1063ed97c02e9cf2e7fd1d280de2d1e243d72268330f45344c69c7ce438a01"}, + {file = "duckdb-0.10.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:22f2aad5bb49c007f3bfcd3e81fdedbc16a2ae41f2915fc278724ca494128b0c"}, + {file = "duckdb-0.10.3-cp312-cp312-win_amd64.whl", hash = "sha256:8f9e2bb00a048eb70b73a494bdc868ce7549b342f7ffec88192a78e5a4e164bd"}, + {file = "duckdb-0.10.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a6c2fc49875b4b54e882d68703083ca6f84b27536d57d623fc872e2f502b1078"}, + {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a66c125d0c30af210f7ee599e7821c3d1a7e09208196dafbf997d4e0cfcb81ab"}, + {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d99dd7a1d901149c7a276440d6e737b2777e17d2046f5efb0c06ad3b8cb066a6"}, + {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5ec3bbdb209e6095d202202893763e26c17c88293b88ef986b619e6c8b6715bd"}, + {file = "duckdb-0.10.3-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:2b3dec4ef8ed355d7b7230b40950b30d0def2c387a2e8cd7efc80b9d14134ecf"}, + {file = "duckdb-0.10.3-cp37-cp37m-win_amd64.whl", hash = "sha256:04129f94fb49bba5eea22f941f0fb30337f069a04993048b59e2811f52d564bc"}, + {file = "duckdb-0.10.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:d75d67024fc22c8edfd47747c8550fb3c34fb1cbcbfd567e94939ffd9c9e3ca7"}, + {file = "duckdb-0.10.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:f3796e9507c02d0ddbba2e84c994fae131da567ce3d9cbb4cbcd32fadc5fbb26"}, + {file = "duckdb-0.10.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:78e539d85ebd84e3e87ec44d28ad912ca4ca444fe705794e0de9be3dd5550c11"}, + {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a99b67ac674b4de32073e9bc604b9c2273d399325181ff50b436c6da17bf00a"}, + {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1209a354a763758c4017a1f6a9f9b154a83bed4458287af9f71d84664ddb86b6"}, + {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3b735cea64aab39b67c136ab3a571dbf834067f8472ba2f8bf0341bc91bea820"}, + {file = "duckdb-0.10.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:816ffb9f758ed98eb02199d9321d592d7a32a6cb6aa31930f4337eb22cfc64e2"}, + {file = "duckdb-0.10.3-cp38-cp38-win_amd64.whl", hash = "sha256:1631184b94c3dc38b13bce4045bf3ae7e1b0ecbfbb8771eb8d751d8ffe1b59b3"}, + {file = "duckdb-0.10.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:fb98c35fc8dd65043bc08a2414dd9f59c680d7e8656295b8969f3f2061f26c52"}, + {file = "duckdb-0.10.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7e75c9f5b6a92b2a6816605c001d30790f6d67ce627a2b848d4d6040686efdf9"}, + {file = "duckdb-0.10.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae786eddf1c2fd003466e13393b9348a44b6061af6fe7bcb380a64cac24e7df7"}, + {file = "duckdb-0.10.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9387da7b7973707b0dea2588749660dd5dd724273222680e985a2dd36787668"}, + {file = "duckdb-0.10.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:538f943bf9fa8a3a7c4fafa05f21a69539d2c8a68e557233cbe9d989ae232899"}, + {file = "duckdb-0.10.3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6930608f35025a73eb94252964f9f19dd68cf2aaa471da3982cf6694866cfa63"}, + {file = "duckdb-0.10.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:03bc54a9cde5490918aad82d7d2a34290e3dfb78d5b889c6626625c0f141272a"}, + {file = "duckdb-0.10.3-cp39-cp39-win_amd64.whl", hash = "sha256:372b6e3901d85108cafe5df03c872dfb6f0dbff66165a0cf46c47246c1957aa0"}, + {file = "duckdb-0.10.3.tar.gz", hash = "sha256:c5bd84a92bc708d3a6adffe1f554b94c6e76c795826daaaf482afc3d9c636971"}, ] [[package]] From ed83e84edc860be59fb6663b3985afe47ca5340e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 24 May 2024 23:41:59 -0700 Subject: [PATCH 52/80] Bump griffe from 0.45.1 to 0.45.2 (#765) Bumps [griffe](https://github.com/mkdocstrings/griffe) from 0.45.1 to 0.45.2. - [Release notes](https://github.com/mkdocstrings/griffe/releases) - [Changelog](https://github.com/mkdocstrings/griffe/blob/main/CHANGELOG.md) - [Commits](https://github.com/mkdocstrings/griffe/compare/0.45.1...0.45.2) --- updated-dependencies: - dependency-name: griffe dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- mkdocs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt index 5d375f50f3..f1c7b0e06d 100644 --- a/mkdocs/requirements.txt +++ b/mkdocs/requirements.txt @@ -16,7 +16,7 @@ # under the License. 
mkdocs==1.6.0 -griffe==0.45.1 +griffe==0.45.2 jinja2==3.1.4 mkdocstrings==0.25.1 mkdocstrings-python==1.10.3 From b8023d294ac057abc15a19f05ee212517362d3d5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 24 May 2024 23:42:12 -0700 Subject: [PATCH 53/80] Bump typing-extensions from 4.11.0 to 4.12.0 (#767) Bumps [typing-extensions](https://github.com/python/typing_extensions) from 4.11.0 to 4.12.0. - [Release notes](https://github.com/python/typing_extensions/releases) - [Changelog](https://github.com/python/typing_extensions/blob/main/CHANGELOG.md) - [Commits](https://github.com/python/typing_extensions/compare/4.11.0...4.12.0) --- updated-dependencies: - dependency-name: typing-extensions dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 218913620b..a0352f0434 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4083,13 +4083,13 @@ telegram = ["requests"] [[package]] name = "typing-extensions" -version = "4.11.0" +version = "4.12.0" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" files = [ - {file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"}, - {file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"}, + {file = "typing_extensions-4.12.0-py3-none-any.whl", hash = "sha256:b349c66bea9016ac22978d800cfff206d5f9816951f12a7d0ec5578b0a819594"}, + {file = "typing_extensions-4.12.0.tar.gz", hash = "sha256:8cbcdc8606ebcb0d95453ad7dc5065e6237b6aa230a31e81d0f440c30fed5fd8"}, ] [[package]] @@ -4462,4 +4462,4 @@ zstandard = ["zstandard"] [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "2c019a99dfec370111ef19bae1ca7e00f434cec159296f5fcf4aee1b4552ba06" +content-hash = "8024e9ca0aa700346e902b232337c8bad69e5cd6e482db4999446f6177e7646d" diff --git a/pyproject.toml b/pyproject.toml index fafa5231a2..3a928ec47c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,7 @@ fastavro = "1.9.4" coverage = { version = "^7.4.2", extras = ["toml"] } requests-mock = "1.12.1" moto = { version = "^5.0.2", extras = ["server"] } -typing-extensions = "4.11.0" +typing-extensions = "4.12.0" pytest-mock = "3.14.0" pyspark = "3.5.1" cython = "3.0.10" From a132be1fdb64f99ae4b3ddaf5dfe7991ba1917c4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 28 May 2024 07:03:46 +0200 Subject: [PATCH 54/80] Bump mkdocs-material from 9.5.24 to 9.5.25 (#770) --- mkdocs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt index f1c7b0e06d..22ded02b4c 100644 --- a/mkdocs/requirements.txt +++ b/mkdocs/requirements.txt @@ -23,6 +23,6 @@ mkdocstrings-python==1.10.3 mkdocs-literate-nav==0.6.1 mkdocs-autorefs==1.0.1 mkdocs-gen-files==0.5.0 -mkdocs-material==9.5.24 +mkdocs-material==9.5.25 mkdocs-material-extensions==1.3.1 mkdocs-section-index==0.3.9 From 8968996fbde6e7892e0b576837acb335a52d9caf Mon Sep 17 00:00:00 2001 From: Kev Wang Date: Mon, 27 May 2024 22:47:46 -0700 Subject: [PATCH 55/80] Add azure configuration variables 
(#745) --- pyiceberg/io/__init__.py | 7 +++++++ pyiceberg/io/fsspec.py | 21 ++++++++++++++------- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index 4b5e99d336..1a78f306c6 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -57,6 +57,13 @@ HDFS_PORT = "hdfs.port" HDFS_USER = "hdfs.user" HDFS_KERB_TICKET = "hdfs.kerberos_ticket" +ADLFS_CONNECTION_STRING = "adlfs.connection-string" +ADLFS_ACCOUNT_NAME = "adlfs.account-name" +ADLFS_ACCOUNT_KEY = "adlfs.account-key" +ADLFS_SAS_TOKEN = "adlfs.sas-token" +ADLFS_TENANT_ID = "adlfs.tenant-id" +ADLFS_CLIENT_ID = "adlfs.client-id" +ADLFS_ClIENT_SECRET = "adlfs.client-secret" GCS_TOKEN = "gcs.oauth2.token" GCS_TOKEN_EXPIRES_AT_MS = "gcs.oauth2.token-expires-at" GCS_PROJECT_ID = "gcs.project-id" diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index ee97829c2e..1089c9fe50 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -40,6 +40,12 @@ from pyiceberg.catalog import TOKEN from pyiceberg.exceptions import SignError from pyiceberg.io import ( + ADLFS_ACCOUNT_KEY, + ADLFS_ACCOUNT_NAME, + ADLFS_CLIENT_ID, + ADLFS_CONNECTION_STRING, + ADLFS_SAS_TOKEN, + ADLFS_TENANT_ID, GCS_ACCESS, GCS_CACHE_TIMEOUT, GCS_CONSISTENCY, @@ -57,6 +63,7 @@ S3_REGION, S3_SECRET_ACCESS_KEY, S3_SESSION_TOKEN, + ADLFS_ClIENT_SECRET, FileIO, InputFile, InputStream, @@ -163,13 +170,13 @@ def _adlfs(properties: Properties) -> AbstractFileSystem: from adlfs import AzureBlobFileSystem return AzureBlobFileSystem( - connection_string=properties.get("adlfs.connection-string"), - account_name=properties.get("adlfs.account-name"), - account_key=properties.get("adlfs.account-key"), - sas_token=properties.get("adlfs.sas-token"), - tenant_id=properties.get("adlfs.tenant-id"), - client_id=properties.get("adlfs.client-id"), - client_secret=properties.get("adlfs.client-secret"), + connection_string=properties.get(ADLFS_CONNECTION_STRING), + account_name=properties.get(ADLFS_ACCOUNT_NAME), + account_key=properties.get(ADLFS_ACCOUNT_KEY), + sas_token=properties.get(ADLFS_SAS_TOKEN), + tenant_id=properties.get(ADLFS_TENANT_ID), + client_id=properties.get(ADLFS_CLIENT_ID), + client_secret=properties.get(ADLFS_ClIENT_SECRET), ) From ee2a7c52b835c98dd08e74e9ef26c356692304a0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 May 2024 23:01:16 -0700 Subject: [PATCH 56/80] Bump moto from 5.0.7 to 5.0.8 (#771) Bumps [moto](https://github.com/getmoto/moto) from 5.0.7 to 5.0.8. - [Release notes](https://github.com/getmoto/moto/releases) - [Changelog](https://github.com/getmoto/moto/blob/master/CHANGELOG.md) - [Commits](https://github.com/getmoto/moto/compare/5.0.7...5.0.8) --- updated-dependencies: - dependency-name: moto dependency-type: direct:development update-type: version-update:semver-patch ... 
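As a quick illustration of the Azure configuration variables introduced in the patch above: the snippet below is a sketch only, not part of any patch in this series. It shows one way the new adlfs.* property keys could be handed to the fsspec-based FileIO, assuming abfss:// locations are routed to the adlfs filesystem; the account name, account key, and path are placeholder values.

from pyiceberg.io.fsspec import FsspecFileIO

# Only the property key names come from the patch above; the values are placeholders.
io = FsspecFileIO(properties={
    "adlfs.account-name": "mystorageaccount",
    "adlfs.account-key": "<account-key>",
    # Alternatives: "adlfs.connection-string", "adlfs.sas-token", or the
    # "adlfs.tenant-id" / "adlfs.client-id" / "adlfs.client-secret" trio.
})
# Placeholder location; assumes the abfss scheme resolves to the adlfs-backed filesystem.
metadata_file = io.new_input(
    "abfss://warehouse@mystorageaccount.dfs.core.windows.net/db/tbl/metadata/v1.metadata.json"
)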
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index a0352f0434..b56e2d89bd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2212,13 +2212,13 @@ test = ["mypy (>=1.0)", "pytest (>=7.0.0)"] [[package]] name = "moto" -version = "5.0.7" +version = "5.0.8" description = "" optional = false python-versions = ">=3.8" files = [ - {file = "moto-5.0.7-py2.py3-none-any.whl", hash = "sha256:c0214c1361fb1dc85f587d9ce17cd988c6f69ff0ed54d43789654022e0e744f2"}, - {file = "moto-5.0.7.tar.gz", hash = "sha256:f2cde691dc4bc675e318a65f018902ac7f89d61bf2646052f7df215d212f069e"}, + {file = "moto-5.0.8-py2.py3-none-any.whl", hash = "sha256:7d1035e366434bfa9fcc0621f07d5aa724b6846408071d540137a0554c46f214"}, + {file = "moto-5.0.8.tar.gz", hash = "sha256:517fb808dc718bcbdda54c6ffeaca0adc34cf6e10821bfb01216ce420a31765c"}, ] [package.dependencies] From 54aacb41c57edce8ece9278acc7eb0a7e92ef03e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 May 2024 23:01:26 -0700 Subject: [PATCH 57/80] Bump coverage from 7.5.1 to 7.5.2 (#772) Bumps [coverage](https://github.com/nedbat/coveragepy) from 7.5.1 to 7.5.2. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.5.1...7.5.2) --- updated-dependencies: - dependency-name: coverage dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 106 ++++++++++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/poetry.lock b/poetry.lock index b56e2d89bd..4ef706c873 100644 --- a/poetry.lock +++ b/poetry.lock @@ -652,63 +652,63 @@ files = [ [[package]] name = "coverage" -version = "7.5.1" +version = "7.5.2" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.8" files = [ - {file = "coverage-7.5.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0884920835a033b78d1c73b6d3bbcda8161a900f38a488829a83982925f6c2e"}, - {file = "coverage-7.5.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:39afcd3d4339329c5f58de48a52f6e4e50f6578dd6099961cf22228feb25f38f"}, - {file = "coverage-7.5.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a7b0ceee8147444347da6a66be737c9d78f3353b0681715b668b72e79203e4a"}, - {file = "coverage-7.5.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a9ca3f2fae0088c3c71d743d85404cec8df9be818a005ea065495bedc33da35"}, - {file = "coverage-7.5.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fd215c0c7d7aab005221608a3c2b46f58c0285a819565887ee0b718c052aa4e"}, - {file = "coverage-7.5.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4bf0655ab60d754491004a5efd7f9cccefcc1081a74c9ef2da4735d6ee4a6223"}, - {file = "coverage-7.5.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:61c4bf1ba021817de12b813338c9be9f0ad5b1e781b9b340a6d29fc13e7c1b5e"}, - {file = "coverage-7.5.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:db66fc317a046556a96b453a58eced5024af4582a8dbdc0c23ca4dbc0d5b3146"}, - {file = "coverage-7.5.1-cp310-cp310-win32.whl", hash = "sha256:b016ea6b959d3b9556cb401c55a37547135a587db0115635a443b2ce8f1c7228"}, - {file = "coverage-7.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:df4e745a81c110e7446b1cc8131bf986157770fa405fe90e15e850aaf7619bc8"}, - {file = "coverage-7.5.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:796a79f63eca8814ca3317a1ea443645c9ff0d18b188de470ed7ccd45ae79428"}, - {file = "coverage-7.5.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4fc84a37bfd98db31beae3c2748811a3fa72bf2007ff7902f68746d9757f3746"}, - {file = "coverage-7.5.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6175d1a0559986c6ee3f7fccfc4a90ecd12ba0a383dcc2da30c2b9918d67d8a3"}, - {file = "coverage-7.5.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fc81d5878cd6274ce971e0a3a18a8803c3fe25457165314271cf78e3aae3aa2"}, - {file = "coverage-7.5.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:556cf1a7cbc8028cb60e1ff0be806be2eded2daf8129b8811c63e2b9a6c43bca"}, - {file = "coverage-7.5.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:9981706d300c18d8b220995ad22627647be11a4276721c10911e0e9fa44c83e8"}, - {file = "coverage-7.5.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:d7fed867ee50edf1a0b4a11e8e5d0895150e572af1cd6d315d557758bfa9c057"}, - {file = "coverage-7.5.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ef48e2707fb320c8f139424a596f5b69955a85b178f15af261bab871873bb987"}, - {file = "coverage-7.5.1-cp311-cp311-win32.whl", hash = "sha256:9314d5678dcc665330df5b69c1e726a0e49b27df0461c08ca12674bcc19ef136"}, - {file = "coverage-7.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:5fa567e99765fe98f4e7d7394ce623e794d7cabb170f2ca2ac5a4174437e90dd"}, - {file = "coverage-7.5.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b6cf3764c030e5338e7f61f95bd21147963cf6aa16e09d2f74f1fa52013c1206"}, - {file = "coverage-7.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ec92012fefebee89a6b9c79bc39051a6cb3891d562b9270ab10ecfdadbc0c34"}, - {file = "coverage-7.5.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16db7f26000a07efcf6aea00316f6ac57e7d9a96501e990a36f40c965ec7a95d"}, - {file = "coverage-7.5.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:beccf7b8a10b09c4ae543582c1319c6df47d78fd732f854ac68d518ee1fb97fa"}, - {file = "coverage-7.5.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8748731ad392d736cc9ccac03c9845b13bb07d020a33423fa5b3a36521ac6e4e"}, - {file = "coverage-7.5.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7352b9161b33fd0b643ccd1f21f3a3908daaddf414f1c6cb9d3a2fd618bf2572"}, - {file = "coverage-7.5.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:7a588d39e0925f6a2bff87154752481273cdb1736270642aeb3635cb9b4cad07"}, - {file = "coverage-7.5.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:68f962d9b72ce69ea8621f57551b2fa9c70509af757ee3b8105d4f51b92b41a7"}, - {file = "coverage-7.5.1-cp312-cp312-win32.whl", hash = "sha256:f152cbf5b88aaeb836127d920dd0f5e7edff5a66f10c079157306c4343d86c19"}, - {file = "coverage-7.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:5a5740d1fb60ddf268a3811bcd353de34eb56dc24e8f52a7f05ee513b2d4f596"}, - {file = 
"coverage-7.5.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e2213def81a50519d7cc56ed643c9e93e0247f5bbe0d1247d15fa520814a7cd7"}, - {file = "coverage-7.5.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5037f8fcc2a95b1f0e80585bd9d1ec31068a9bcb157d9750a172836e98bc7a90"}, - {file = "coverage-7.5.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c3721c2c9e4c4953a41a26c14f4cef64330392a6d2d675c8b1db3b645e31f0e"}, - {file = "coverage-7.5.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca498687ca46a62ae590253fba634a1fe9836bc56f626852fb2720f334c9e4e5"}, - {file = "coverage-7.5.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cdcbc320b14c3e5877ee79e649677cb7d89ef588852e9583e6b24c2e5072661"}, - {file = "coverage-7.5.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:57e0204b5b745594e5bc14b9b50006da722827f0b8c776949f1135677e88d0b8"}, - {file = "coverage-7.5.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:8fe7502616b67b234482c3ce276ff26f39ffe88adca2acf0261df4b8454668b4"}, - {file = "coverage-7.5.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:9e78295f4144f9dacfed4f92935fbe1780021247c2fabf73a819b17f0ccfff8d"}, - {file = "coverage-7.5.1-cp38-cp38-win32.whl", hash = "sha256:1434e088b41594baa71188a17533083eabf5609e8e72f16ce8c186001e6b8c41"}, - {file = "coverage-7.5.1-cp38-cp38-win_amd64.whl", hash = "sha256:0646599e9b139988b63704d704af8e8df7fa4cbc4a1f33df69d97f36cb0a38de"}, - {file = "coverage-7.5.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4cc37def103a2725bc672f84bd939a6fe4522310503207aae4d56351644682f1"}, - {file = "coverage-7.5.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fc0b4d8bfeabd25ea75e94632f5b6e047eef8adaed0c2161ada1e922e7f7cece"}, - {file = "coverage-7.5.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d0a0f5e06881ecedfe6f3dd2f56dcb057b6dbeb3327fd32d4b12854df36bf26"}, - {file = "coverage-7.5.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9735317685ba6ec7e3754798c8871c2f49aa5e687cc794a0b1d284b2389d1bd5"}, - {file = "coverage-7.5.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d21918e9ef11edf36764b93101e2ae8cc82aa5efdc7c5a4e9c6c35a48496d601"}, - {file = "coverage-7.5.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c3e757949f268364b96ca894b4c342b41dc6f8f8b66c37878aacef5930db61be"}, - {file = "coverage-7.5.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:79afb6197e2f7f60c4824dd4b2d4c2ec5801ceb6ba9ce5d2c3080e5660d51a4f"}, - {file = "coverage-7.5.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d1d0d98d95dd18fe29dc66808e1accf59f037d5716f86a501fc0256455219668"}, - {file = "coverage-7.5.1-cp39-cp39-win32.whl", hash = "sha256:1cc0fe9b0b3a8364093c53b0b4c0c2dd4bb23acbec4c9240b5f284095ccf7981"}, - {file = "coverage-7.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:dde0070c40ea8bb3641e811c1cfbf18e265d024deff6de52c5950677a8fb1e0f"}, - {file = "coverage-7.5.1-pp38.pp39.pp310-none-any.whl", hash = "sha256:6537e7c10cc47c595828b8a8be04c72144725c383c4702703ff4e42e44577312"}, - {file = "coverage-7.5.1.tar.gz", hash = "sha256:54de9ef3a9da981f7af93eafde4ede199e0846cd819eb27c88e2b712aae9708c"}, + {file = "coverage-7.5.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:554c7327bf0fd688050348e22db7c8e163fb7219f3ecdd4732d7ed606b417263"}, + {file = 
"coverage-7.5.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d0305e02e40c7cfea5d08d6368576537a74c0eea62b77633179748d3519d6705"}, + {file = "coverage-7.5.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:829fb55ad437d757c70d5b1c51cfda9377f31506a0a3f3ac282bc6a387d6a5f1"}, + {file = "coverage-7.5.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:894b1acded706f1407a662d08e026bfd0ff1e59e9bd32062fea9d862564cfb65"}, + {file = "coverage-7.5.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe76d6dee5e4febefa83998b17926df3a04e5089e3d2b1688c74a9157798d7a2"}, + {file = "coverage-7.5.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c7ebf2a37e4f5fea3c1a11e1f47cea7d75d0f2d8ef69635ddbd5c927083211fc"}, + {file = "coverage-7.5.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:20e611fc36e1a0fc7bbf957ef9c635c8807d71fbe5643e51b2769b3cc0fb0b51"}, + {file = "coverage-7.5.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7c5c5b7ae2763533152880d5b5b451acbc1089ade2336b710a24b2b0f5239d20"}, + {file = "coverage-7.5.2-cp310-cp310-win32.whl", hash = "sha256:1e4225990a87df898e40ca31c9e830c15c2c53b1d33df592bc8ef314d71f0281"}, + {file = "coverage-7.5.2-cp310-cp310-win_amd64.whl", hash = "sha256:976cd92d9420e6e2aa6ce6a9d61f2b490e07cb468968adf371546b33b829284b"}, + {file = "coverage-7.5.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5997d418c219dcd4dcba64e50671cca849aaf0dac3d7a2eeeb7d651a5bd735b8"}, + {file = "coverage-7.5.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ec27e93bbf5976f0465e8936f02eb5add99bbe4e4e7b233607e4d7622912d68d"}, + {file = "coverage-7.5.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f11f98753800eb1ec872562a398081f6695f91cd01ce39819e36621003ec52a"}, + {file = "coverage-7.5.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e34680049eecb30b6498784c9637c1c74277dcb1db75649a152f8004fbd6646"}, + {file = "coverage-7.5.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e12536446ad4527ac8ed91d8a607813085683bcce27af69e3b31cd72b3c5960"}, + {file = "coverage-7.5.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:3d3f7744b8a8079d69af69d512e5abed4fb473057625588ce126088e50d05493"}, + {file = "coverage-7.5.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:431a3917e32223fcdb90b79fe60185864a9109631ebc05f6c5aa03781a00b513"}, + {file = "coverage-7.5.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a7c6574225f34ce45466f04751d957b5c5e6b69fca9351db017c9249786172ce"}, + {file = "coverage-7.5.2-cp311-cp311-win32.whl", hash = "sha256:2b144d142ec9987276aeff1326edbc0df8ba4afbd7232f0ca10ad57a115e95b6"}, + {file = "coverage-7.5.2-cp311-cp311-win_amd64.whl", hash = "sha256:900532713115ac58bc3491b9d2b52704a05ed408ba0918d57fd72c94bc47fba1"}, + {file = "coverage-7.5.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9a42970ce74c88bdf144df11c52c5cf4ad610d860de87c0883385a1c9d9fa4ab"}, + {file = "coverage-7.5.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26716a1118c6ce2188283b4b60a898c3be29b480acbd0a91446ced4fe4e780d8"}, + {file = "coverage-7.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60b66b0363c5a2a79fba3d1cd7430c25bbd92c923d031cae906bdcb6e054d9a2"}, + {file = 
"coverage-7.5.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5d22eba19273b2069e4efeff88c897a26bdc64633cbe0357a198f92dca94268"}, + {file = "coverage-7.5.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3bb5b92a0ab3d22dfdbfe845e2fef92717b067bdf41a5b68c7e3e857c0cff1a4"}, + {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1aef719b6559b521ae913ddeb38f5048c6d1a3d366865e8b320270b7bc4693c2"}, + {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8809c0ea0e8454f756e3bd5c36d04dddf222989216788a25bfd6724bfcee342c"}, + {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1acc2e2ef098a1d4bf535758085f508097316d738101a97c3f996bccba963ea5"}, + {file = "coverage-7.5.2-cp312-cp312-win32.whl", hash = "sha256:97de509043d3f0f2b2cd171bdccf408f175c7f7a99d36d566b1ae4dd84107985"}, + {file = "coverage-7.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:8941e35a0e991a7a20a1fa3e3182f82abe357211f2c335a9e6007067c3392fcf"}, + {file = "coverage-7.5.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5662bf0f6fb6757f5c2d6279c541a5af55a39772c2362ed0920b27e3ce0e21f7"}, + {file = "coverage-7.5.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3d9c62cff2ffb4c2a95328488fd7aa96a7a4b34873150650fe76b19c08c9c792"}, + {file = "coverage-7.5.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:74eeaa13e8200ad72fca9c5f37395fb310915cec6f1682b21375e84fd9770e84"}, + {file = "coverage-7.5.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f29bf497d51a5077994b265e976d78b09d9d0dff6ca5763dbb4804534a5d380"}, + {file = "coverage-7.5.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f96aa94739593ae0707eda9813ce363a0a0374a810ae0eced383340fc4a1f73"}, + {file = "coverage-7.5.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:51b6cee539168a912b4b3b040e4042b9e2c9a7ad9c8546c09e4eaeff3eacba6b"}, + {file = "coverage-7.5.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:59a75e6aa5c25b50b5a1499f9718f2edff54257f545718c4fb100f48d570ead4"}, + {file = "coverage-7.5.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:29da75ce20cb0a26d60e22658dd3230713c6c05a3465dd8ad040ffc991aea318"}, + {file = "coverage-7.5.2-cp38-cp38-win32.whl", hash = "sha256:23f2f16958b16152b43a39a5ecf4705757ddd284b3b17a77da3a62aef9c057ef"}, + {file = "coverage-7.5.2-cp38-cp38-win_amd64.whl", hash = "sha256:9e41c94035e5cdb362beed681b58a707e8dc29ea446ea1713d92afeded9d1ddd"}, + {file = "coverage-7.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:06d96b9b19bbe7f049c2be3c4f9e06737ec6d8ef8933c7c3a4c557ef07936e46"}, + {file = "coverage-7.5.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:878243e1206828908a6b4a9ca7b1aa8bee9eb129bf7186fc381d2646f4524ce9"}, + {file = "coverage-7.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:482df956b055d3009d10fce81af6ffab28215d7ed6ad4a15e5c8e67cb7c5251c"}, + {file = "coverage-7.5.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a35c97af60a5492e9e89f8b7153fe24eadfd61cb3a2fb600df1a25b5dab34b7e"}, + {file = "coverage-7.5.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24bb4c7859a3f757a116521d4d3a8a82befad56ea1bdacd17d6aafd113b0071e"}, + {file = 
"coverage-7.5.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:e1046aab24c48c694f0793f669ac49ea68acde6a0798ac5388abe0a5615b5ec8"}, + {file = "coverage-7.5.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:448ec61ea9ea7916d5579939362509145caaecf03161f6f13e366aebb692a631"}, + {file = "coverage-7.5.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4a00bd5ba8f1a4114720bef283cf31583d6cb1c510ce890a6da6c4268f0070b7"}, + {file = "coverage-7.5.2-cp39-cp39-win32.whl", hash = "sha256:9f805481d5eff2a96bac4da1570ef662bf970f9a16580dc2c169c8c3183fa02b"}, + {file = "coverage-7.5.2-cp39-cp39-win_amd64.whl", hash = "sha256:2c79f058e7bec26b5295d53b8c39ecb623448c74ccc8378631f5cb5c16a7e02c"}, + {file = "coverage-7.5.2-pp38.pp39.pp310-none-any.whl", hash = "sha256:40dbb8e7727560fe8ab65efcddfec1ae25f30ef02e2f2e5d78cfb52a66781ec5"}, + {file = "coverage-7.5.2.tar.gz", hash = "sha256:13017a63b0e499c59b5ba94a8542fb62864ba3016127d1e4ef30d354fc2b00e9"}, ] [package.dependencies] From 756ae625a2ea0f9c12df78430512ce991f6a1976 Mon Sep 17 00:00:00 2001 From: "Eric L (CCCS)" Date: Tue, 28 May 2024 03:52:24 -0400 Subject: [PATCH 58/80] Introduce hierarchical namespaces into SqlCatalog (#591) * Introduce hierarchical namespaces into SqlCatalog * Fix SqlCatalog unit tests broken from code update. --- pyiceberg/catalog/__init__.py | 25 +- pyiceberg/catalog/sql.py | 159 ++++--- pyiceberg/cli/console.py | 8 +- tests/catalog/test_sql.py | 847 +++++++++++++++++++++++++--------- tests/conftest.py | 13 + 5 files changed, 758 insertions(+), 294 deletions(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 0b70fe32e1..ea2bc65760 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -588,7 +588,7 @@ def identifier_to_tuple(identifier: Union[str, Identifier]) -> Identifier: If the identifier is a string, it is split into a tuple on '.'. If it is a tuple, it is used as-is. Args: - identifier (str | Identifier: an identifier, either a string or tuple of strings. + identifier (str | Identifier): an identifier, either a string or tuple of strings. Returns: Identifier: a tuple of strings. @@ -619,6 +619,29 @@ def namespace_from(identifier: Union[str, Identifier]) -> Identifier: """ return Catalog.identifier_to_tuple(identifier)[:-1] + @staticmethod + def namespace_to_string( + identifier: Union[str, Identifier], err: Union[Type[ValueError], Type[NoSuchNamespaceError]] = ValueError + ) -> str: + """Transform a namespace identifier into a string. + + Args: + identifier (Union[str, Identifier]): a namespace identifier. + err (Union[Type[ValueError], Type[NoSuchNamespaceError]]): the error type to raise when identifier is empty. + + Returns: + Identifier: Namespace identifier. 
+ """ + tuple_identifier = Catalog.identifier_to_tuple(identifier) + if len(tuple_identifier) < 1: + raise err("Empty namespace identifier") + + # Check if any segment of the tuple is an empty string + if any(segment.strip() == "" for segment in tuple_identifier): + raise err("Namespace identifier contains an empty segment or a segment with only whitespace") + + return ".".join(segment.strip() for segment in tuple_identifier) + @staticmethod def identifier_to_database( identifier: Union[str, Identifier], err: Union[Type[ValueError], Type[NoSuchNamespaceError]] = ValueError diff --git a/pyiceberg/catalog/sql.py b/pyiceberg/catalog/sql.py index 978109b2a3..6c198767e7 100644 --- a/pyiceberg/catalog/sql.py +++ b/pyiceberg/catalog/sql.py @@ -43,6 +43,7 @@ from pyiceberg.catalog import ( METADATA_LOCATION, + Catalog, MetastoreCatalog, PropertiesUpdateSummary, ) @@ -94,6 +95,16 @@ class IcebergNamespaceProperties(SqlCatalogBaseTable): class SqlCatalog(MetastoreCatalog): + """Implementation of a SQL based catalog. + + In the `JDBCCatalog` implementation, a `Namespace` is composed of a list of strings separated by dots: `'ns1.ns2.ns3'`. + And you can have as many levels as you want, but you need at least one. The `SqlCatalog` honors the same convention. + + In the `JDBCCatalog` implementation, a `TableIdentifier` is composed of an optional `Namespace` and a table name. + When a `Namespace` is present, the full name will be `'ns1.ns2.ns3.table'`. A valid `TableIdentifier` could be `'name'` (no namespace). + The `SqlCatalog` has a different convention where a `TableIdentifier` requires a `Namespace`. + """ + def __init__(self, name: str, **properties: str): super().__init__(name, **properties) @@ -136,7 +147,7 @@ def _convert_orm_to_iceberg(self, orm_table: IcebergTables) -> Table: file = io.new_input(metadata_location) metadata = FromInputFile.table_metadata(file) return Table( - identifier=(self.name, table_namespace, table_name), + identifier=(self.name,) + Catalog.identifier_to_tuple(table_namespace) + (table_name,), metadata=metadata, metadata_location=metadata_location, io=self._load_file_io(metadata.properties, metadata_location), @@ -173,11 +184,14 @@ def create_table( """ schema: Schema = self._convert_schema_if_needed(schema) # type: ignore - database_name, table_name = self.identifier_to_database_and_table(identifier) - if not self._namespace_exists(database_name): - raise NoSuchNamespaceError(f"Namespace does not exist: {database_name}") + identifier_nocatalog = self.identifier_to_tuple_without_catalog(identifier) + namespace_identifier = Catalog.namespace_from(identifier_nocatalog) + table_name = Catalog.table_name_from(identifier_nocatalog) + if not self._namespace_exists(namespace_identifier): + raise NoSuchNamespaceError(f"Namespace does not exist: {namespace_identifier}") - location = self._resolve_table_location(location, database_name, table_name) + namespace = Catalog.namespace_to_string(namespace_identifier) + location = self._resolve_table_location(location, namespace, table_name) metadata_location = self._get_metadata_location(location=location) metadata = new_table_metadata( location=location, schema=schema, partition_spec=partition_spec, sort_order=sort_order, properties=properties @@ -190,7 +204,7 @@ def create_table( session.add( IcebergTables( catalog_name=self.name, - table_namespace=database_name, + table_namespace=namespace, table_name=table_name, metadata_location=metadata_location, previous_metadata_location=None, @@ -198,7 +212,7 @@ def create_table( ) 
session.commit() except IntegrityError as e: - raise TableAlreadyExistsError(f"Table {database_name}.{table_name} already exists") from e + raise TableAlreadyExistsError(f"Table {namespace}.{table_name} already exists") from e return self.load_table(identifier=identifier) @@ -216,16 +230,19 @@ def register_table(self, identifier: Union[str, Identifier], metadata_location: TableAlreadyExistsError: If the table already exists NoSuchNamespaceError: If namespace does not exist """ - database_name, table_name = self.identifier_to_database_and_table(identifier) - if not self._namespace_exists(database_name): - raise NoSuchNamespaceError(f"Namespace does not exist: {database_name}") + identifier_tuple = self.identifier_to_tuple_without_catalog(identifier) + namespace_tuple = Catalog.namespace_from(identifier_tuple) + namespace = Catalog.namespace_to_string(namespace_tuple) + table_name = Catalog.table_name_from(identifier_tuple) + if not self._namespace_exists(namespace): + raise NoSuchNamespaceError(f"Namespace does not exist: {namespace}") with Session(self.engine) as session: try: session.add( IcebergTables( catalog_name=self.name, - table_namespace=database_name, + table_namespace=namespace, table_name=table_name, metadata_location=metadata_location, previous_metadata_location=None, @@ -233,7 +250,7 @@ def register_table(self, identifier: Union[str, Identifier], metadata_location: ) session.commit() except IntegrityError as e: - raise TableAlreadyExistsError(f"Table {database_name}.{table_name} already exists") from e + raise TableAlreadyExistsError(f"Table {namespace}.{table_name} already exists") from e return self.load_table(identifier=identifier) @@ -253,17 +270,19 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table: NoSuchTableError: If a table with the name does not exist. """ identifier_tuple = self.identifier_to_tuple_without_catalog(identifier) - database_name, table_name = self.identifier_to_database_and_table(identifier_tuple, NoSuchTableError) + namespace_tuple = Catalog.namespace_from(identifier_tuple) + namespace = Catalog.namespace_to_string(namespace_tuple) + table_name = Catalog.table_name_from(identifier_tuple) with Session(self.engine) as session: stmt = select(IcebergTables).where( IcebergTables.catalog_name == self.name, - IcebergTables.table_namespace == database_name, + IcebergTables.table_namespace == namespace, IcebergTables.table_name == table_name, ) result = session.scalar(stmt) if result: return self._convert_orm_to_iceberg(result) - raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name}") + raise NoSuchTableError(f"Table does not exist: {namespace}.{table_name}") def drop_table(self, identifier: Union[str, Identifier]) -> None: """Drop a table. @@ -275,18 +294,20 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None: NoSuchTableError: If a table with the name does not exist. 
""" identifier_tuple = self.identifier_to_tuple_without_catalog(identifier) - database_name, table_name = self.identifier_to_database_and_table(identifier_tuple, NoSuchTableError) + namespace_tuple = Catalog.namespace_from(identifier_tuple) + namespace = Catalog.namespace_to_string(namespace_tuple) + table_name = Catalog.table_name_from(identifier_tuple) with Session(self.engine) as session: if self.engine.dialect.supports_sane_rowcount: res = session.execute( delete(IcebergTables).where( IcebergTables.catalog_name == self.name, - IcebergTables.table_namespace == database_name, + IcebergTables.table_namespace == namespace, IcebergTables.table_name == table_name, ) ) if res.rowcount < 1: - raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name}") + raise NoSuchTableError(f"Table does not exist: {namespace}.{table_name}") else: try: tbl = ( @@ -294,14 +315,14 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None: .with_for_update(of=IcebergTables) .filter( IcebergTables.catalog_name == self.name, - IcebergTables.table_namespace == database_name, + IcebergTables.table_namespace == namespace, IcebergTables.table_name == table_name, ) .one() ) session.delete(tbl) except NoResultFound as e: - raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name}") from e + raise NoSuchTableError(f"Table does not exist: {namespace}.{table_name}") from e session.commit() def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table: @@ -320,10 +341,15 @@ def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: U NoSuchNamespaceError: If the target namespace does not exist. """ from_identifier_tuple = self.identifier_to_tuple_without_catalog(from_identifier) - from_database_name, from_table_name = self.identifier_to_database_and_table(from_identifier_tuple, NoSuchTableError) - to_database_name, to_table_name = self.identifier_to_database_and_table(to_identifier) - if not self._namespace_exists(to_database_name): - raise NoSuchNamespaceError(f"Namespace does not exist: {to_database_name}") + to_identifier_tuple = self.identifier_to_tuple_without_catalog(to_identifier) + from_namespace_tuple = Catalog.namespace_from(from_identifier_tuple) + from_namespace = Catalog.namespace_to_string(from_namespace_tuple) + from_table_name = Catalog.table_name_from(from_identifier_tuple) + to_namespace_tuple = Catalog.namespace_from(to_identifier_tuple) + to_namespace = Catalog.namespace_to_string(to_namespace_tuple) + to_table_name = Catalog.table_name_from(to_identifier_tuple) + if not self._namespace_exists(to_namespace): + raise NoSuchNamespaceError(f"Namespace does not exist: {to_namespace}") with Session(self.engine) as session: try: if self.engine.dialect.supports_sane_rowcount: @@ -331,10 +357,10 @@ def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: U update(IcebergTables) .where( IcebergTables.catalog_name == self.name, - IcebergTables.table_namespace == from_database_name, + IcebergTables.table_namespace == from_namespace, IcebergTables.table_name == from_table_name, ) - .values(table_namespace=to_database_name, table_name=to_table_name) + .values(table_namespace=to_namespace, table_name=to_table_name) ) result = session.execute(stmt) if result.rowcount < 1: @@ -346,18 +372,18 @@ def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: U .with_for_update(of=IcebergTables) .filter( IcebergTables.catalog_name == self.name, - 
IcebergTables.table_namespace == from_database_name, + IcebergTables.table_namespace == from_namespace, IcebergTables.table_name == from_table_name, ) .one() ) - tbl.table_namespace = to_database_name + tbl.table_namespace = to_namespace tbl.table_name = to_table_name except NoResultFound as e: raise NoSuchTableError(f"Table does not exist: {from_table_name}") from e session.commit() except IntegrityError as e: - raise TableAlreadyExistsError(f"Table {to_database_name}.{to_table_name} already exists") from e + raise TableAlreadyExistsError(f"Table {to_namespace}.{to_table_name} already exists") from e return self.load_table(to_identifier) def _commit_table(self, table_request: CommitTableRequest) -> CommitTableResponse: @@ -377,7 +403,9 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons tuple(table_request.identifier.namespace.root + [table_request.identifier.name]) ) current_table = self.load_table(identifier_tuple) - database_name, table_name = self.identifier_to_database_and_table(identifier_tuple, NoSuchTableError) + namespace_tuple = Catalog.namespace_from(identifier_tuple) + namespace = Catalog.namespace_to_string(namespace_tuple) + table_name = Catalog.table_name_from(identifier_tuple) base_metadata = current_table.metadata for requirement in table_request.requirements: requirement.validate(base_metadata) @@ -398,7 +426,7 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons update(IcebergTables) .where( IcebergTables.catalog_name == self.name, - IcebergTables.table_namespace == database_name, + IcebergTables.table_namespace == namespace, IcebergTables.table_name == table_name, IcebergTables.metadata_location == current_table.metadata_location, ) @@ -406,7 +434,7 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons ) result = session.execute(stmt) if result.rowcount < 1: - raise CommitFailedException(f"Table has been updated by another process: {database_name}.{table_name}") + raise CommitFailedException(f"Table has been updated by another process: {namespace}.{table_name}") else: try: tbl = ( @@ -414,7 +442,7 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons .with_for_update(of=IcebergTables) .filter( IcebergTables.catalog_name == self.name, - IcebergTables.table_namespace == database_name, + IcebergTables.table_namespace == namespace, IcebergTables.table_name == table_name, IcebergTables.metadata_location == current_table.metadata_location, ) @@ -423,13 +451,14 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons tbl.metadata_location = new_metadata_location tbl.previous_metadata_location = current_table.metadata_location except NoResultFound as e: - raise CommitFailedException(f"Table has been updated by another process: {database_name}.{table_name}") from e + raise CommitFailedException(f"Table has been updated by another process: {namespace}.{table_name}") from e session.commit() return CommitTableResponse(metadata=updated_metadata, metadata_location=new_metadata_location) def _namespace_exists(self, identifier: Union[str, Identifier]) -> bool: - namespace = self.identifier_to_database(identifier) + namespace_tuple = Catalog.identifier_to_tuple(identifier) + namespace = Catalog.namespace_to_string(namespace_tuple, NoSuchNamespaceError) with Session(self.engine) as session: stmt = ( select(IcebergTables) @@ -462,18 +491,20 @@ def create_namespace(self, namespace: Union[str, Identifier], properties: Proper Raises: 
NamespaceAlreadyExistsError: If a namespace with the given name already exists. """ + if self._namespace_exists(namespace): + raise NamespaceAlreadyExistsError(f"Namespace {namespace} already exists") + if not properties: properties = IcebergNamespaceProperties.NAMESPACE_MINIMAL_PROPERTIES - database_name = self.identifier_to_database(namespace) - if self._namespace_exists(database_name): - raise NamespaceAlreadyExistsError(f"Database {database_name} already exists") - create_properties = properties if properties else IcebergNamespaceProperties.NAMESPACE_MINIMAL_PROPERTIES with Session(self.engine) as session: for key, value in create_properties.items(): session.add( IcebergNamespaceProperties( - catalog_name=self.name, namespace=database_name, property_key=key, property_value=value + catalog_name=self.name, + namespace=Catalog.namespace_to_string(namespace, NoSuchNamespaceError), + property_key=key, + property_value=value, ) ) session.commit() @@ -488,16 +519,16 @@ def drop_namespace(self, namespace: Union[str, Identifier]) -> None: NoSuchNamespaceError: If a namespace with the given name does not exist. NamespaceNotEmptyError: If the namespace is not empty. """ - database_name = self.identifier_to_database(namespace, NoSuchNamespaceError) - if self._namespace_exists(database_name): - if tables := self.list_tables(database_name): - raise NamespaceNotEmptyError(f"Database {database_name} is not empty. {len(tables)} tables exist.") + if self._namespace_exists(namespace): + namespace_str = Catalog.namespace_to_string(namespace) + if tables := self.list_tables(namespace): + raise NamespaceNotEmptyError(f"Namespace {namespace_str} is not empty. {len(tables)} tables exist.") with Session(self.engine) as session: session.execute( delete(IcebergNamespaceProperties).where( IcebergNamespaceProperties.catalog_name == self.name, - IcebergNamespaceProperties.namespace == database_name, + IcebergNamespaceProperties.namespace == namespace_str, ) ) session.commit() @@ -516,14 +547,14 @@ def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]: Raises: NoSuchNamespaceError: If a namespace with the given name does not exist. """ - database_name = self.identifier_to_database(namespace, NoSuchNamespaceError) + if namespace and not self._namespace_exists(namespace): + raise NoSuchNamespaceError(f"Namespace does not exist: {namespace}") - stmt = select(IcebergTables).where( - IcebergTables.catalog_name == self.name, IcebergTables.table_namespace == database_name - ) + namespace = Catalog.namespace_to_string(namespace) + stmt = select(IcebergTables).where(IcebergTables.catalog_name == self.name, IcebergTables.table_namespace == namespace) with Session(self.engine) as session: result = session.scalars(stmt) - return [(table.table_namespace, table.table_name) for table in result] + return [(Catalog.identifier_to_tuple(table.table_namespace) + (table.table_name,)) for table in result] def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]: """List namespaces from the given namespace. If not given, list top-level namespaces from the catalog. 
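To make the hierarchical-namespace behaviour of the reworked SqlCatalog concrete, here is a minimal illustrative sketch. It is not taken from this patch series: the SQLite URI, warehouse path, and the ns1.ns2.ns3/tbl names are invented for the example, and it assumes a local file:// warehouse is acceptable.

import os

from pyiceberg.catalog import Catalog
from pyiceberg.catalog.sql import SqlCatalog
from pyiceberg.schema import Schema
from pyiceberg.types import NestedField, StringType

# Namespaces are dot-separated, multi-level strings, mirroring the JDBCCatalog convention.
assert Catalog.namespace_to_string(("ns1", "ns2", "ns3")) == "ns1.ns2.ns3"

# Placeholder catalog URI and warehouse path.
os.makedirs("/tmp/warehouse", exist_ok=True)
catalog = SqlCatalog("default", **{"uri": "sqlite:///:memory:", "warehouse": "file:///tmp/warehouse"})

catalog.create_namespace("ns1.ns2.ns3")  # one namespace with three levels

schema = Schema(NestedField(field_id=1, name="data", field_type=StringType(), required=False))
catalog.create_table(("ns1", "ns2", "ns3", "tbl"), schema=schema)

# list_tables returns full identifier tuples, namespace levels included.
assert catalog.list_tables("ns1.ns2.ns3") == [("ns1", "ns2", "ns3", "tbl")]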
@@ -543,15 +574,15 @@ def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identi
         table_stmt = select(IcebergTables.table_namespace).where(IcebergTables.catalog_name == self.name)
         namespace_stmt = select(IcebergNamespaceProperties.namespace).where(IcebergNamespaceProperties.catalog_name == self.name)
         if namespace:
-            database_name = self.identifier_to_database(namespace, NoSuchNamespaceError)
-            table_stmt = table_stmt.where(IcebergTables.table_namespace.like(database_name))
-            namespace_stmt = namespace_stmt.where(IcebergNamespaceProperties.namespace.like(database_name))
+            namespace_str = Catalog.namespace_to_string(namespace, NoSuchNamespaceError)
+            table_stmt = table_stmt.where(IcebergTables.table_namespace.like(namespace_str))
+            namespace_stmt = namespace_stmt.where(IcebergNamespaceProperties.namespace.like(namespace_str))
         stmt = union(
             table_stmt,
             namespace_stmt,
         )
         with Session(self.engine) as session:
-            return [self.identifier_to_tuple(namespace_col) for namespace_col in session.execute(stmt).scalars()]
+            return [Catalog.identifier_to_tuple(namespace_col) for namespace_col in session.execute(stmt).scalars()]
 
     def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Properties:
         """Get properties for a namespace.
 
@@ -565,12 +596,12 @@ def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Proper
         Raises:
             NoSuchNamespaceError: If a namespace with the given name does not exist.
         """
-        database_name = self.identifier_to_database(namespace)
-        if not self._namespace_exists(database_name):
-            raise NoSuchNamespaceError(f"Database {database_name} does not exists")
+        namespace_str = Catalog.namespace_to_string(namespace)
+        if not self._namespace_exists(namespace):
+            raise NoSuchNamespaceError(f"Namespace {namespace_str} does not exist")
 
         stmt = select(IcebergNamespaceProperties).where(
-            IcebergNamespaceProperties.catalog_name == self.name, IcebergNamespaceProperties.namespace == database_name
+            IcebergNamespaceProperties.catalog_name == self.name, IcebergNamespaceProperties.namespace == namespace_str
         )
         with Session(self.engine) as session:
             result = session.scalars(stmt)
 
@@ -590,9 +621,9 @@ def update_namespace_properties(
         NoSuchNamespaceError: If a namespace with the given name does not exist.
         ValueError: If removals and updates have overlapping keys.
""" - database_name = self.identifier_to_database(namespace) - if not self._namespace_exists(database_name): - raise NoSuchNamespaceError(f"Database {database_name} does not exists") + namespace_str = Catalog.namespace_to_string(namespace) + if not self._namespace_exists(namespace): + raise NoSuchNamespaceError(f"Namespace {namespace_str} does not exists") current_properties = self.load_namespace_properties(namespace=namespace) properties_update_summary = self._get_updated_props_and_update_summary( @@ -603,7 +634,7 @@ def update_namespace_properties( if removals: delete_stmt = delete(IcebergNamespaceProperties).where( IcebergNamespaceProperties.catalog_name == self.name, - IcebergNamespaceProperties.namespace == database_name, + IcebergNamespaceProperties.namespace == namespace_str, IcebergNamespaceProperties.property_key.in_(removals), ) session.execute(delete_stmt) @@ -614,14 +645,14 @@ def update_namespace_properties( # This is not a problem since it runs in a single transaction delete_stmt = delete(IcebergNamespaceProperties).where( IcebergNamespaceProperties.catalog_name == self.name, - IcebergNamespaceProperties.namespace == database_name, + IcebergNamespaceProperties.namespace == namespace_str, IcebergNamespaceProperties.property_key.in_(set(updates.keys())), ) session.execute(delete_stmt) insert_stmt = insert(IcebergNamespaceProperties) for property_key, property_value in updates.items(): insert_stmt = insert_stmt.values( - catalog_name=self.name, namespace=database_name, property_key=property_key, property_value=property_value + catalog_name=self.name, namespace=namespace_str, property_key=property_key, property_value=property_value ) session.execute(insert_stmt) session.commit() diff --git a/pyiceberg/cli/console.py b/pyiceberg/cli/console.py index 0fbda10960..d1833df081 100644 --- a/pyiceberg/cli/console.py +++ b/pyiceberg/cli/console.py @@ -112,9 +112,13 @@ def list(ctx: Context, parent: Optional[str]) -> None: # pylint: disable=redefi """List tables or namespaces.""" catalog, output = _catalog_and_output(ctx) - identifiers = catalog.list_namespaces(parent or ()) - if not identifiers and parent: + identifiers = [] + if parent: + # Do we have tables under parent namespace? identifiers = catalog.list_tables(parent) + if not identifiers: + # List hierarchical namespaces if parent, root namespaces otherwise. 
+ identifiers = catalog.list_namespaces(parent or ()) output.identifiers(identifiers) diff --git a/tests/catalog/test_sql.py b/tests/catalog/test_sql.py index efa7b746a9..285cfd9ab9 100644 --- a/tests/catalog/test_sql.py +++ b/tests/catalog/test_sql.py @@ -17,7 +17,7 @@ import os from pathlib import Path -from typing import Generator, List +from typing import Any, Generator, List import pyarrow as pa import pytest @@ -25,6 +25,9 @@ from pytest_lazyfixture import lazy_fixture from sqlalchemy.exc import ArgumentError, IntegrityError +from pyiceberg.catalog import ( + Catalog, +) from pyiceberg.catalog.sql import SqlCatalog from pyiceberg.exceptions import ( CommitFailedException, @@ -52,51 +55,90 @@ from pyiceberg.types import IntegerType -@pytest.fixture(name="random_identifier") -def fixture_random_identifier(warehouse: Path, database_name: str, table_name: str) -> Identifier: +@pytest.fixture(scope="module") +def catalog_name() -> str: + return "test_sql_catalog" + + +@pytest.fixture(name="random_table_identifier") +def fixture_random_table_identifier(warehouse: Path, database_name: str, table_name: str) -> Identifier: os.makedirs(f"{warehouse}/{database_name}.db/{table_name}/metadata/", exist_ok=True) return database_name, table_name -@pytest.fixture(name="another_random_identifier") -def fixture_another_random_identifier(warehouse: Path, database_name: str, table_name: str) -> Identifier: +@pytest.fixture(name="random_table_identifier_with_catalog") +def fixture_random_table_identifier_with_catalog( + warehouse: Path, catalog_name: str, database_name: str, table_name: str +) -> Identifier: + os.makedirs(f"{warehouse}/{database_name}.db/{table_name}/metadata/", exist_ok=True) + return catalog_name, database_name, table_name + + +@pytest.fixture(name="another_random_table_identifier") +def fixture_another_random_table_identifier(warehouse: Path, database_name: str, table_name: str) -> Identifier: database_name = database_name + "_new" table_name = table_name + "_new" os.makedirs(f"{warehouse}/{database_name}.db/{table_name}/metadata/", exist_ok=True) return database_name, table_name +@pytest.fixture(name="another_random_table_identifier_with_catalog") +def fixture_another_random_table_identifier_with_catalog( + warehouse: Path, catalog_name: str, database_name: str, table_name: str +) -> Identifier: + database_name = database_name + "_new" + table_name = table_name + "_new" + os.makedirs(f"{warehouse}/{database_name}.db/{table_name}/metadata/", exist_ok=True) + return catalog_name, database_name, table_name + + +@pytest.fixture(name="random_hierarchical_identifier") +def fixture_random_hierarchical_identifier(warehouse: Path, hierarchical_namespace_name: str, table_name: str) -> Identifier: + os.makedirs(f"{warehouse}/{hierarchical_namespace_name}.db/{table_name}/metadata/", exist_ok=True) + return Catalog.identifier_to_tuple(".".join((hierarchical_namespace_name, table_name))) + + +@pytest.fixture(name="another_random_hierarchical_identifier") +def fixture_another_random_hierarchical_identifier( + warehouse: Path, hierarchical_namespace_name: str, table_name: str +) -> Identifier: + hierarchical_namespace_name = hierarchical_namespace_name + "_new" + table_name = table_name + "_new" + os.makedirs(f"{warehouse}/{hierarchical_namespace_name}.db/{table_name}/metadata/", exist_ok=True) + return Catalog.identifier_to_tuple(".".join((hierarchical_namespace_name, table_name))) + + @pytest.fixture(scope="module") -def catalog_memory(warehouse: Path) -> Generator[SqlCatalog, None, None]: +def 
catalog_memory(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, None, None]: props = { "uri": "sqlite:///:memory:", "warehouse": f"file://{warehouse}", } - catalog = SqlCatalog("test_sql_catalog", **props) + catalog = SqlCatalog(catalog_name, **props) catalog.create_tables() yield catalog catalog.destroy_tables() @pytest.fixture(scope="module") -def catalog_sqlite(warehouse: Path) -> Generator[SqlCatalog, None, None]: +def catalog_sqlite(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, None, None]: props = { "uri": f"sqlite:////{warehouse}/sql-catalog.db", "warehouse": f"file://{warehouse}", } - catalog = SqlCatalog("test_sql_catalog", **props) + catalog = SqlCatalog(catalog_name, **props) catalog.create_tables() yield catalog catalog.destroy_tables() @pytest.fixture(scope="module") -def catalog_sqlite_without_rowcount(warehouse: Path) -> Generator[SqlCatalog, None, None]: +def catalog_sqlite_without_rowcount(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, None, None]: props = { "uri": f"sqlite:////{warehouse}/sql-catalog.db", "warehouse": f"file://{warehouse}", } - catalog = SqlCatalog("test_sql_catalog", **props) + catalog = SqlCatalog(catalog_name, **props) catalog.engine.dialect.supports_sane_rowcount = False catalog.create_tables() yield catalog @@ -104,26 +146,26 @@ def catalog_sqlite_without_rowcount(warehouse: Path) -> Generator[SqlCatalog, No @pytest.fixture(scope="module") -def catalog_sqlite_fsspec(warehouse: Path) -> Generator[SqlCatalog, None, None]: +def catalog_sqlite_fsspec(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, None, None]: props = { "uri": f"sqlite:////{warehouse}/sql-catalog.db", "warehouse": f"file://{warehouse}", PY_IO_IMPL: FSSPEC_FILE_IO, } - catalog = SqlCatalog("test_sql_catalog", **props) + catalog = SqlCatalog(catalog_name, **props) catalog.create_tables() yield catalog catalog.destroy_tables() -def test_creation_with_no_uri() -> None: +def test_creation_with_no_uri(catalog_name: str) -> None: with pytest.raises(NoSuchPropertyException): - SqlCatalog("test_ddb_catalog", not_uri="unused") + SqlCatalog(catalog_name, not_uri="unused") -def test_creation_with_unsupported_uri() -> None: +def test_creation_with_unsupported_uri(catalog_name: str) -> None: with pytest.raises(ArgumentError): - SqlCatalog("test_ddb_catalog", uri="unsupported:xxx") + SqlCatalog(catalog_name, uri="unsupported:xxx") @pytest.mark.parametrize( @@ -146,13 +188,22 @@ def test_create_tables_idempotency(catalog: SqlCatalog) -> None: lazy_fixture('catalog_sqlite'), ], ) -def test_create_table_default_sort_order(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, table_schema_nested) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_create_table_default_sort_order(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, table_schema_nested) assert table.sort_order().order_id == 0, "Order ID must match" assert 
table.sort_order().is_unsorted is True, "Order must be unsorted" - catalog.drop_table(random_identifier) + catalog.drop_table(table_identifier) @pytest.mark.parametrize( @@ -162,15 +213,24 @@ def test_create_table_default_sort_order(catalog: SqlCatalog, table_schema_neste lazy_fixture('catalog_sqlite'), ], ) -def test_create_v1_table(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, table_schema_nested, properties={"format-version": "1"}) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_create_v1_table(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, table_schema_nested, properties={"format-version": "1"}) assert table.sort_order().order_id == 0, "Order ID must match" assert table.sort_order().is_unsorted is True, "Order must be unsorted" assert table.format_version == 1 assert table.spec() == UNPARTITIONED_PARTITION_SPEC - catalog.drop_table(random_identifier) + catalog.drop_table(table_identifier) @pytest.mark.parametrize( @@ -180,17 +240,26 @@ def test_create_v1_table(catalog: SqlCatalog, table_schema_nested: Schema, rando lazy_fixture('catalog_sqlite'), ], ) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) def test_create_table_with_pyarrow_schema( catalog: SqlCatalog, pyarrow_schema_simple_without_ids: pa.Schema, iceberg_table_schema_simple: Schema, - random_identifier: Identifier, + table_identifier: Identifier, ) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, pyarrow_schema_simple_without_ids) + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, pyarrow_schema_simple_without_ids) assert table.schema() == iceberg_table_schema_simple - catalog.drop_table(random_identifier) + catalog.drop_table(table_identifier) @pytest.mark.parametrize( @@ -200,7 +269,15 @@ def test_create_table_with_pyarrow_schema( lazy_fixture('catalog_sqlite'), ], ) -def test_write_pyarrow_schema(catalog: SqlCatalog, random_identifier: Identifier) -> None: +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_write_pyarrow_schema(catalog: SqlCatalog, table_identifier: Identifier) -> None: import pyarrow as pa pyarrow_table = pa.Table.from_arrays( @@ -217,9 +294,10 @@ def test_write_pyarrow_schema(catalog: SqlCatalog, random_identifier: Identifier pa.field('large', pa.large_string(), nullable=True), ]), ) - database_name, _table_name = random_identifier - 
catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, pyarrow_table.schema) + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, pyarrow_table.schema) table.overwrite(pyarrow_table) @@ -230,18 +308,27 @@ def test_write_pyarrow_schema(catalog: SqlCatalog, random_identifier: Identifier lazy_fixture('catalog_sqlite'), ], ) -def test_create_table_custom_sort_order(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_create_table_custom_sort_order(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) order = SortOrder(SortField(source_id=2, transform=IdentityTransform(), null_order=NullOrder.NULLS_FIRST)) - table = catalog.create_table(random_identifier, table_schema_nested, sort_order=order) + table = catalog.create_table(table_identifier, table_schema_nested, sort_order=order) given_sort_order = table.sort_order() assert given_sort_order.order_id == 1, "Order ID must match" assert len(given_sort_order.fields) == 1, "Order must have 1 field" assert given_sort_order.fields[0].direction == SortDirection.ASC, "Direction must match" assert given_sort_order.fields[0].null_order == NullOrder.NULLS_FIRST, "Null order must match" assert isinstance(given_sort_order.fields[0].transform, IdentityTransform), "Transform must match" - catalog.drop_table(random_identifier) + catalog.drop_table(table_identifier) @pytest.mark.parametrize( @@ -251,17 +338,26 @@ def test_create_table_custom_sort_order(catalog: SqlCatalog, table_schema_nested lazy_fixture('catalog_sqlite'), ], ) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) def test_create_table_with_default_warehouse_location( - warehouse: Path, catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier + warehouse: Path, catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier ) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - catalog.create_table(random_identifier, table_schema_nested) - table = catalog.load_table(random_identifier) - assert table.identifier == (catalog.name,) + random_identifier + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + catalog.create_table(table_identifier, table_schema_nested) + table = catalog.load_table(table_identifier) + assert table.identifier == (catalog.name,) + table_identifier_nocatalog assert table.metadata_location.startswith(f"file://{warehouse}") assert os.path.exists(table.metadata_location[len("file://") :]) - 
catalog.drop_table(random_identifier) + catalog.drop_table(table_identifier) @pytest.mark.parametrize( @@ -271,19 +367,29 @@ def test_create_table_with_default_warehouse_location( lazy_fixture('catalog_sqlite'), ], ) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) def test_create_table_with_given_location_removes_trailing_slash( - warehouse: Path, catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier + warehouse: Path, catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier ) -> None: - database_name, table_name = random_identifier - location = f"file://{warehouse}/{database_name}.db/{table_name}-given" - catalog.create_namespace(database_name) - catalog.create_table(random_identifier, table_schema_nested, location=f"{location}/") - table = catalog.load_table(random_identifier) - assert table.identifier == (catalog.name,) + random_identifier + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + table_name = Catalog.table_name_from(table_identifier_nocatalog) + location = f"file://{warehouse}/{catalog.name}.db/{table_name}-given" + catalog.create_namespace(namespace) + catalog.create_table(table_identifier, table_schema_nested, location=f"{location}/") + table = catalog.load_table(table_identifier) + assert table.identifier == (catalog.name,) + table_identifier_nocatalog assert table.metadata_location.startswith(f"file://{warehouse}") assert os.path.exists(table.metadata_location[len("file://") :]) assert table.location() == location - catalog.drop_table(random_identifier) + catalog.drop_table(table_identifier) @pytest.mark.parametrize( @@ -293,12 +399,21 @@ def test_create_table_with_given_location_removes_trailing_slash( lazy_fixture('catalog_sqlite'), ], ) -def test_create_duplicated_table(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - catalog.create_table(random_identifier, table_schema_nested) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_create_duplicated_table(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + catalog.create_table(table_identifier, table_schema_nested) with pytest.raises(TableAlreadyExistsError): - catalog.create_table(random_identifier, table_schema_nested) + catalog.create_table(table_identifier, table_schema_nested) @pytest.mark.parametrize( @@ -308,13 +423,22 @@ def test_create_duplicated_table(catalog: SqlCatalog, table_schema_nested: Schem lazy_fixture('catalog_sqlite'), ], ) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) def test_create_table_if_not_exists_duplicated_table( - catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier 
+ catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier ) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table1 = catalog.create_table(random_identifier, table_schema_nested) - table2 = catalog.create_table_if_not_exists(random_identifier, table_schema_nested) + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table1 = catalog.create_table(table_identifier, table_schema_nested) + table2 = catalog.create_table_if_not_exists(table_identifier, table_schema_nested) assert table1.identifier == table2.identifier @@ -339,7 +463,7 @@ def test_create_table_with_non_existing_namespace(catalog: SqlCatalog, table_sch ], ) def test_create_table_without_namespace(catalog: SqlCatalog, table_schema_nested: Schema, table_name: str) -> None: - with pytest.raises(ValueError): + with pytest.raises(NoSuchNamespaceError): catalog.create_table(table_name, table_schema_nested) @@ -350,14 +474,23 @@ def test_create_table_without_namespace(catalog: SqlCatalog, table_schema_nested lazy_fixture('catalog_sqlite'), ], ) -def test_register_table(catalog: SqlCatalog, random_identifier: Identifier, metadata_location: str) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.register_table(random_identifier, metadata_location) - assert table.identifier == (catalog.name,) + random_identifier +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_register_table(catalog: SqlCatalog, table_identifier: Identifier, metadata_location: str) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.register_table(table_identifier, metadata_location) + assert table.identifier == (catalog.name,) + table_identifier_nocatalog assert table.metadata_location == metadata_location assert os.path.exists(metadata_location) - catalog.drop_table(random_identifier) + catalog.drop_table(table_identifier) @pytest.mark.parametrize( @@ -367,12 +500,21 @@ def test_register_table(catalog: SqlCatalog, random_identifier: Identifier, meta lazy_fixture('catalog_sqlite'), ], ) -def test_register_existing_table(catalog: SqlCatalog, random_identifier: Identifier, metadata_location: str) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - catalog.register_table(random_identifier, metadata_location) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_register_existing_table(catalog: SqlCatalog, table_identifier: Identifier, metadata_location: str) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + catalog.register_table(table_identifier, metadata_location) with pytest.raises(TableAlreadyExistsError): - catalog.register_table(random_identifier, metadata_location) + 
catalog.register_table(table_identifier, metadata_location) @pytest.mark.parametrize( @@ -407,11 +549,20 @@ def test_register_table_without_namespace(catalog: SqlCatalog, metadata_location lazy_fixture('catalog_sqlite'), ], ) -def test_load_table(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, table_schema_nested) - loaded_table = catalog.load_table(random_identifier) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_load_table(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, table_schema_nested) + loaded_table = catalog.load_table(table_identifier) assert table.identifier == loaded_table.identifier assert table.metadata_location == loaded_table.metadata_location assert table.metadata == loaded_table.metadata @@ -424,12 +575,21 @@ def test_load_table(catalog: SqlCatalog, table_schema_nested: Schema, random_ide lazy_fixture('catalog_sqlite'), ], ) -def test_load_table_from_self_identifier(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, table_schema_nested) - intermediate = catalog.load_table(random_identifier) - assert intermediate.identifier == (catalog.name,) + random_identifier +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_load_table_from_self_identifier(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, table_schema_nested) + intermediate = catalog.load_table(table_identifier) + assert intermediate.identifier == (catalog.name,) + table_identifier_nocatalog loaded_table = catalog.load_table(intermediate.identifier) assert table.identifier == loaded_table.identifier assert table.metadata_location == loaded_table.metadata_location @@ -444,14 +604,23 @@ def test_load_table_from_self_identifier(catalog: SqlCatalog, table_schema_neste lazy_fixture('catalog_sqlite_without_rowcount'), ], ) -def test_drop_table(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, table_schema_nested) - assert table.identifier == (catalog.name,) + random_identifier - catalog.drop_table(random_identifier) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + 
lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_drop_table(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, table_schema_nested) + assert table.identifier == (catalog.name,) + table_identifier_nocatalog + catalog.drop_table(table_identifier) with pytest.raises(NoSuchTableError): - catalog.load_table(random_identifier) + catalog.load_table(table_identifier) @pytest.mark.parametrize( @@ -462,16 +631,25 @@ def test_drop_table(catalog: SqlCatalog, table_schema_nested: Schema, random_ide lazy_fixture('catalog_sqlite_without_rowcount'), ], ) -def test_drop_table_from_self_identifier(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, table_schema_nested) - assert table.identifier == (catalog.name,) + random_identifier +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_drop_table_from_self_identifier(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, table_schema_nested) + assert table.identifier == (catalog.name,) + table_identifier_nocatalog catalog.drop_table(table.identifier) with pytest.raises(NoSuchTableError): catalog.load_table(table.identifier) with pytest.raises(NoSuchTableError): - catalog.load_table(random_identifier) + catalog.load_table(table_identifier) @pytest.mark.parametrize( @@ -482,9 +660,17 @@ def test_drop_table_from_self_identifier(catalog: SqlCatalog, table_schema_neste lazy_fixture('catalog_sqlite_without_rowcount'), ], ) -def test_drop_table_that_does_not_exist(catalog: SqlCatalog, random_identifier: Identifier) -> None: +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_drop_table_that_does_not_exist(catalog: SqlCatalog, table_identifier: Identifier) -> None: with pytest.raises(NoSuchTableError): - catalog.drop_table(random_identifier) + catalog.drop_table(table_identifier) @pytest.mark.parametrize( @@ -495,21 +681,39 @@ def test_drop_table_that_does_not_exist(catalog: SqlCatalog, random_identifier: lazy_fixture('catalog_sqlite_without_rowcount'), ], ) +@pytest.mark.parametrize( + "from_table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +@pytest.mark.parametrize( + "to_table_identifier", + [ + lazy_fixture("another_random_table_identifier"), + lazy_fixture("another_random_hierarchical_identifier"), + lazy_fixture("another_random_table_identifier_with_catalog"), + ], +) def test_rename_table( - catalog: SqlCatalog, table_schema_nested: Schema, 
random_identifier: Identifier, another_random_identifier: Identifier + catalog: SqlCatalog, table_schema_nested: Schema, from_table_identifier: Identifier, to_table_identifier: Identifier ) -> None: - from_database_name, _from_table_name = random_identifier - to_database_name, _to_table_name = another_random_identifier - catalog.create_namespace(from_database_name) - catalog.create_namespace(to_database_name) - table = catalog.create_table(random_identifier, table_schema_nested) - assert table.identifier == (catalog.name,) + random_identifier - catalog.rename_table(random_identifier, another_random_identifier) - new_table = catalog.load_table(another_random_identifier) - assert new_table.identifier == (catalog.name,) + another_random_identifier + from_table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(from_table_identifier) + to_table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(to_table_identifier) + from_namespace = Catalog.namespace_from(from_table_identifier_nocatalog) + to_namespace = Catalog.namespace_from(to_table_identifier_nocatalog) + catalog.create_namespace(from_namespace) + catalog.create_namespace(to_namespace) + table = catalog.create_table(from_table_identifier, table_schema_nested) + assert table.identifier == (catalog.name,) + from_table_identifier_nocatalog + catalog.rename_table(from_table_identifier, to_table_identifier) + new_table = catalog.load_table(to_table_identifier) + assert new_table.identifier == (catalog.name,) + to_table_identifier_nocatalog assert new_table.metadata_location == table.metadata_location with pytest.raises(NoSuchTableError): - catalog.load_table(random_identifier) + catalog.load_table(from_table_identifier) @pytest.mark.parametrize( @@ -520,23 +724,41 @@ def test_rename_table( lazy_fixture('catalog_sqlite_without_rowcount'), ], ) +@pytest.mark.parametrize( + "from_table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +@pytest.mark.parametrize( + "to_table_identifier", + [ + lazy_fixture("another_random_table_identifier"), + lazy_fixture("another_random_hierarchical_identifier"), + lazy_fixture("another_random_table_identifier_with_catalog"), + ], +) def test_rename_table_from_self_identifier( - catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier, another_random_identifier: Identifier + catalog: SqlCatalog, table_schema_nested: Schema, from_table_identifier: Identifier, to_table_identifier: Identifier ) -> None: - from_database_name, _from_table_name = random_identifier - to_database_name, _to_table_name = another_random_identifier - catalog.create_namespace(from_database_name) - catalog.create_namespace(to_database_name) - table = catalog.create_table(random_identifier, table_schema_nested) - assert table.identifier == (catalog.name,) + random_identifier - catalog.rename_table(table.identifier, another_random_identifier) - new_table = catalog.load_table(another_random_identifier) - assert new_table.identifier == (catalog.name,) + another_random_identifier + from_table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(from_table_identifier) + to_table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(to_table_identifier) + from_namespace = Catalog.namespace_from(from_table_identifier_nocatalog) + to_namespace = Catalog.namespace_from(to_table_identifier_nocatalog) + catalog.create_namespace(from_namespace) + 
catalog.create_namespace(to_namespace) + table = catalog.create_table(from_table_identifier, table_schema_nested) + assert table.identifier == (catalog.name,) + from_table_identifier_nocatalog + catalog.rename_table(table.identifier, to_table_identifier) + new_table = catalog.load_table(to_table_identifier) + assert new_table.identifier == (catalog.name,) + to_table_identifier_nocatalog assert new_table.metadata_location == table.metadata_location with pytest.raises(NoSuchTableError): catalog.load_table(table.identifier) with pytest.raises(NoSuchTableError): - catalog.load_table(random_identifier) + catalog.load_table(from_table_identifier) @pytest.mark.parametrize( @@ -547,19 +769,37 @@ def test_rename_table_from_self_identifier( lazy_fixture('catalog_sqlite_without_rowcount'), ], ) +@pytest.mark.parametrize( + "from_table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +@pytest.mark.parametrize( + "to_table_identifier", + [ + lazy_fixture("another_random_table_identifier"), + lazy_fixture("another_random_hierarchical_identifier"), + lazy_fixture("another_random_table_identifier_with_catalog"), + ], +) def test_rename_table_to_existing_one( - catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier, another_random_identifier: Identifier + catalog: SqlCatalog, table_schema_nested: Schema, from_table_identifier: Identifier, to_table_identifier: Identifier ) -> None: - from_database_name, _from_table_name = random_identifier - to_database_name, _to_table_name = another_random_identifier - catalog.create_namespace(from_database_name) - catalog.create_namespace(to_database_name) - table = catalog.create_table(random_identifier, table_schema_nested) - assert table.identifier == (catalog.name,) + random_identifier - new_table = catalog.create_table(another_random_identifier, table_schema_nested) - assert new_table.identifier == (catalog.name,) + another_random_identifier + from_table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(from_table_identifier) + to_table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(to_table_identifier) + from_namespace = Catalog.namespace_from(from_table_identifier_nocatalog) + to_namespace = Catalog.namespace_from(to_table_identifier_nocatalog) + catalog.create_namespace(from_namespace) + catalog.create_namespace(to_namespace) + table = catalog.create_table(from_table_identifier, table_schema_nested) + assert table.identifier == (catalog.name,) + from_table_identifier_nocatalog + new_table = catalog.create_table(to_table_identifier, table_schema_nested) + assert new_table.identifier == (catalog.name,) + to_table_identifier_nocatalog with pytest.raises(TableAlreadyExistsError): - catalog.rename_table(random_identifier, another_random_identifier) + catalog.rename_table(from_table_identifier, to_table_identifier) @pytest.mark.parametrize( @@ -570,11 +810,28 @@ def test_rename_table_to_existing_one( lazy_fixture('catalog_sqlite_without_rowcount'), ], ) -def test_rename_missing_table(catalog: SqlCatalog, random_identifier: Identifier, another_random_identifier: Identifier) -> None: - to_database_name, _to_table_name = another_random_identifier - catalog.create_namespace(to_database_name) +@pytest.mark.parametrize( + "from_table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + 
lazy_fixture("random_table_identifier_with_catalog"), + ], +) +@pytest.mark.parametrize( + "to_table_identifier", + [ + lazy_fixture("another_random_table_identifier"), + lazy_fixture("another_random_hierarchical_identifier"), + lazy_fixture("another_random_table_identifier_with_catalog"), + ], +) +def test_rename_missing_table(catalog: SqlCatalog, from_table_identifier: Identifier, to_table_identifier: Identifier) -> None: + to_table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(to_table_identifier) + to_namespace = Catalog.namespace_from(to_table_identifier_nocatalog) + catalog.create_namespace(to_namespace) with pytest.raises(NoSuchTableError): - catalog.rename_table(random_identifier, another_random_identifier) + catalog.rename_table(from_table_identifier, to_table_identifier) @pytest.mark.parametrize( @@ -585,15 +842,32 @@ def test_rename_missing_table(catalog: SqlCatalog, random_identifier: Identifier lazy_fixture('catalog_sqlite_without_rowcount'), ], ) +@pytest.mark.parametrize( + "from_table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +@pytest.mark.parametrize( + "to_table_identifier", + [ + lazy_fixture("another_random_table_identifier"), + lazy_fixture("another_random_hierarchical_identifier"), + lazy_fixture("another_random_table_identifier_with_catalog"), + ], +) def test_rename_table_to_missing_namespace( - catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier, another_random_identifier: Identifier + catalog: SqlCatalog, table_schema_nested: Schema, from_table_identifier: Identifier, to_table_identifier: Identifier ) -> None: - from_database_name, _from_table_name = random_identifier - catalog.create_namespace(from_database_name) - table = catalog.create_table(random_identifier, table_schema_nested) - assert table.identifier == (catalog.name,) + random_identifier + from_table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(from_table_identifier) + from_namespace = Catalog.namespace_from(from_table_identifier_nocatalog) + catalog.create_namespace(from_namespace) + table = catalog.create_table(from_table_identifier, table_schema_nested) + assert table.identifier == (catalog.name,) + from_table_identifier_nocatalog with pytest.raises(NoSuchNamespaceError): - catalog.rename_table(random_identifier, another_random_identifier) + catalog.rename_table(from_table_identifier, to_table_identifier) @pytest.mark.parametrize( @@ -603,22 +877,40 @@ def test_rename_table_to_missing_namespace( lazy_fixture('catalog_sqlite'), ], ) +@pytest.mark.parametrize( + "table_identifier_1", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +@pytest.mark.parametrize( + "table_identifier_2", + [ + lazy_fixture("another_random_table_identifier"), + lazy_fixture("another_random_hierarchical_identifier"), + lazy_fixture("another_random_table_identifier_with_catalog"), + ], +) def test_list_tables( - catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier, another_random_identifier: Identifier + catalog: SqlCatalog, table_schema_nested: Schema, table_identifier_1: Identifier, table_identifier_2: Identifier ) -> None: - database_name_1, _table_name_1 = random_identifier - database_name_2, _table_name_2 = another_random_identifier - catalog.create_namespace(database_name_1) - 
catalog.create_namespace(database_name_2) - catalog.create_table(random_identifier, table_schema_nested) - catalog.create_table(another_random_identifier, table_schema_nested) - identifier_list = catalog.list_tables(database_name_1) + table_identifier_1_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier_1) + table_identifier_2_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier_2) + namespace_1 = Catalog.namespace_from(table_identifier_1_nocatalog) + namespace_2 = Catalog.namespace_from(table_identifier_2_nocatalog) + catalog.create_namespace(namespace_1) + catalog.create_namespace(namespace_2) + catalog.create_table(table_identifier_1, table_schema_nested) + catalog.create_table(table_identifier_2, table_schema_nested) + identifier_list = catalog.list_tables(namespace_1) assert len(identifier_list) == 1 - assert random_identifier in identifier_list + assert table_identifier_1_nocatalog in identifier_list - identifier_list = catalog.list_tables(database_name_2) + identifier_list = catalog.list_tables(namespace_2) assert len(identifier_list) == 1 - assert another_random_identifier in identifier_list + assert table_identifier_2_nocatalog in identifier_list @pytest.mark.parametrize( @@ -628,9 +920,10 @@ def test_list_tables( lazy_fixture('catalog_sqlite'), ], ) -def test_create_namespace(catalog: SqlCatalog, database_name: str) -> None: - catalog.create_namespace(database_name) - assert (database_name,) in catalog.list_namespaces() +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) +def test_list_tables_when_missing_namespace(catalog: SqlCatalog, namespace: str) -> None: + with pytest.raises(NoSuchNamespaceError): + catalog.list_tables(namespace) @pytest.mark.parametrize( @@ -654,10 +947,24 @@ def test_create_namespace_if_not_exists(catalog: SqlCatalog, database_name: str) lazy_fixture('catalog_sqlite'), ], ) -def test_create_duplicate_namespace(catalog: SqlCatalog, database_name: str) -> None: - catalog.create_namespace(database_name) +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) +def test_create_namespace(catalog: SqlCatalog, namespace: str) -> None: + catalog.create_namespace(namespace) + assert (Catalog.identifier_to_tuple(namespace)) in catalog.list_namespaces() + + +@pytest.mark.parametrize( + 'catalog', + [ + lazy_fixture('catalog_memory'), + lazy_fixture('catalog_sqlite'), + ], +) +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) +def test_create_duplicate_namespace(catalog: SqlCatalog, namespace: str) -> None: + catalog.create_namespace(namespace) with pytest.raises(NamespaceAlreadyExistsError): - catalog.create_namespace(database_name) + catalog.create_namespace(namespace) @pytest.mark.parametrize( @@ -667,10 +974,11 @@ def test_create_duplicate_namespace(catalog: SqlCatalog, database_name: str) -> lazy_fixture('catalog_sqlite'), ], ) -def test_create_namespaces_sharing_same_prefix(catalog: SqlCatalog, database_name: str) -> None: - catalog.create_namespace(database_name + "_1") +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) +def test_create_namespaces_sharing_same_prefix(catalog: SqlCatalog, namespace: str) -> None: + catalog.create_namespace(namespace + "_1") # Second namespace is a prefix of the first one, make sure it can be added. 
- catalog.create_namespace(database_name) + catalog.create_namespace(namespace) @pytest.mark.parametrize( @@ -680,16 +988,17 @@ def test_create_namespaces_sharing_same_prefix(catalog: SqlCatalog, database_nam lazy_fixture('catalog_sqlite'), ], ) -def test_create_namespace_with_comment_and_location(catalog: SqlCatalog, database_name: str) -> None: +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) +def test_create_namespace_with_comment_and_location(catalog: SqlCatalog, namespace: str) -> None: test_location = "/test/location" test_properties = { "comment": "this is a test description", "location": test_location, } - catalog.create_namespace(namespace=database_name, properties=test_properties) + catalog.create_namespace(namespace=namespace, properties=test_properties) loaded_database_list = catalog.list_namespaces() - assert (database_name,) in loaded_database_list - properties = catalog.load_namespace_properties(database_name) + assert Catalog.identifier_to_tuple(namespace) in loaded_database_list + properties = catalog.load_namespace_properties(namespace) assert properties["comment"] == "this is a test description" assert properties["location"] == test_location @@ -701,13 +1010,27 @@ def test_create_namespace_with_comment_and_location(catalog: SqlCatalog, databas lazy_fixture('catalog_sqlite'), ], ) +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) @pytest.mark.filterwarnings("ignore") -def test_create_namespace_with_null_properties(catalog: SqlCatalog, database_name: str) -> None: +def test_create_namespace_with_null_properties(catalog: SqlCatalog, namespace: str) -> None: with pytest.raises(IntegrityError): - catalog.create_namespace(namespace=database_name, properties={None: "value"}) # type: ignore + catalog.create_namespace(namespace=namespace, properties={None: "value"}) # type: ignore with pytest.raises(IntegrityError): - catalog.create_namespace(namespace=database_name, properties={"key": None}) + catalog.create_namespace(namespace=namespace, properties={"key": None}) + + +@pytest.mark.parametrize( + 'catalog', + [ + lazy_fixture('catalog_memory'), + lazy_fixture('catalog_sqlite'), + ], +) +@pytest.mark.parametrize("empty_namespace", ["", (), (""), ("", ""), " ", (" ")]) +def test_create_namespace_with_empty_identifier(catalog: SqlCatalog, empty_namespace: Any) -> None: + with pytest.raises(NoSuchNamespaceError): + catalog.create_namespace(empty_namespace) @pytest.mark.parametrize( @@ -717,13 +1040,17 @@ def test_create_namespace_with_null_properties(catalog: SqlCatalog, database_nam lazy_fixture('catalog_sqlite'), ], ) -def test_list_namespaces(catalog: SqlCatalog, database_list: List[str]) -> None: - for database_name in database_list: - catalog.create_namespace(database_name) - db_list = catalog.list_namespaces() - for database_name in database_list: - assert (database_name,) in db_list - assert len(catalog.list_namespaces(database_name)) == 1 +@pytest.mark.parametrize("namespace_list", [lazy_fixture("database_list"), lazy_fixture("hierarchical_namespace_list")]) +def test_list_namespaces(catalog: SqlCatalog, namespace_list: List[str]) -> None: + for namespace in namespace_list: + catalog.create_namespace(namespace) + # Test global list + ns_list = catalog.list_namespaces() + for namespace in namespace_list: + assert Catalog.identifier_to_tuple(namespace) in ns_list + # Test individual namespace list + assert len(one_namespace := 
catalog.list_namespaces(namespace)) == 1 + assert Catalog.identifier_to_tuple(namespace) == one_namespace[0] @pytest.mark.parametrize( @@ -745,16 +1072,25 @@ def test_list_non_existing_namespaces(catalog: SqlCatalog) -> None: lazy_fixture('catalog_sqlite'), ], ) -def test_drop_namespace(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, table_name = random_identifier - catalog.create_namespace(database_name) - assert (database_name,) in catalog.list_namespaces() - catalog.create_table((database_name, table_name), table_schema_nested) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_drop_namespace(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + assert namespace in catalog.list_namespaces() + catalog.create_table(table_identifier, table_schema_nested) with pytest.raises(NamespaceNotEmptyError): - catalog.drop_namespace(database_name) - catalog.drop_table((database_name, table_name)) - catalog.drop_namespace(database_name) - assert (database_name,) not in catalog.list_namespaces() + catalog.drop_namespace(namespace) + catalog.drop_table(table_identifier) + catalog.drop_namespace(namespace) + assert namespace not in catalog.list_namespaces() @pytest.mark.parametrize( @@ -764,18 +1100,19 @@ def test_drop_namespace(catalog: SqlCatalog, table_schema_nested: Schema, random lazy_fixture('catalog_sqlite'), ], ) -def test_load_namespace_properties(catalog: SqlCatalog, database_name: str) -> None: +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) +def test_load_namespace_properties(catalog: SqlCatalog, namespace: str) -> None: warehouse_location = "/test/location" test_properties = { "comment": "this is a test description", - "location": f"{warehouse_location}/{database_name}.db", + "location": f"{warehouse_location}/{namespace}.db", "test_property1": "1", "test_property2": "2", "test_property3": "3", } - catalog.create_namespace(database_name, test_properties) - listed_properties = catalog.load_namespace_properties(database_name) + catalog.create_namespace(namespace, test_properties) + listed_properties = catalog.load_namespace_properties(namespace) for k, v in listed_properties.items(): assert k in test_properties assert v == test_properties[k] @@ -788,9 +1125,10 @@ def test_load_namespace_properties(catalog: SqlCatalog, database_name: str) -> N lazy_fixture('catalog_sqlite'), ], ) -def test_load_empty_namespace_properties(catalog: SqlCatalog, database_name: str) -> None: - catalog.create_namespace(database_name) - listed_properties = catalog.load_namespace_properties(database_name) +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) +def test_load_empty_namespace_properties(catalog: SqlCatalog, namespace: str) -> None: + catalog.create_namespace(namespace) + listed_properties = catalog.load_namespace_properties(namespace) assert listed_properties == {"exists": "true"} @@ -813,19 +1151,20 @@ def test_load_namespace_properties_non_existing_namespace(catalog: SqlCatalog) - lazy_fixture('catalog_sqlite'), ], ) -def 
test_update_namespace_properties(catalog: SqlCatalog, database_name: str) -> None: +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) +def test_update_namespace_properties(catalog: SqlCatalog, namespace: str) -> None: warehouse_location = "/test/location" test_properties = { "comment": "this is a test description", - "location": f"{warehouse_location}/{database_name}.db", + "location": f"{warehouse_location}/{namespace}.db", "test_property1": "1", "test_property2": "2", "test_property3": "3", } removals = {"test_property1", "test_property2", "test_property3", "should_not_removed"} updates = {"test_property4": "4", "test_property5": "5", "comment": "updated test description"} - catalog.create_namespace(database_name, test_properties) - update_report = catalog.update_namespace_properties(database_name, removals, updates) + catalog.create_namespace(namespace, test_properties) + update_report = catalog.update_namespace_properties(namespace, removals, updates) for k in updates.keys(): assert k in update_report.updated for k in removals: @@ -833,7 +1172,7 @@ def test_update_namespace_properties(catalog: SqlCatalog, database_name: str) -> assert k in update_report.missing else: assert k in update_report.removed - assert "updated test description" == catalog.load_namespace_properties(database_name)["comment"] + assert "updated test description" == catalog.load_namespace_properties(namespace)["comment"] @pytest.mark.parametrize( @@ -844,10 +1183,19 @@ def test_update_namespace_properties(catalog: SqlCatalog, database_name: str) -> lazy_fixture('catalog_sqlite_without_rowcount'), ], ) -def test_commit_table(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, table_schema_nested) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_commit_table(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, table_schema_nested) assert catalog._parse_metadata_version(table.metadata_location) == 0 assert table.metadata.current_schema_id == 0 @@ -878,10 +1226,19 @@ def test_commit_table(catalog: SqlCatalog, table_schema_nested: Schema, random_i lazy_fixture('catalog_sqlite_fsspec'), ], ) -def test_append_table(catalog: SqlCatalog, table_schema_simple: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, table_schema_simple) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_append_table(catalog: SqlCatalog, table_schema_simple: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + 
catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, table_schema_simple) df = pa.Table.from_pydict( { @@ -918,11 +1275,20 @@ def test_append_table(catalog: SqlCatalog, table_schema_simple: Schema, random_i lazy_fixture('catalog_sqlite_without_rowcount'), ], ) -def test_concurrent_commit_table(catalog: SqlCatalog, table_schema_simple: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table_a = catalog.create_table(random_identifier, table_schema_simple) - table_b = catalog.load_table(random_identifier) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_concurrent_commit_table(catalog: SqlCatalog, table_schema_simple: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table_a = catalog.create_table(table_identifier, table_schema_simple) + table_b = catalog.load_table(table_identifier) with table_a.update_schema() as update: update.add_column(path="b", field_type=IntegerType()) @@ -992,12 +1358,21 @@ def test_write_and_evolve(catalog: SqlCatalog, format_version: int) -> None: lazy_fixture('catalog_sqlite_without_rowcount'), ], ) -def test_table_properties_int_value(catalog: SqlCatalog, table_schema_simple: Schema, random_identifier: Identifier) -> None: +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_table_properties_int_value(catalog: SqlCatalog, table_schema_simple: Schema, table_identifier: Identifier) -> None: # table properties can be set to int, but still serialized to string - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) property_with_int = {"property_name": 42} - table = catalog.create_table(random_identifier, table_schema_simple, properties=property_with_int) + table = catalog.create_table(table_identifier, table_schema_simple, properties=property_with_int) assert isinstance(table.properties["property_name"], str) @@ -1009,14 +1384,23 @@ def test_table_properties_int_value(catalog: SqlCatalog, table_schema_simple: Sc lazy_fixture('catalog_sqlite_without_rowcount'), ], ) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) def test_table_properties_raise_for_none_value( - catalog: SqlCatalog, table_schema_simple: Schema, random_identifier: Identifier + catalog: SqlCatalog, table_schema_simple: Schema, table_identifier: Identifier ) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) property_with_none = 
{"property_name": None} with pytest.raises(ValidationError) as exc_info: - _ = catalog.create_table(random_identifier, table_schema_simple, properties=property_with_none) + _ = catalog.create_table(table_identifier, table_schema_simple, properties=property_with_none) assert "None type is not a supported value in properties: property_name" in str(exc_info.value) @@ -1027,11 +1411,20 @@ def test_table_properties_raise_for_none_value( lazy_fixture('catalog_sqlite'), ], ) -def test_table_exists(catalog: SqlCatalog, table_schema_simple: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - catalog.create_table(random_identifier, table_schema_simple, properties={"format-version": "2"}) - existing_table = random_identifier +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_table_exists(catalog: SqlCatalog, table_schema_simple: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + catalog.create_table(table_identifier, table_schema_simple, properties={"format-version": "2"}) + existing_table = table_identifier # Act and Assert for an existing table assert catalog.table_exists(existing_table) is True diff --git a/tests/conftest.py b/tests/conftest.py index 6679543694..4baefafef4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1878,6 +1878,19 @@ def database_list(database_name: str) -> List[str]: return [f"{database_name}_{idx}" for idx in range(NUM_TABLES)] +@pytest.fixture() +def hierarchical_namespace_name() -> str: + prefix = "my_iceberg_ns-" + random_tag1 = "".join(choice(string.ascii_letters) for _ in range(RANDOM_LENGTH)) + random_tag2 = "".join(choice(string.ascii_letters) for _ in range(RANDOM_LENGTH)) + return ".".join([prefix + random_tag1, prefix + random_tag2]).lower() + + +@pytest.fixture() +def hierarchical_namespace_list(hierarchical_namespace_name: str) -> List[str]: + return [f"{hierarchical_namespace_name}_{idx}" for idx in range(NUM_TABLES)] + + BUCKET_NAME = "test_bucket" TABLE_METADATA_LOCATION_REGEX = re.compile( r"""s3://test_bucket/my_iceberg_database-[a-z]{20}.db/ From 4fb8ba24290692dee02ec39e4b7480d75105e220 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 29 May 2024 06:54:45 +0200 Subject: [PATCH 59/80] Bump coverage from 7.5.2 to 7.5.3 (#776) Bumps [coverage](https://github.com/nedbat/coveragepy) from 7.5.2 to 7.5.3. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.5.2...7.5.3) --- updated-dependencies: - dependency-name: coverage dependency-type: direct:development update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 106 ++++++++++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/poetry.lock b/poetry.lock index 4ef706c873..7e413b58df 100644 --- a/poetry.lock +++ b/poetry.lock @@ -652,63 +652,63 @@ files = [ [[package]] name = "coverage" -version = "7.5.2" +version = "7.5.3" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.8" files = [ - {file = "coverage-7.5.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:554c7327bf0fd688050348e22db7c8e163fb7219f3ecdd4732d7ed606b417263"}, - {file = "coverage-7.5.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d0305e02e40c7cfea5d08d6368576537a74c0eea62b77633179748d3519d6705"}, - {file = "coverage-7.5.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:829fb55ad437d757c70d5b1c51cfda9377f31506a0a3f3ac282bc6a387d6a5f1"}, - {file = "coverage-7.5.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:894b1acded706f1407a662d08e026bfd0ff1e59e9bd32062fea9d862564cfb65"}, - {file = "coverage-7.5.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe76d6dee5e4febefa83998b17926df3a04e5089e3d2b1688c74a9157798d7a2"}, - {file = "coverage-7.5.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c7ebf2a37e4f5fea3c1a11e1f47cea7d75d0f2d8ef69635ddbd5c927083211fc"}, - {file = "coverage-7.5.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:20e611fc36e1a0fc7bbf957ef9c635c8807d71fbe5643e51b2769b3cc0fb0b51"}, - {file = "coverage-7.5.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7c5c5b7ae2763533152880d5b5b451acbc1089ade2336b710a24b2b0f5239d20"}, - {file = "coverage-7.5.2-cp310-cp310-win32.whl", hash = "sha256:1e4225990a87df898e40ca31c9e830c15c2c53b1d33df592bc8ef314d71f0281"}, - {file = "coverage-7.5.2-cp310-cp310-win_amd64.whl", hash = "sha256:976cd92d9420e6e2aa6ce6a9d61f2b490e07cb468968adf371546b33b829284b"}, - {file = "coverage-7.5.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5997d418c219dcd4dcba64e50671cca849aaf0dac3d7a2eeeb7d651a5bd735b8"}, - {file = "coverage-7.5.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ec27e93bbf5976f0465e8936f02eb5add99bbe4e4e7b233607e4d7622912d68d"}, - {file = "coverage-7.5.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f11f98753800eb1ec872562a398081f6695f91cd01ce39819e36621003ec52a"}, - {file = "coverage-7.5.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e34680049eecb30b6498784c9637c1c74277dcb1db75649a152f8004fbd6646"}, - {file = "coverage-7.5.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e12536446ad4527ac8ed91d8a607813085683bcce27af69e3b31cd72b3c5960"}, - {file = "coverage-7.5.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:3d3f7744b8a8079d69af69d512e5abed4fb473057625588ce126088e50d05493"}, - {file = "coverage-7.5.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:431a3917e32223fcdb90b79fe60185864a9109631ebc05f6c5aa03781a00b513"}, - {file = "coverage-7.5.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a7c6574225f34ce45466f04751d957b5c5e6b69fca9351db017c9249786172ce"}, - {file = "coverage-7.5.2-cp311-cp311-win32.whl", hash = 
"sha256:2b144d142ec9987276aeff1326edbc0df8ba4afbd7232f0ca10ad57a115e95b6"}, - {file = "coverage-7.5.2-cp311-cp311-win_amd64.whl", hash = "sha256:900532713115ac58bc3491b9d2b52704a05ed408ba0918d57fd72c94bc47fba1"}, - {file = "coverage-7.5.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9a42970ce74c88bdf144df11c52c5cf4ad610d860de87c0883385a1c9d9fa4ab"}, - {file = "coverage-7.5.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26716a1118c6ce2188283b4b60a898c3be29b480acbd0a91446ced4fe4e780d8"}, - {file = "coverage-7.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60b66b0363c5a2a79fba3d1cd7430c25bbd92c923d031cae906bdcb6e054d9a2"}, - {file = "coverage-7.5.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5d22eba19273b2069e4efeff88c897a26bdc64633cbe0357a198f92dca94268"}, - {file = "coverage-7.5.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3bb5b92a0ab3d22dfdbfe845e2fef92717b067bdf41a5b68c7e3e857c0cff1a4"}, - {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1aef719b6559b521ae913ddeb38f5048c6d1a3d366865e8b320270b7bc4693c2"}, - {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8809c0ea0e8454f756e3bd5c36d04dddf222989216788a25bfd6724bfcee342c"}, - {file = "coverage-7.5.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1acc2e2ef098a1d4bf535758085f508097316d738101a97c3f996bccba963ea5"}, - {file = "coverage-7.5.2-cp312-cp312-win32.whl", hash = "sha256:97de509043d3f0f2b2cd171bdccf408f175c7f7a99d36d566b1ae4dd84107985"}, - {file = "coverage-7.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:8941e35a0e991a7a20a1fa3e3182f82abe357211f2c335a9e6007067c3392fcf"}, - {file = "coverage-7.5.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5662bf0f6fb6757f5c2d6279c541a5af55a39772c2362ed0920b27e3ce0e21f7"}, - {file = "coverage-7.5.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3d9c62cff2ffb4c2a95328488fd7aa96a7a4b34873150650fe76b19c08c9c792"}, - {file = "coverage-7.5.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:74eeaa13e8200ad72fca9c5f37395fb310915cec6f1682b21375e84fd9770e84"}, - {file = "coverage-7.5.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f29bf497d51a5077994b265e976d78b09d9d0dff6ca5763dbb4804534a5d380"}, - {file = "coverage-7.5.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f96aa94739593ae0707eda9813ce363a0a0374a810ae0eced383340fc4a1f73"}, - {file = "coverage-7.5.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:51b6cee539168a912b4b3b040e4042b9e2c9a7ad9c8546c09e4eaeff3eacba6b"}, - {file = "coverage-7.5.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:59a75e6aa5c25b50b5a1499f9718f2edff54257f545718c4fb100f48d570ead4"}, - {file = "coverage-7.5.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:29da75ce20cb0a26d60e22658dd3230713c6c05a3465dd8ad040ffc991aea318"}, - {file = "coverage-7.5.2-cp38-cp38-win32.whl", hash = "sha256:23f2f16958b16152b43a39a5ecf4705757ddd284b3b17a77da3a62aef9c057ef"}, - {file = "coverage-7.5.2-cp38-cp38-win_amd64.whl", hash = "sha256:9e41c94035e5cdb362beed681b58a707e8dc29ea446ea1713d92afeded9d1ddd"}, - {file = "coverage-7.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:06d96b9b19bbe7f049c2be3c4f9e06737ec6d8ef8933c7c3a4c557ef07936e46"}, - {file = 
"coverage-7.5.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:878243e1206828908a6b4a9ca7b1aa8bee9eb129bf7186fc381d2646f4524ce9"}, - {file = "coverage-7.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:482df956b055d3009d10fce81af6ffab28215d7ed6ad4a15e5c8e67cb7c5251c"}, - {file = "coverage-7.5.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a35c97af60a5492e9e89f8b7153fe24eadfd61cb3a2fb600df1a25b5dab34b7e"}, - {file = "coverage-7.5.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24bb4c7859a3f757a116521d4d3a8a82befad56ea1bdacd17d6aafd113b0071e"}, - {file = "coverage-7.5.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:e1046aab24c48c694f0793f669ac49ea68acde6a0798ac5388abe0a5615b5ec8"}, - {file = "coverage-7.5.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:448ec61ea9ea7916d5579939362509145caaecf03161f6f13e366aebb692a631"}, - {file = "coverage-7.5.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4a00bd5ba8f1a4114720bef283cf31583d6cb1c510ce890a6da6c4268f0070b7"}, - {file = "coverage-7.5.2-cp39-cp39-win32.whl", hash = "sha256:9f805481d5eff2a96bac4da1570ef662bf970f9a16580dc2c169c8c3183fa02b"}, - {file = "coverage-7.5.2-cp39-cp39-win_amd64.whl", hash = "sha256:2c79f058e7bec26b5295d53b8c39ecb623448c74ccc8378631f5cb5c16a7e02c"}, - {file = "coverage-7.5.2-pp38.pp39.pp310-none-any.whl", hash = "sha256:40dbb8e7727560fe8ab65efcddfec1ae25f30ef02e2f2e5d78cfb52a66781ec5"}, - {file = "coverage-7.5.2.tar.gz", hash = "sha256:13017a63b0e499c59b5ba94a8542fb62864ba3016127d1e4ef30d354fc2b00e9"}, + {file = "coverage-7.5.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a6519d917abb15e12380406d721e37613e2a67d166f9fb7e5a8ce0375744cd45"}, + {file = "coverage-7.5.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:aea7da970f1feccf48be7335f8b2ca64baf9b589d79e05b9397a06696ce1a1ec"}, + {file = "coverage-7.5.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:923b7b1c717bd0f0f92d862d1ff51d9b2b55dbbd133e05680204465f454bb286"}, + {file = "coverage-7.5.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62bda40da1e68898186f274f832ef3e759ce929da9a9fd9fcf265956de269dbc"}, + {file = "coverage-7.5.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d8b7339180d00de83e930358223c617cc343dd08e1aa5ec7b06c3a121aec4e1d"}, + {file = "coverage-7.5.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:25a5caf742c6195e08002d3b6c2dd6947e50efc5fc2c2205f61ecb47592d2d83"}, + {file = "coverage-7.5.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:05ac5f60faa0c704c0f7e6a5cbfd6f02101ed05e0aee4d2822637a9e672c998d"}, + {file = "coverage-7.5.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:239a4e75e09c2b12ea478d28815acf83334d32e722e7433471fbf641c606344c"}, + {file = "coverage-7.5.3-cp310-cp310-win32.whl", hash = "sha256:a5812840d1d00eafae6585aba38021f90a705a25b8216ec7f66aebe5b619fb84"}, + {file = "coverage-7.5.3-cp310-cp310-win_amd64.whl", hash = "sha256:33ca90a0eb29225f195e30684ba4a6db05dbef03c2ccd50b9077714c48153cac"}, + {file = "coverage-7.5.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f81bc26d609bf0fbc622c7122ba6307993c83c795d2d6f6f6fd8c000a770d974"}, + {file = "coverage-7.5.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7cec2af81f9e7569280822be68bd57e51b86d42e59ea30d10ebdbb22d2cb7232"}, + {file 
= "coverage-7.5.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55f689f846661e3f26efa535071775d0483388a1ccfab899df72924805e9e7cd"}, + {file = "coverage-7.5.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50084d3516aa263791198913a17354bd1dc627d3c1639209640b9cac3fef5807"}, + {file = "coverage-7.5.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:341dd8f61c26337c37988345ca5c8ccabeff33093a26953a1ac72e7d0103c4fb"}, + {file = "coverage-7.5.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ab0b028165eea880af12f66086694768f2c3139b2c31ad5e032c8edbafca6ffc"}, + {file = "coverage-7.5.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:5bc5a8c87714b0c67cfeb4c7caa82b2d71e8864d1a46aa990b5588fa953673b8"}, + {file = "coverage-7.5.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:38a3b98dae8a7c9057bd91fbf3415c05e700a5114c5f1b5b0ea5f8f429ba6614"}, + {file = "coverage-7.5.3-cp311-cp311-win32.whl", hash = "sha256:fcf7d1d6f5da887ca04302db8e0e0cf56ce9a5e05f202720e49b3e8157ddb9a9"}, + {file = "coverage-7.5.3-cp311-cp311-win_amd64.whl", hash = "sha256:8c836309931839cca658a78a888dab9676b5c988d0dd34ca247f5f3e679f4e7a"}, + {file = "coverage-7.5.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:296a7d9bbc598e8744c00f7a6cecf1da9b30ae9ad51c566291ff1314e6cbbed8"}, + {file = "coverage-7.5.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:34d6d21d8795a97b14d503dcaf74226ae51eb1f2bd41015d3ef332a24d0a17b3"}, + {file = "coverage-7.5.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e317953bb4c074c06c798a11dbdd2cf9979dbcaa8ccc0fa4701d80042d4ebf1"}, + {file = "coverage-7.5.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705f3d7c2b098c40f5b81790a5fedb274113373d4d1a69e65f8b68b0cc26f6db"}, + {file = "coverage-7.5.3-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1196e13c45e327d6cd0b6e471530a1882f1017eb83c6229fc613cd1a11b53cd"}, + {file = "coverage-7.5.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:015eddc5ccd5364dcb902eaecf9515636806fa1e0d5bef5769d06d0f31b54523"}, + {file = "coverage-7.5.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:fd27d8b49e574e50caa65196d908f80e4dff64d7e592d0c59788b45aad7e8b35"}, + {file = "coverage-7.5.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:33fc65740267222fc02975c061eb7167185fef4cc8f2770267ee8bf7d6a42f84"}, + {file = "coverage-7.5.3-cp312-cp312-win32.whl", hash = "sha256:7b2a19e13dfb5c8e145c7a6ea959485ee8e2204699903c88c7d25283584bfc08"}, + {file = "coverage-7.5.3-cp312-cp312-win_amd64.whl", hash = "sha256:0bbddc54bbacfc09b3edaec644d4ac90c08ee8ed4844b0f86227dcda2d428fcb"}, + {file = "coverage-7.5.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f78300789a708ac1f17e134593f577407d52d0417305435b134805c4fb135adb"}, + {file = "coverage-7.5.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b368e1aee1b9b75757942d44d7598dcd22a9dbb126affcbba82d15917f0cc155"}, + {file = "coverage-7.5.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f836c174c3a7f639bded48ec913f348c4761cbf49de4a20a956d3431a7c9cb24"}, + {file = "coverage-7.5.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:244f509f126dc71369393ce5fea17c0592c40ee44e607b6d855e9c4ac57aac98"}, + {file = 
"coverage-7.5.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4c2872b3c91f9baa836147ca33650dc5c172e9273c808c3c3199c75490e709d"}, + {file = "coverage-7.5.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:dd4b3355b01273a56b20c219e74e7549e14370b31a4ffe42706a8cda91f19f6d"}, + {file = "coverage-7.5.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:f542287b1489c7a860d43a7d8883e27ca62ab84ca53c965d11dac1d3a1fab7ce"}, + {file = "coverage-7.5.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:75e3f4e86804023e991096b29e147e635f5e2568f77883a1e6eed74512659ab0"}, + {file = "coverage-7.5.3-cp38-cp38-win32.whl", hash = "sha256:c59d2ad092dc0551d9f79d9d44d005c945ba95832a6798f98f9216ede3d5f485"}, + {file = "coverage-7.5.3-cp38-cp38-win_amd64.whl", hash = "sha256:fa21a04112c59ad54f69d80e376f7f9d0f5f9123ab87ecd18fbb9ec3a2beed56"}, + {file = "coverage-7.5.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f5102a92855d518b0996eb197772f5ac2a527c0ec617124ad5242a3af5e25f85"}, + {file = "coverage-7.5.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d1da0a2e3b37b745a2b2a678a4c796462cf753aebf94edcc87dcc6b8641eae31"}, + {file = "coverage-7.5.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8383a6c8cefba1b7cecc0149415046b6fc38836295bc4c84e820872eb5478b3d"}, + {file = "coverage-7.5.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9aad68c3f2566dfae84bf46295a79e79d904e1c21ccfc66de88cd446f8686341"}, + {file = "coverage-7.5.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e079c9ec772fedbade9d7ebc36202a1d9ef7291bc9b3a024ca395c4d52853d7"}, + {file = "coverage-7.5.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bde997cac85fcac227b27d4fb2c7608a2c5f6558469b0eb704c5726ae49e1c52"}, + {file = "coverage-7.5.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:990fb20b32990b2ce2c5f974c3e738c9358b2735bc05075d50a6f36721b8f303"}, + {file = "coverage-7.5.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3d5a67f0da401e105753d474369ab034c7bae51a4c31c77d94030d59e41df5bd"}, + {file = "coverage-7.5.3-cp39-cp39-win32.whl", hash = "sha256:e08c470c2eb01977d221fd87495b44867a56d4d594f43739a8028f8646a51e0d"}, + {file = "coverage-7.5.3-cp39-cp39-win_amd64.whl", hash = "sha256:1d2a830ade66d3563bb61d1e3c77c8def97b30ed91e166c67d0632c018f380f0"}, + {file = "coverage-7.5.3-pp38.pp39.pp310-none-any.whl", hash = "sha256:3538d8fb1ee9bdd2e2692b3b18c22bb1c19ffbefd06880f5ac496e42d7bb3884"}, + {file = "coverage-7.5.3.tar.gz", hash = "sha256:04aefca5190d1dc7a53a4c1a5a7f8568811306d7a8ee231c42fb69215571944f"}, ] [package.dependencies] From ec8d7dc2bc8fff87e8192c4eef377945bd0e4015 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 29 May 2024 08:33:25 +0200 Subject: [PATCH 60/80] Bump pydantic from 2.7.1 to 2.7.2 (#775) Bumps [pydantic](https://github.com/pydantic/pydantic) from 2.7.1 to 2.7.2. - [Release notes](https://github.com/pydantic/pydantic/releases) - [Changelog](https://github.com/pydantic/pydantic/blob/main/HISTORY.md) - [Commits](https://github.com/pydantic/pydantic/compare/v2.7.1...v2.7.2) --- updated-dependencies: - dependency-name: pydantic dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 168 ++++++++++++++++++++++++++-------------------------- 1 file changed, 84 insertions(+), 84 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7e413b58df..35c0f9ee0d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3024,18 +3024,18 @@ files = [ [[package]] name = "pydantic" -version = "2.7.1" +version = "2.7.2" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic-2.7.1-py3-none-any.whl", hash = "sha256:e029badca45266732a9a79898a15ae2e8b14840b1eabbb25844be28f0b33f3d5"}, - {file = "pydantic-2.7.1.tar.gz", hash = "sha256:e9dbb5eada8abe4d9ae5f46b9939aead650cd2b68f249bb3a8139dbe125803cc"}, + {file = "pydantic-2.7.2-py3-none-any.whl", hash = "sha256:834ab954175f94e6e68258537dc49402c4a5e9d0409b9f1b86b7e934a8372de7"}, + {file = "pydantic-2.7.2.tar.gz", hash = "sha256:71b2945998f9c9b7919a45bde9a50397b289937d215ae141c1d0903ba7149fd7"}, ] [package.dependencies] annotated-types = ">=0.4.0" -pydantic-core = "2.18.2" +pydantic-core = "2.18.3" typing-extensions = ">=4.6.1" [package.extras] @@ -3043,90 +3043,90 @@ email = ["email-validator (>=2.0.0)"] [[package]] name = "pydantic-core" -version = "2.18.2" +version = "2.18.3" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic_core-2.18.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:9e08e867b306f525802df7cd16c44ff5ebbe747ff0ca6cf3fde7f36c05a59a81"}, - {file = "pydantic_core-2.18.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f0a21cbaa69900cbe1a2e7cad2aa74ac3cf21b10c3efb0fa0b80305274c0e8a2"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0680b1f1f11fda801397de52c36ce38ef1c1dc841a0927a94f226dea29c3ae3d"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:95b9d5e72481d3780ba3442eac863eae92ae43a5f3adb5b4d0a1de89d42bb250"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fcf5cd9c4b655ad666ca332b9a081112cd7a58a8b5a6ca7a3104bc950f2038"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b5155ff768083cb1d62f3e143b49a8a3432e6789a3abee8acd005c3c7af1c74"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:553ef617b6836fc7e4df130bb851e32fe357ce36336d897fd6646d6058d980af"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b89ed9eb7d616ef5714e5590e6cf7f23b02d0d539767d33561e3675d6f9e3857"}, - {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:75f7e9488238e920ab6204399ded280dc4c307d034f3924cd7f90a38b1829563"}, - {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ef26c9e94a8c04a1b2924149a9cb081836913818e55681722d7f29af88fe7b38"}, - {file = "pydantic_core-2.18.2-cp310-none-win32.whl", hash = "sha256:182245ff6b0039e82b6bb585ed55a64d7c81c560715d1bad0cbad6dfa07b4027"}, - {file = "pydantic_core-2.18.2-cp310-none-win_amd64.whl", hash = "sha256:e23ec367a948b6d812301afc1b13f8094ab7b2c280af66ef450efc357d2ae543"}, - {file = "pydantic_core-2.18.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = 
"sha256:219da3f096d50a157f33645a1cf31c0ad1fe829a92181dd1311022f986e5fbe3"}, - {file = "pydantic_core-2.18.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc1cfd88a64e012b74e94cd00bbe0f9c6df57049c97f02bb07d39e9c852e19a4"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05b7133a6e6aeb8df37d6f413f7705a37ab4031597f64ab56384c94d98fa0e90"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:224c421235f6102e8737032483f43c1a8cfb1d2f45740c44166219599358c2cd"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b14d82cdb934e99dda6d9d60dc84a24379820176cc4a0d123f88df319ae9c150"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2728b01246a3bba6de144f9e3115b532ee44bd6cf39795194fb75491824a1413"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:470b94480bb5ee929f5acba6995251ada5e059a5ef3e0dfc63cca287283ebfa6"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:997abc4df705d1295a42f95b4eec4950a37ad8ae46d913caeee117b6b198811c"}, - {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75250dbc5290e3f1a0f4618db35e51a165186f9034eff158f3d490b3fed9f8a0"}, - {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4456f2dca97c425231d7315737d45239b2b51a50dc2b6f0c2bb181fce6207664"}, - {file = "pydantic_core-2.18.2-cp311-none-win32.whl", hash = "sha256:269322dcc3d8bdb69f054681edff86276b2ff972447863cf34c8b860f5188e2e"}, - {file = "pydantic_core-2.18.2-cp311-none-win_amd64.whl", hash = "sha256:800d60565aec896f25bc3cfa56d2277d52d5182af08162f7954f938c06dc4ee3"}, - {file = "pydantic_core-2.18.2-cp311-none-win_arm64.whl", hash = "sha256:1404c69d6a676245199767ba4f633cce5f4ad4181f9d0ccb0577e1f66cf4c46d"}, - {file = "pydantic_core-2.18.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:fb2bd7be70c0fe4dfd32c951bc813d9fe6ebcbfdd15a07527796c8204bd36242"}, - {file = "pydantic_core-2.18.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6132dd3bd52838acddca05a72aafb6eab6536aa145e923bb50f45e78b7251043"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d904828195733c183d20a54230c0df0eb46ec746ea1a666730787353e87182"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c9bd70772c720142be1020eac55f8143a34ec9f82d75a8e7a07852023e46617f"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8ed04b3582771764538f7ee7001b02e1170223cf9b75dff0bc698fadb00cf3"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e6dac87ddb34aaec85f873d737e9d06a3555a1cc1a8e0c44b7f8d5daeb89d86f"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ca4ae5a27ad7a4ee5170aebce1574b375de390bc01284f87b18d43a3984df72"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:886eec03591b7cf058467a70a87733b35f44707bd86cf64a615584fd72488b7c"}, - {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ca7b0c1f1c983e064caa85f3792dd2fe3526b3505378874afa84baf662e12241"}, - {file = 
"pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b4356d3538c3649337df4074e81b85f0616b79731fe22dd11b99499b2ebbdf3"}, - {file = "pydantic_core-2.18.2-cp312-none-win32.whl", hash = "sha256:8b172601454f2d7701121bbec3425dd71efcb787a027edf49724c9cefc14c038"}, - {file = "pydantic_core-2.18.2-cp312-none-win_amd64.whl", hash = "sha256:b1bd7e47b1558ea872bd16c8502c414f9e90dcf12f1395129d7bb42a09a95438"}, - {file = "pydantic_core-2.18.2-cp312-none-win_arm64.whl", hash = "sha256:98758d627ff397e752bc339272c14c98199c613f922d4a384ddc07526c86a2ec"}, - {file = "pydantic_core-2.18.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:9fdad8e35f278b2c3eb77cbdc5c0a49dada440657bf738d6905ce106dc1de439"}, - {file = "pydantic_core-2.18.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1d90c3265ae107f91a4f279f4d6f6f1d4907ac76c6868b27dc7fb33688cfb347"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:390193c770399861d8df9670fb0d1874f330c79caaca4642332df7c682bf6b91"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:82d5d4d78e4448683cb467897fe24e2b74bb7b973a541ea1dcfec1d3cbce39fb"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4774f3184d2ef3e14e8693194f661dea5a4d6ca4e3dc8e39786d33a94865cefd"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4d938ec0adf5167cb335acb25a4ee69a8107e4984f8fbd2e897021d9e4ca21b"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0e8b1be28239fc64a88a8189d1df7fad8be8c1ae47fcc33e43d4be15f99cc70"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:868649da93e5a3d5eacc2b5b3b9235c98ccdbfd443832f31e075f54419e1b96b"}, - {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:78363590ef93d5d226ba21a90a03ea89a20738ee5b7da83d771d283fd8a56761"}, - {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:852e966fbd035a6468fc0a3496589b45e2208ec7ca95c26470a54daed82a0788"}, - {file = "pydantic_core-2.18.2-cp38-none-win32.whl", hash = "sha256:6a46e22a707e7ad4484ac9ee9f290f9d501df45954184e23fc29408dfad61350"}, - {file = "pydantic_core-2.18.2-cp38-none-win_amd64.whl", hash = "sha256:d91cb5ea8b11607cc757675051f61b3d93f15eca3cefb3e6c704a5d6e8440f4e"}, - {file = "pydantic_core-2.18.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ae0a8a797a5e56c053610fa7be147993fe50960fa43609ff2a9552b0e07013e8"}, - {file = "pydantic_core-2.18.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:042473b6280246b1dbf530559246f6842b56119c2926d1e52b631bdc46075f2a"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a388a77e629b9ec814c1b1e6b3b595fe521d2cdc625fcca26fbc2d44c816804"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25add29b8f3b233ae90ccef2d902d0ae0432eb0d45370fe315d1a5cf231004b"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f459a5ce8434614dfd39bbebf1041952ae01da6bed9855008cb33b875cb024c0"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eff2de745698eb46eeb51193a9f41d67d834d50e424aef27df2fcdee1b153845"}, - {file = 
"pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8309f67285bdfe65c372ea3722b7a5642680f3dba538566340a9d36e920b5f0"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f93a8a2e3938ff656a7c1bc57193b1319960ac015b6e87d76c76bf14fe0244b4"}, - {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:22057013c8c1e272eb8d0eebc796701167d8377441ec894a8fed1af64a0bf399"}, - {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfeecd1ac6cc1fb2692c3d5110781c965aabd4ec5d32799773ca7b1456ac636b"}, - {file = "pydantic_core-2.18.2-cp39-none-win32.whl", hash = "sha256:0d69b4c2f6bb3e130dba60d34c0845ba31b69babdd3f78f7c0c8fae5021a253e"}, - {file = "pydantic_core-2.18.2-cp39-none-win_amd64.whl", hash = "sha256:d9319e499827271b09b4e411905b24a426b8fb69464dfa1696258f53a3334641"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a1874c6dd4113308bd0eb568418e6114b252afe44319ead2b4081e9b9521fe75"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:ccdd111c03bfd3666bd2472b674c6899550e09e9f298954cfc896ab92b5b0e6d"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e18609ceaa6eed63753037fc06ebb16041d17d28199ae5aba0052c51449650a9"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e5c584d357c4e2baf0ff7baf44f4994be121e16a2c88918a5817331fc7599d7"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43f0f463cf89ace478de71a318b1b4f05ebc456a9b9300d027b4b57c1a2064fb"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e1b395e58b10b73b07b7cf740d728dd4ff9365ac46c18751bf8b3d8cca8f625a"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0098300eebb1c837271d3d1a2cd2911e7c11b396eac9661655ee524a7f10587b"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:36789b70d613fbac0a25bb07ab3d9dba4d2e38af609c020cf4d888d165ee0bf3"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f9a801e7c8f1ef8718da265bba008fa121243dfe37c1cea17840b0944dfd72c"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:3a6515ebc6e69d85502b4951d89131ca4e036078ea35533bb76327f8424531ce"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20aca1e2298c56ececfd8ed159ae4dde2df0781988c97ef77d5c16ff4bd5b400"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:223ee893d77a310a0391dca6df00f70bbc2f36a71a895cecd9a0e762dc37b349"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2334ce8c673ee93a1d6a65bd90327588387ba073c17e61bf19b4fd97d688d63c"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cbca948f2d14b09d20268cda7b0367723d79063f26c4ffc523af9042cad95592"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b3ef08e20ec49e02d5c6717a91bb5af9b20f1805583cb0adfe9ba2c6b505b5ae"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-win_amd64.whl", hash = 
"sha256:c6fdc8627910eed0c01aed6a390a252fe3ea6d472ee70fdde56273f198938374"}, - {file = "pydantic_core-2.18.2.tar.gz", hash = "sha256:2e29d20810dfc3043ee13ac7d9e25105799817683348823f305ab3f349b9386e"}, + {file = "pydantic_core-2.18.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:744697428fcdec6be5670460b578161d1ffe34743a5c15656be7ea82b008197c"}, + {file = "pydantic_core-2.18.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:37b40c05ced1ba4218b14986fe6f283d22e1ae2ff4c8e28881a70fb81fbfcda7"}, + {file = "pydantic_core-2.18.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:544a9a75622357076efb6b311983ff190fbfb3c12fc3a853122b34d3d358126c"}, + {file = "pydantic_core-2.18.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e2e253af04ceaebde8eb201eb3f3e3e7e390f2d275a88300d6a1959d710539e2"}, + {file = "pydantic_core-2.18.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:855ec66589c68aa367d989da5c4755bb74ee92ccad4fdb6af942c3612c067e34"}, + {file = "pydantic_core-2.18.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d3e42bb54e7e9d72c13ce112e02eb1b3b55681ee948d748842171201a03a98a"}, + {file = "pydantic_core-2.18.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6ac9ffccc9d2e69d9fba841441d4259cb668ac180e51b30d3632cd7abca2b9b"}, + {file = "pydantic_core-2.18.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c56eca1686539fa0c9bda992e7bd6a37583f20083c37590413381acfc5f192d6"}, + {file = "pydantic_core-2.18.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:17954d784bf8abfc0ec2a633108207ebc4fa2df1a0e4c0c3ccbaa9bb01d2c426"}, + {file = "pydantic_core-2.18.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:98ed737567d8f2ecd54f7c8d4f8572ca7c7921ede93a2e52939416170d357812"}, + {file = "pydantic_core-2.18.3-cp310-none-win32.whl", hash = "sha256:9f9e04afebd3ed8c15d67a564ed0a34b54e52136c6d40d14c5547b238390e779"}, + {file = "pydantic_core-2.18.3-cp310-none-win_amd64.whl", hash = "sha256:45e4ffbae34f7ae30d0047697e724e534a7ec0a82ef9994b7913a412c21462a0"}, + {file = "pydantic_core-2.18.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:b9ebe8231726c49518b16b237b9fe0d7d361dd221302af511a83d4ada01183ab"}, + {file = "pydantic_core-2.18.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b8e20e15d18bf7dbb453be78a2d858f946f5cdf06c5072453dace00ab652e2b2"}, + {file = "pydantic_core-2.18.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c0d9ff283cd3459fa0bf9b0256a2b6f01ac1ff9ffb034e24457b9035f75587cb"}, + {file = "pydantic_core-2.18.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2f7ef5f0ebb77ba24c9970da18b771711edc5feaf00c10b18461e0f5f5949231"}, + {file = "pydantic_core-2.18.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:73038d66614d2e5cde30435b5afdced2b473b4c77d4ca3a8624dd3e41a9c19be"}, + {file = "pydantic_core-2.18.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6afd5c867a74c4d314c557b5ea9520183fadfbd1df4c2d6e09fd0d990ce412cd"}, + {file = "pydantic_core-2.18.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd7df92f28d351bb9f12470f4c533cf03d1b52ec5a6e5c58c65b183055a60106"}, + {file = "pydantic_core-2.18.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:80aea0ffeb1049336043d07799eace1c9602519fb3192916ff525b0287b2b1e4"}, + {file = 
"pydantic_core-2.18.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:aaee40f25bba38132e655ffa3d1998a6d576ba7cf81deff8bfa189fb43fd2bbe"}, + {file = "pydantic_core-2.18.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9128089da8f4fe73f7a91973895ebf2502539d627891a14034e45fb9e707e26d"}, + {file = "pydantic_core-2.18.3-cp311-none-win32.whl", hash = "sha256:fec02527e1e03257aa25b1a4dcbe697b40a22f1229f5d026503e8b7ff6d2eda7"}, + {file = "pydantic_core-2.18.3-cp311-none-win_amd64.whl", hash = "sha256:58ff8631dbab6c7c982e6425da8347108449321f61fe427c52ddfadd66642af7"}, + {file = "pydantic_core-2.18.3-cp311-none-win_arm64.whl", hash = "sha256:3fc1c7f67f34c6c2ef9c213e0f2a351797cda98249d9ca56a70ce4ebcaba45f4"}, + {file = "pydantic_core-2.18.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f0928cde2ae416a2d1ebe6dee324709c6f73e93494d8c7aea92df99aab1fc40f"}, + {file = "pydantic_core-2.18.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0bee9bb305a562f8b9271855afb6ce00223f545de3d68560b3c1649c7c5295e9"}, + {file = "pydantic_core-2.18.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e862823be114387257dacbfa7d78547165a85d7add33b446ca4f4fae92c7ff5c"}, + {file = "pydantic_core-2.18.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6a36f78674cbddc165abab0df961b5f96b14461d05feec5e1f78da58808b97e7"}, + {file = "pydantic_core-2.18.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba905d184f62e7ddbb7a5a751d8a5c805463511c7b08d1aca4a3e8c11f2e5048"}, + {file = "pydantic_core-2.18.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7fdd362f6a586e681ff86550b2379e532fee63c52def1c666887956748eaa326"}, + {file = "pydantic_core-2.18.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24b214b7ee3bd3b865e963dbed0f8bc5375f49449d70e8d407b567af3222aae4"}, + {file = "pydantic_core-2.18.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:691018785779766127f531674fa82bb368df5b36b461622b12e176c18e119022"}, + {file = "pydantic_core-2.18.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:60e4c625e6f7155d7d0dcac151edf5858102bc61bf959d04469ca6ee4e8381bd"}, + {file = "pydantic_core-2.18.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a4e651e47d981c1b701dcc74ab8fec5a60a5b004650416b4abbef13db23bc7be"}, + {file = "pydantic_core-2.18.3-cp312-none-win32.whl", hash = "sha256:ffecbb5edb7f5ffae13599aec33b735e9e4c7676ca1633c60f2c606beb17efc5"}, + {file = "pydantic_core-2.18.3-cp312-none-win_amd64.whl", hash = "sha256:2c8333f6e934733483c7eddffdb094c143b9463d2af7e6bd85ebcb2d4a1b82c6"}, + {file = "pydantic_core-2.18.3-cp312-none-win_arm64.whl", hash = "sha256:7a20dded653e516a4655f4c98e97ccafb13753987434fe7cf044aa25f5b7d417"}, + {file = "pydantic_core-2.18.3-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:eecf63195be644b0396f972c82598cd15693550f0ff236dcf7ab92e2eb6d3522"}, + {file = "pydantic_core-2.18.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2c44efdd3b6125419c28821590d7ec891c9cb0dff33a7a78d9d5c8b6f66b9702"}, + {file = "pydantic_core-2.18.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e59fca51ffbdd1638b3856779342ed69bcecb8484c1d4b8bdb237d0eb5a45e2"}, + {file = "pydantic_core-2.18.3-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:70cf099197d6b98953468461d753563b28e73cf1eade2ffe069675d2657ed1d5"}, + {file = "pydantic_core-2.18.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash 
= "sha256:63081a49dddc6124754b32a3774331467bfc3d2bd5ff8f10df36a95602560361"}, + {file = "pydantic_core-2.18.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:370059b7883485c9edb9655355ff46d912f4b03b009d929220d9294c7fd9fd60"}, + {file = "pydantic_core-2.18.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a64faeedfd8254f05f5cf6fc755023a7e1606af3959cfc1a9285744cc711044"}, + {file = "pydantic_core-2.18.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:19d2e725de0f90d8671f89e420d36c3dd97639b98145e42fcc0e1f6d492a46dc"}, + {file = "pydantic_core-2.18.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:67bc078025d70ec5aefe6200ef094576c9d86bd36982df1301c758a9fff7d7f4"}, + {file = "pydantic_core-2.18.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:adf952c3f4100e203cbaf8e0c907c835d3e28f9041474e52b651761dc248a3c0"}, + {file = "pydantic_core-2.18.3-cp38-none-win32.whl", hash = "sha256:9a46795b1f3beb167eaee91736d5d17ac3a994bf2215a996aed825a45f897558"}, + {file = "pydantic_core-2.18.3-cp38-none-win_amd64.whl", hash = "sha256:200ad4e3133cb99ed82342a101a5abf3d924722e71cd581cc113fe828f727fbc"}, + {file = "pydantic_core-2.18.3-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:304378b7bf92206036c8ddd83a2ba7b7d1a5b425acafff637172a3aa72ad7083"}, + {file = "pydantic_core-2.18.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c826870b277143e701c9ccf34ebc33ddb4d072612683a044e7cce2d52f6c3fef"}, + {file = "pydantic_core-2.18.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e201935d282707394f3668380e41ccf25b5794d1b131cdd96b07f615a33ca4b1"}, + {file = "pydantic_core-2.18.3-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5560dda746c44b48bf82b3d191d74fe8efc5686a9ef18e69bdabccbbb9ad9442"}, + {file = "pydantic_core-2.18.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b32c2a1f8032570842257e4c19288eba9a2bba4712af542327de9a1204faff8"}, + {file = "pydantic_core-2.18.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:929c24e9dea3990bc8bcd27c5f2d3916c0c86f5511d2caa69e0d5290115344a9"}, + {file = "pydantic_core-2.18.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1a8376fef60790152564b0eab376b3e23dd6e54f29d84aad46f7b264ecca943"}, + {file = "pydantic_core-2.18.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dccf3ef1400390ddd1fb55bf0632209d39140552d068ee5ac45553b556780e06"}, + {file = "pydantic_core-2.18.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:41dbdcb0c7252b58fa931fec47937edb422c9cb22528f41cb8963665c372caf6"}, + {file = "pydantic_core-2.18.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:666e45cf071669fde468886654742fa10b0e74cd0fa0430a46ba6056b24fb0af"}, + {file = "pydantic_core-2.18.3-cp39-none-win32.whl", hash = "sha256:f9c08cabff68704a1b4667d33f534d544b8a07b8e5d039c37067fceb18789e78"}, + {file = "pydantic_core-2.18.3-cp39-none-win_amd64.whl", hash = "sha256:4afa5f5973e8572b5c0dcb4e2d4fda7890e7cd63329bd5cc3263a25c92ef0026"}, + {file = "pydantic_core-2.18.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:77319771a026f7c7d29c6ebc623de889e9563b7087911b46fd06c044a12aa5e9"}, + {file = "pydantic_core-2.18.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:df11fa992e9f576473038510d66dd305bcd51d7dd508c163a8c8fe148454e059"}, + {file = "pydantic_core-2.18.3-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:d531076bdfb65af593326ffd567e6ab3da145020dafb9187a1d131064a55f97c"}, + {file = "pydantic_core-2.18.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d33ce258e4e6e6038f2b9e8b8a631d17d017567db43483314993b3ca345dcbbb"}, + {file = "pydantic_core-2.18.3-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1f9cd7f5635b719939019be9bda47ecb56e165e51dd26c9a217a433e3d0d59a9"}, + {file = "pydantic_core-2.18.3-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cd4a032bb65cc132cae1fe3e52877daecc2097965cd3914e44fbd12b00dae7c5"}, + {file = "pydantic_core-2.18.3-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:82f2718430098bcdf60402136c845e4126a189959d103900ebabb6774a5d9fdb"}, + {file = "pydantic_core-2.18.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c0037a92cf0c580ed14e10953cdd26528e8796307bb8bb312dc65f71547df04d"}, + {file = "pydantic_core-2.18.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b95a0972fac2b1ff3c94629fc9081b16371dad870959f1408cc33b2f78ad347a"}, + {file = "pydantic_core-2.18.3-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a62e437d687cc148381bdd5f51e3e81f5b20a735c55f690c5be94e05da2b0d5c"}, + {file = "pydantic_core-2.18.3-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b367a73a414bbb08507da102dc2cde0fa7afe57d09b3240ce82a16d608a7679c"}, + {file = "pydantic_core-2.18.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ecce4b2360aa3f008da3327d652e74a0e743908eac306198b47e1c58b03dd2b"}, + {file = "pydantic_core-2.18.3-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bd4435b8d83f0c9561a2a9585b1de78f1abb17cb0cef5f39bf6a4b47d19bafe3"}, + {file = "pydantic_core-2.18.3-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:616221a6d473c5b9aa83fa8982745441f6a4a62a66436be9445c65f241b86c94"}, + {file = "pydantic_core-2.18.3-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:7e6382ce89a92bc1d0c0c5edd51e931432202b9080dc921d8d003e616402efd1"}, + {file = "pydantic_core-2.18.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ff58f379345603d940e461eae474b6bbb6dab66ed9a851ecd3cb3709bf4dcf6a"}, + {file = "pydantic_core-2.18.3.tar.gz", hash = "sha256:432e999088d85c8f36b9a3f769a8e2b57aabd817bbb729a90d1fe7f18f6f1f39"}, ] [package.dependencies] From 7552e03d77f057fc6e1b07104d7b8a06a0a21cd1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 30 May 2024 06:14:22 +0200 Subject: [PATCH 61/80] Bump requests from 2.32.2 to 2.32.3 (#778) Bumps [requests](https://github.com/psf/requests) from 2.32.2 to 2.32.3. - [Release notes](https://github.com/psf/requests/releases) - [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md) - [Commits](https://github.com/psf/requests/compare/v2.32.2...v2.32.3) --- updated-dependencies: - dependency-name: requests dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 35c0f9ee0d..95118015c6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3584,13 +3584,13 @@ files = [ [[package]] name = "requests" -version = "2.32.2" +version = "2.32.3" description = "Python HTTP for Humans." 
optional = false python-versions = ">=3.8" files = [ - {file = "requests-2.32.2-py3-none-any.whl", hash = "sha256:fc06670dd0ed212426dfeb94fc1b983d917c4f9847c863f313c9dfaaffb7c23c"}, - {file = "requests-2.32.2.tar.gz", hash = "sha256:dd951ff5ecf3e3b3aa26b40703ba77495dab41da839ae72ef3c8e5d8e2433289"}, + {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, + {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, ] [package.dependencies] From e08cc9dd704ae46149e0644f1c9cbf1509360613 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 30 May 2024 06:31:05 +0200 Subject: [PATCH 62/80] Bump getdaft from 0.2.24 to 0.2.25 (#779) Bumps [getdaft](https://github.com/Eventual-Inc/Daft) from 0.2.24 to 0.2.25. - [Release notes](https://github.com/Eventual-Inc/Daft/releases) - [Commits](https://github.com/Eventual-Inc/Daft/compare/v0.2.24...v0.2.25) --- updated-dependencies: - dependency-name: getdaft dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index 95118015c6..7931ee0e38 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1344,17 +1344,17 @@ gcsfuse = ["fusepy"] [[package]] name = "getdaft" -version = "0.2.24" +version = "0.2.25" description = "Distributed Dataframes for Multimodal Data" optional = true python-versions = ">=3.8" files = [ - {file = "getdaft-0.2.24-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:6dbb2c25f14c008fe1323590dc86bbed9d0de8b470aa62c0844bb218864b42da"}, - {file = "getdaft-0.2.24-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:1c27ff4e3e00275db611c8fad5edefc1a24f8494093ce18f0b846b147b4d6cd6"}, - {file = "getdaft-0.2.24-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae0d0ae1238fa5eb2ddfbefbc52e47aa6f9d00e9621dde0ecbee70be43cee8e8"}, - {file = "getdaft-0.2.24-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:473881f9406d166dace7f12a3cb74915f8901b628f6d9f0900fdf69cf05b0031"}, - {file = "getdaft-0.2.24-cp38-abi3-win_amd64.whl", hash = "sha256:c77266e55245c95a5c972dd49a47a764cde1b2007cc30ab08c2f25f7a36d6697"}, - {file = "getdaft-0.2.24.tar.gz", hash = "sha256:1fa4eae81ab101bed544ee64e3128e2df4f267a87640cd1473e00f944c32a216"}, + {file = "getdaft-0.2.25-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:7aab5bdf4af6b9bb0f7e0555cd36762d57da97ed026017f3a4b00f97bf5bf7f1"}, + {file = "getdaft-0.2.25-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:12a95f0ce9206c77a439ace0dc705d13acbe0e8278907ad2e57f62e0c01330ad"}, + {file = "getdaft-0.2.25-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cfeef90e2f446f65e0e7292431e5354995fe693cf9bbbd434dafd4b8971ea83"}, + {file = "getdaft-0.2.25-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b86a42e7310de613a0fb30d68a70ee0678e6605023e48a3c1dd28f8752d380e"}, + {file = "getdaft-0.2.25-cp38-abi3-win_amd64.whl", hash = "sha256:fbb3437e666478d06e661d961e5fd10b8cc33385bd2bafafcd22daf403fe6df1"}, + {file = "getdaft-0.2.25.tar.gz", hash = "sha256:60b2ca7d39447ba4b19eab6ccfd6fc706914ecf43d0080a13c832b013dda589b"}, ] [package.dependencies] From d3ad61c5d4cdbf908d667e2ed3ef5ad2d9f15fbe Mon Sep 17 
00:00:00 2001 From: Fokko Driesprong Date: Thu, 30 May 2024 09:04:42 +0200 Subject: [PATCH 63/80] Remove `record_fields` from the `Record` class (#580) First step towards https://github.com/apache/iceberg-python/issues/579 --- pyiceberg/manifest.py | 3 ++- pyiceberg/partitioning.py | 4 ++-- pyiceberg/table/snapshots.py | 4 ++-- pyiceberg/typedef.py | 7 +++---- tests/integration/test_rest_manifest.py | 5 +++-- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py index 3b8138b61a..defe5958c5 100644 --- a/pyiceberg/manifest.py +++ b/pyiceberg/manifest.py @@ -18,6 +18,7 @@ import math from abc import ABC, abstractmethod +from copy import copy from enum import Enum from types import TracebackType from typing import ( @@ -909,7 +910,7 @@ def __init__(self, output_file: OutputFile, snapshot_id: int, parent_snapshot_id self._sequence_number = sequence_number def prepare_manifest(self, manifest_file: ManifestFile) -> ManifestFile: - wrapped_manifest_file = ManifestFile(*manifest_file.record_fields()) + wrapped_manifest_file = copy(manifest_file) if wrapped_manifest_file.sequence_number == UNASSIGNED_SEQ: # if the sequence number is being assigned here, then the manifest must be created by the current operation. diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py index a3cf255341..f4e53a59a5 100644 --- a/pyiceberg/partitioning.py +++ b/pyiceberg/partitioning.py @@ -229,9 +229,9 @@ def partition_to_path(self, data: Record, schema: Schema) -> str: field_strs = [] value_strs = [] - for pos, value in enumerate(data.record_fields()): + for pos in range(len(self.fields)): partition_field = self.fields[pos] - value_str = partition_field.transform.to_human_string(field_types[pos].field_type, value=value) + value_str = partition_field.transform.to_human_string(field_types[pos].field_type, value=data[pos]) value_str = quote(value_str, safe='') value_strs.append(value_str) diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py index f74ac4b7d4..79eb8b0b8a 100644 --- a/pyiceberg/table/snapshots.py +++ b/pyiceberg/table/snapshots.py @@ -274,14 +274,14 @@ def set_partition_summary_limit(self, limit: int) -> None: def add_file(self, data_file: DataFile, schema: Schema, partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC) -> None: self.metrics.add_file(data_file) - if len(data_file.partition.record_fields()) != 0: + if len(data_file.partition) > 0: self.update_partition_metrics(partition_spec=partition_spec, file=data_file, is_add_file=True, schema=schema) def remove_file( self, data_file: DataFile, schema: Schema, partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC ) -> None: self.metrics.remove_file(data_file) - if len(data_file.partition.record_fields()) != 0: + if len(data_file.partition) > 0: self.update_partition_metrics(partition_spec=partition_spec, file=data_file, is_add_file=False, schema=schema) def update_partition_metrics(self, partition_spec: PartitionSpec, file: DataFile, is_add_file: bool, schema: Schema) -> None: diff --git a/pyiceberg/typedef.py b/pyiceberg/typedef.py index 6ccf9526ba..26f4d4d5ac 100644 --- a/pyiceberg/typedef.py +++ b/pyiceberg/typedef.py @@ -25,7 +25,6 @@ Callable, Dict, Generic, - List, Literal, Optional, Protocol, @@ -198,9 +197,9 @@ def __repr__(self) -> str: """Return the string representation of the Record class.""" return f"{self.__class__.__name__}[{', '.join(f'{key}={repr(value)}' for key, value in self.__dict__.items() if not key.startswith('_'))}]" - def 
record_fields(self) -> List[str]: - """Return values of all the fields of the Record class except those specified in skip_fields.""" - return [self.__getattribute__(v) if hasattr(self, v) else None for v in self._position_to_field_name] + def __len__(self) -> int: + """Return the number of fields in the Record class.""" + return len(self._position_to_field_name) def __hash__(self) -> int: """Return hash value of the Record class.""" diff --git a/tests/integration/test_rest_manifest.py b/tests/integration/test_rest_manifest.py index 8191209ae6..0e768c6e68 100644 --- a/tests/integration/test_rest_manifest.py +++ b/tests/integration/test_rest_manifest.py @@ -17,6 +17,7 @@ # pylint:disable=redefined-outer-name import inspect +from copy import copy from enum import Enum from tempfile import TemporaryDirectory from typing import Any @@ -26,7 +27,7 @@ from pyiceberg.catalog import Catalog, load_catalog from pyiceberg.io.pyarrow import PyArrowFileIO -from pyiceberg.manifest import DataFile, ManifestEntry, write_manifest +from pyiceberg.manifest import DataFile, write_manifest from pyiceberg.table import Table from pyiceberg.utils.lazydict import LazyDict @@ -99,7 +100,7 @@ def test_write_sample_manifest(table_test_all_types: Table) -> None: sort_order_id=entry.data_file.sort_order_id, spec_id=entry.data_file.spec_id, ) - wrapped_entry_v2 = ManifestEntry(*entry.record_fields()) + wrapped_entry_v2 = copy(entry) wrapped_entry_v2.data_file = wrapped_data_file_v2_debug wrapped_entry_v2_dict = todict(wrapped_entry_v2) # This one should not be written From cf3bf8a977f80f986237bc62293666de327871b3 Mon Sep 17 00:00:00 2001 From: Honah J Date: Thu, 30 May 2024 06:44:59 -0700 Subject: [PATCH 64/80] Unify to double quotes using Ruff (#781) --- pyiceberg/catalog/hive.py | 2 +- pyiceberg/catalog/rest.py | 2 +- pyiceberg/expressions/parser.py | 10 +- pyiceberg/partitioning.py | 2 +- pyiceberg/schema.py | 10 +- pyiceberg/table/__init__.py | 238 ++++++------ pyiceberg/table/metadata.py | 6 +- pyiceberg/table/name_mapping.py | 14 +- pyiceberg/table/refs.py | 8 +- pyiceberg/table/snapshots.py | 66 ++-- pyiceberg/typedef.py | 2 +- pyiceberg/utils/config.py | 2 +- ruff.toml | 2 +- tests/avro/test_file.py | 10 +- tests/catalog/integration_test_glue.py | 2 +- tests/catalog/test_dynamodb.py | 6 +- tests/catalog/test_glue.py | 4 +- tests/catalog/test_hive.py | 160 ++++---- tests/catalog/test_sql.py | 348 +++++++++--------- tests/conftest.py | 56 +-- tests/expressions/test_expressions.py | 8 +- tests/integration/test_add_files.py | 8 +- tests/integration/test_inspect_table.py | 218 +++++------ tests/integration/test_partition_evolution.py | 78 ++-- tests/integration/test_partitioning_key.py | 16 +- tests/integration/test_reads.py | 60 +-- tests/integration/test_rest_manifest.py | 2 +- tests/integration/test_rest_schema.py | 22 +- .../test_writes/test_partitioned_writes.py | 60 +-- tests/integration/test_writes/test_writes.py | 232 ++++++------ tests/io/test_pyarrow.py | 8 +- tests/io/test_pyarrow_visitor.py | 78 ++-- tests/table/test_init.py | 48 +-- tests/table/test_metadata.py | 2 +- tests/table/test_name_mapping.py | 160 ++++---- tests/table/test_snapshots.py | 162 ++++---- tests/test_serializers.py | 2 +- tests/test_transforms.py | 6 +- tests/utils/test_config.py | 4 +- tests/utils/test_decimal.py | 4 +- 40 files changed, 1064 insertions(+), 1064 deletions(-) diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index 708ae8c9d4..13b57b6ea9 100644 --- a/pyiceberg/catalog/hive.py +++ 
b/pyiceberg/catalog/hive.py @@ -146,7 +146,7 @@ def __init__(self, uri: str, ugi: Optional[str] = None): protocol = TBinaryProtocol.TBinaryProtocol(transport) self._client = Client(protocol) - self._ugi = ugi.split(':') if ugi else None + self._ugi = ugi.split(":") if ugi else None def __enter__(self) -> Client: self._transport.open() diff --git a/pyiceberg/catalog/rest.py b/pyiceberg/catalog/rest.py index afd5818662..2474b89853 100644 --- a/pyiceberg/catalog/rest.py +++ b/pyiceberg/catalog/rest.py @@ -152,7 +152,7 @@ class CreateTableRequest(IcebergBaseModel): properties: Dict[str, str] = Field(default_factory=dict) # validators - @field_validator('properties', mode='before') + @field_validator("properties", mode="before") def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]: return transform_dict_value_to_str(properties) diff --git a/pyiceberg/expressions/parser.py b/pyiceberg/expressions/parser.py index 8873907813..107d2349db 100644 --- a/pyiceberg/expressions/parser.py +++ b/pyiceberg/expressions/parser.py @@ -78,7 +78,7 @@ identifier = Word(alphas, alphanums + "_$").set_results_name("identifier") column = DelimitedList(identifier, delim=".", combine=False).set_results_name("column") -like_regex = r'(?P(?(?(?(? BooleanExpression: match = re.search(like_regex, literal_like.value) - if match and match.groupdict()['invalid_wildcard']: + if match and match.groupdict()["invalid_wildcard"]: raise ValueError("LIKE expressions only supports wildcard, '%', at the end of a string") - elif match and match.groupdict()['valid_wildcard']: - return StartsWith(result.column, StringLiteral(literal_like.value[:-1].replace('\\%', '%'))) + elif match and match.groupdict()["valid_wildcard"]: + return StartsWith(result.column, StringLiteral(literal_like.value[:-1].replace("\\%", "%"))) else: - return EqualTo(result.column, StringLiteral(literal_like.value.replace('\\%', '%'))) + return EqualTo(result.column, StringLiteral(literal_like.value.replace("\\%", "%"))) predicate = (comparison | in_check | null_check | nan_check | starts_check | boolean).set_results_name("predicate") diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py index f4e53a59a5..481207db7a 100644 --- a/pyiceberg/partitioning.py +++ b/pyiceberg/partitioning.py @@ -233,7 +233,7 @@ def partition_to_path(self, data: Record, schema: Schema) -> str: partition_field = self.fields[pos] value_str = partition_field.transform.to_human_string(field_types[pos].field_type, value=data[pos]) - value_str = quote(value_str, safe='') + value_str = quote(value_str, safe="") value_strs.append(value_str) field_strs.append(partition_field.name) diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py index b2739d8618..77f1addbf5 100644 --- a/pyiceberg/schema.py +++ b/pyiceberg/schema.py @@ -1311,11 +1311,11 @@ def _valid_avro_name(name: str) -> bool: length = len(name) assert length > 0, ValueError("Can not validate empty avro name") first = name[0] - if not (first.isalpha() or first == '_'): + if not (first.isalpha() or first == "_"): return False for character in name[1:]: - if not (character.isalnum() or character == '_'): + if not (character.isalnum() or character == "_"): return False return True @@ -1323,17 +1323,17 @@ def _valid_avro_name(name: str) -> bool: def _sanitize_name(name: str) -> str: sb = [] first = name[0] - if not (first.isalpha() or first == '_'): + if not (first.isalpha() or first == "_"): sb.append(_sanitize_char(first)) else: sb.append(first) for character in name[1:]: - if not 
(character.isalnum() or character == '_'): + if not (character.isalnum() or character == "_"): sb.append(_sanitize_char(character)) else: sb.append(character) - return ''.join(sb) + return "".join(sb) def _sanitize_char(character: str) -> str: diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 74b0225dbe..aa108de08b 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -568,17 +568,17 @@ def commit_transaction(self) -> Table: class AssignUUIDUpdate(IcebergBaseModel): - action: Literal['assign-uuid'] = Field(default="assign-uuid") + action: Literal["assign-uuid"] = Field(default="assign-uuid") uuid: uuid.UUID class UpgradeFormatVersionUpdate(IcebergBaseModel): - action: Literal['upgrade-format-version'] = Field(default="upgrade-format-version") + action: Literal["upgrade-format-version"] = Field(default="upgrade-format-version") format_version: int = Field(alias="format-version") class AddSchemaUpdate(IcebergBaseModel): - action: Literal['add-schema'] = Field(default="add-schema") + action: Literal["add-schema"] = Field(default="add-schema") schema_: Schema = Field(alias="schema") # This field is required: https://github.com/apache/iceberg/pull/7445 last_column_id: int = Field(alias="last-column-id") @@ -587,47 +587,47 @@ class AddSchemaUpdate(IcebergBaseModel): class SetCurrentSchemaUpdate(IcebergBaseModel): - action: Literal['set-current-schema'] = Field(default="set-current-schema") + action: Literal["set-current-schema"] = Field(default="set-current-schema") schema_id: int = Field( alias="schema-id", description="Schema ID to set as current, or -1 to set last added schema", default=-1 ) class AddPartitionSpecUpdate(IcebergBaseModel): - action: Literal['add-spec'] = Field(default="add-spec") + action: Literal["add-spec"] = Field(default="add-spec") spec: PartitionSpec initial_change: bool = Field(default=False, exclude=True) class SetDefaultSpecUpdate(IcebergBaseModel): - action: Literal['set-default-spec'] = Field(default="set-default-spec") + action: Literal["set-default-spec"] = Field(default="set-default-spec") spec_id: int = Field( alias="spec-id", description="Partition spec ID to set as the default, or -1 to set last added spec", default=-1 ) class AddSortOrderUpdate(IcebergBaseModel): - action: Literal['add-sort-order'] = Field(default="add-sort-order") + action: Literal["add-sort-order"] = Field(default="add-sort-order") sort_order: SortOrder = Field(alias="sort-order") initial_change: bool = Field(default=False, exclude=True) class SetDefaultSortOrderUpdate(IcebergBaseModel): - action: Literal['set-default-sort-order'] = Field(default="set-default-sort-order") + action: Literal["set-default-sort-order"] = Field(default="set-default-sort-order") sort_order_id: int = Field( alias="sort-order-id", description="Sort order ID to set as the default, or -1 to set last added sort order", default=-1 ) class AddSnapshotUpdate(IcebergBaseModel): - action: Literal['add-snapshot'] = Field(default="add-snapshot") + action: Literal["add-snapshot"] = Field(default="add-snapshot") snapshot: Snapshot class SetSnapshotRefUpdate(IcebergBaseModel): - action: Literal['set-snapshot-ref'] = Field(default="set-snapshot-ref") + action: Literal["set-snapshot-ref"] = Field(default="set-snapshot-ref") ref_name: str = Field(alias="ref-name") type: Literal["tag", "branch"] snapshot_id: int = Field(alias="snapshot-id") @@ -637,31 +637,31 @@ class SetSnapshotRefUpdate(IcebergBaseModel): class RemoveSnapshotsUpdate(IcebergBaseModel): - action: 
Literal['remove-snapshots'] = Field(default="remove-snapshots") + action: Literal["remove-snapshots"] = Field(default="remove-snapshots") snapshot_ids: List[int] = Field(alias="snapshot-ids") class RemoveSnapshotRefUpdate(IcebergBaseModel): - action: Literal['remove-snapshot-ref'] = Field(default="remove-snapshot-ref") + action: Literal["remove-snapshot-ref"] = Field(default="remove-snapshot-ref") ref_name: str = Field(alias="ref-name") class SetLocationUpdate(IcebergBaseModel): - action: Literal['set-location'] = Field(default="set-location") + action: Literal["set-location"] = Field(default="set-location") location: str class SetPropertiesUpdate(IcebergBaseModel): - action: Literal['set-properties'] = Field(default="set-properties") + action: Literal["set-properties"] = Field(default="set-properties") updates: Dict[str, str] - @field_validator('updates', mode='before') + @field_validator("updates", mode="before") def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]: return transform_dict_value_to_str(properties) class RemovePropertiesUpdate(IcebergBaseModel): - action: Literal['remove-properties'] = Field(default="remove-properties") + action: Literal["remove-properties"] = Field(default="remove-properties") removals: List[str] @@ -683,7 +683,7 @@ class RemovePropertiesUpdate(IcebergBaseModel): SetPropertiesUpdate, RemovePropertiesUpdate, ], - Field(discriminator='action'), + Field(discriminator="action"), ] @@ -1142,7 +1142,7 @@ def validate(self, base_metadata: Optional[TableMetadata]) -> None: AssertDefaultSpecId, AssertDefaultSortOrderId, ], - Field(discriminator='type'), + Field(discriminator="type"), ] UpdatesAndRequirements = Tuple[Tuple[TableUpdate, ...], Tuple[TableRequirement, ...]] @@ -1153,7 +1153,7 @@ class Namespace(IcebergRootModel[List[str]]): root: List[str] = Field( ..., - description='Reference to one or more levels of a namespace', + description="Reference to one or more levels of a namespace", ) @@ -1793,7 +1793,7 @@ class Move: other_field_id: Optional[int] = None -U = TypeVar('U') +U = TypeVar("U") class UpdateTableMetadata(ABC, Generic[U]): @@ -2682,13 +2682,13 @@ class AddFileTask: def _new_manifest_path(location: str, num: int, commit_uuid: uuid.UUID) -> str: - return f'{location}/metadata/{commit_uuid}-m{num}.avro' + return f"{location}/metadata/{commit_uuid}-m{num}.avro" def _generate_manifest_list_path(location: str, snapshot_id: int, attempt: int, commit_uuid: uuid.UUID) -> str: # Mimics the behavior in Java: # https://github.com/apache/iceberg/blob/c862b9177af8e2d83122220764a056f3b96fd00c/core/src/main/java/org/apache/iceberg/SnapshotProducer.java#L491 - return f'{location}/metadata/snap-{snapshot_id}-{attempt}-{commit_uuid}.avro' + return f"{location}/metadata/snap-{snapshot_id}-{attempt}-{commit_uuid}.avro" def _dataframe_to_data_files( @@ -3242,7 +3242,7 @@ def _partition_field(self, transform_key: Tuple[int, Transform[Any, Any]], name: new_field_id = self._new_field_id() if name is None: - tmp_field = PartitionField(transform_key[0], new_field_id, transform_key[1], 'unassigned_field_name') + tmp_field = PartitionField(transform_key[0], new_field_id, transform_key[1], "unassigned_field_name") name = _visit_partition_field(self._transaction.table_metadata.schema(), tmp_field, _PartitionNameGenerator()) return PartitionField(transform_key[0], new_field_id, transform_key[1], name) @@ -3281,12 +3281,12 @@ def snapshots(self) -> "pa.Table": import pyarrow as pa snapshots_schema = pa.schema([ - pa.field('committed_at', 
pa.timestamp(unit='ms'), nullable=False), - pa.field('snapshot_id', pa.int64(), nullable=False), - pa.field('parent_id', pa.int64(), nullable=True), - pa.field('operation', pa.string(), nullable=True), - pa.field('manifest_list', pa.string(), nullable=False), - pa.field('summary', pa.map_(pa.string(), pa.string()), nullable=True), + pa.field("committed_at", pa.timestamp(unit="ms"), nullable=False), + pa.field("snapshot_id", pa.int64(), nullable=False), + pa.field("parent_id", pa.int64(), nullable=True), + pa.field("operation", pa.string(), nullable=True), + pa.field("manifest_list", pa.string(), nullable=False), + pa.field("summary", pa.map_(pa.string(), pa.string()), nullable=True), ]) snapshots = [] for snapshot in self.tbl.metadata.snapshots: @@ -3298,12 +3298,12 @@ def snapshots(self) -> "pa.Table": additional_properties = None snapshots.append({ - 'committed_at': datetime.utcfromtimestamp(snapshot.timestamp_ms / 1000.0), - 'snapshot_id': snapshot.snapshot_id, - 'parent_id': snapshot.parent_snapshot_id, - 'operation': str(operation), - 'manifest_list': snapshot.manifest_list, - 'summary': additional_properties, + "committed_at": datetime.utcfromtimestamp(snapshot.timestamp_ms / 1000.0), + "snapshot_id": snapshot.snapshot_id, + "parent_id": snapshot.parent_snapshot_id, + "operation": str(operation), + "manifest_list": snapshot.manifest_list, + "summary": additional_properties, }) return pa.Table.from_pylist( @@ -3340,33 +3340,33 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: pa_record_struct = schema_to_pyarrow(partition_record) entries_schema = pa.schema([ - pa.field('status', pa.int8(), nullable=False), - pa.field('snapshot_id', pa.int64(), nullable=False), - pa.field('sequence_number', pa.int64(), nullable=False), - pa.field('file_sequence_number', pa.int64(), nullable=False), + pa.field("status", pa.int8(), nullable=False), + pa.field("snapshot_id", pa.int64(), nullable=False), + pa.field("sequence_number", pa.int64(), nullable=False), + pa.field("file_sequence_number", pa.int64(), nullable=False), pa.field( - 'data_file', + "data_file", pa.struct([ - pa.field('content', pa.int8(), nullable=False), - pa.field('file_path', pa.string(), nullable=False), - pa.field('file_format', pa.string(), nullable=False), - pa.field('partition', pa_record_struct, nullable=False), - pa.field('record_count', pa.int64(), nullable=False), - pa.field('file_size_in_bytes', pa.int64(), nullable=False), - pa.field('column_sizes', pa.map_(pa.int32(), pa.int64()), nullable=True), - pa.field('value_counts', pa.map_(pa.int32(), pa.int64()), nullable=True), - pa.field('null_value_counts', pa.map_(pa.int32(), pa.int64()), nullable=True), - pa.field('nan_value_counts', pa.map_(pa.int32(), pa.int64()), nullable=True), - pa.field('lower_bounds', pa.map_(pa.int32(), pa.binary()), nullable=True), - pa.field('upper_bounds', pa.map_(pa.int32(), pa.binary()), nullable=True), - pa.field('key_metadata', pa.binary(), nullable=True), - pa.field('split_offsets', pa.list_(pa.int64()), nullable=True), - pa.field('equality_ids', pa.list_(pa.int32()), nullable=True), - pa.field('sort_order_id', pa.int32(), nullable=True), + pa.field("content", pa.int8(), nullable=False), + pa.field("file_path", pa.string(), nullable=False), + pa.field("file_format", pa.string(), nullable=False), + pa.field("partition", pa_record_struct, nullable=False), + pa.field("record_count", pa.int64(), nullable=False), + pa.field("file_size_in_bytes", pa.int64(), nullable=False), + pa.field("column_sizes", pa.map_(pa.int32(), 
pa.int64()), nullable=True), + pa.field("value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), + pa.field("null_value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), + pa.field("nan_value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), + pa.field("lower_bounds", pa.map_(pa.int32(), pa.binary()), nullable=True), + pa.field("upper_bounds", pa.map_(pa.int32(), pa.binary()), nullable=True), + pa.field("key_metadata", pa.binary(), nullable=True), + pa.field("split_offsets", pa.list_(pa.int64()), nullable=True), + pa.field("equality_ids", pa.list_(pa.int32()), nullable=True), + pa.field("sort_order_id", pa.int32(), nullable=True), ]), nullable=False, ), - pa.field('readable_metrics', pa.struct(readable_metrics_struct), nullable=True), + pa.field("readable_metrics", pa.struct(readable_metrics_struct), nullable=True), ]) entries = [] @@ -3403,11 +3403,11 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: } entries.append({ - 'status': entry.status.value, - 'snapshot_id': entry.snapshot_id, - 'sequence_number': entry.data_sequence_number, - 'file_sequence_number': entry.file_sequence_number, - 'data_file': { + "status": entry.status.value, + "snapshot_id": entry.snapshot_id, + "sequence_number": entry.data_sequence_number, + "file_sequence_number": entry.file_sequence_number, + "data_file": { "content": entry.data_file.content, "file_path": entry.data_file.file_path, "file_format": entry.data_file.file_format, @@ -3426,7 +3426,7 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: "sort_order_id": entry.data_file.sort_order_id, "spec_id": entry.data_file.spec_id, }, - 'readable_metrics': readable_metrics, + "readable_metrics": readable_metrics, }) return pa.Table.from_pylist( @@ -3438,24 +3438,24 @@ def refs(self) -> "pa.Table": import pyarrow as pa ref_schema = pa.schema([ - pa.field('name', pa.string(), nullable=False), - pa.field('type', pa.dictionary(pa.int32(), pa.string()), nullable=False), - pa.field('snapshot_id', pa.int64(), nullable=False), - pa.field('max_reference_age_in_ms', pa.int64(), nullable=True), - pa.field('min_snapshots_to_keep', pa.int32(), nullable=True), - pa.field('max_snapshot_age_in_ms', pa.int64(), nullable=True), + pa.field("name", pa.string(), nullable=False), + pa.field("type", pa.dictionary(pa.int32(), pa.string()), nullable=False), + pa.field("snapshot_id", pa.int64(), nullable=False), + pa.field("max_reference_age_in_ms", pa.int64(), nullable=True), + pa.field("min_snapshots_to_keep", pa.int32(), nullable=True), + pa.field("max_snapshot_age_in_ms", pa.int64(), nullable=True), ]) ref_results = [] for ref in self.tbl.metadata.refs: if snapshot_ref := self.tbl.metadata.refs.get(ref): ref_results.append({ - 'name': ref, - 'type': snapshot_ref.snapshot_ref_type.upper(), - 'snapshot_id': snapshot_ref.snapshot_id, - 'max_reference_age_in_ms': snapshot_ref.max_ref_age_ms, - 'min_snapshots_to_keep': snapshot_ref.min_snapshots_to_keep, - 'max_snapshot_age_in_ms': snapshot_ref.max_snapshot_age_ms, + "name": ref, + "type": snapshot_ref.snapshot_ref_type.upper(), + "snapshot_id": snapshot_ref.snapshot_id, + "max_reference_age_in_ms": snapshot_ref.max_ref_age_ms, + "min_snapshots_to_keep": snapshot_ref.min_snapshots_to_keep, + "max_snapshot_age_in_ms": snapshot_ref.max_snapshot_age_ms, }) return pa.Table.from_pylist(ref_results, schema=ref_schema) @@ -3466,15 +3466,15 @@ def partitions(self, snapshot_id: Optional[int] = None) -> "pa.Table": from pyiceberg.io.pyarrow import schema_to_pyarrow 
table_schema = pa.schema([ - pa.field('record_count', pa.int64(), nullable=False), - pa.field('file_count', pa.int32(), nullable=False), - pa.field('total_data_file_size_in_bytes', pa.int64(), nullable=False), - pa.field('position_delete_record_count', pa.int64(), nullable=False), - pa.field('position_delete_file_count', pa.int32(), nullable=False), - pa.field('equality_delete_record_count', pa.int64(), nullable=False), - pa.field('equality_delete_file_count', pa.int32(), nullable=False), - pa.field('last_updated_at', pa.timestamp(unit='ms'), nullable=True), - pa.field('last_updated_snapshot_id', pa.int64(), nullable=True), + pa.field("record_count", pa.int64(), nullable=False), + pa.field("file_count", pa.int32(), nullable=False), + pa.field("total_data_file_size_in_bytes", pa.int64(), nullable=False), + pa.field("position_delete_record_count", pa.int64(), nullable=False), + pa.field("position_delete_file_count", pa.int32(), nullable=False), + pa.field("equality_delete_record_count", pa.int64(), nullable=False), + pa.field("equality_delete_file_count", pa.int32(), nullable=False), + pa.field("last_updated_at", pa.timestamp(unit="ms"), nullable=True), + pa.field("last_updated_snapshot_id", pa.int64(), nullable=True), ]) partition_record = self.tbl.metadata.specs_struct() @@ -3483,8 +3483,8 @@ def partitions(self, snapshot_id: Optional[int] = None) -> "pa.Table": if has_partitions: pa_record_struct = schema_to_pyarrow(partition_record) partitions_schema = pa.schema([ - pa.field('partition', pa_record_struct, nullable=False), - pa.field('spec_id', pa.int32(), nullable=False), + pa.field("partition", pa_record_struct, nullable=False), + pa.field("spec_id", pa.int32(), nullable=False), ]) table_schema = pa.unify_schemas([partitions_schema, table_schema]) @@ -3561,18 +3561,18 @@ def manifests(self) -> "pa.Table": ]) manifest_schema = pa.schema([ - pa.field('content', pa.int8(), nullable=False), - pa.field('path', pa.string(), nullable=False), - pa.field('length', pa.int64(), nullable=False), - pa.field('partition_spec_id', pa.int32(), nullable=False), - pa.field('added_snapshot_id', pa.int64(), nullable=False), - pa.field('added_data_files_count', pa.int32(), nullable=False), - pa.field('existing_data_files_count', pa.int32(), nullable=False), - pa.field('deleted_data_files_count', pa.int32(), nullable=False), - pa.field('added_delete_files_count', pa.int32(), nullable=False), - pa.field('existing_delete_files_count', pa.int32(), nullable=False), - pa.field('deleted_delete_files_count', pa.int32(), nullable=False), - pa.field('partition_summaries', pa.list_(partition_summary_schema), nullable=False), + pa.field("content", pa.int8(), nullable=False), + pa.field("path", pa.string(), nullable=False), + pa.field("length", pa.int64(), nullable=False), + pa.field("partition_spec_id", pa.int32(), nullable=False), + pa.field("added_snapshot_id", pa.int64(), nullable=False), + pa.field("added_data_files_count", pa.int32(), nullable=False), + pa.field("existing_data_files_count", pa.int32(), nullable=False), + pa.field("deleted_data_files_count", pa.int32(), nullable=False), + pa.field("added_delete_files_count", pa.int32(), nullable=False), + pa.field("existing_delete_files_count", pa.int32(), nullable=False), + pa.field("deleted_delete_files_count", pa.int32(), nullable=False), + pa.field("partition_summaries", pa.list_(partition_summary_schema), nullable=False), ]) def _partition_summaries_to_rows( @@ -3601,10 +3601,10 @@ def _partition_summaries_to_rows( else None ) rows.append({ - 'contains_null': 
field_summary.contains_null, - 'contains_nan': field_summary.contains_nan, - 'lower_bound': lower_bound, - 'upper_bound': upper_bound, + "contains_null": field_summary.contains_null, + "contains_nan": field_summary.contains_nan, + "lower_bound": lower_bound, + "upper_bound": upper_bound, }) return rows @@ -3615,18 +3615,18 @@ def _partition_summaries_to_rows( is_data_file = manifest.content == ManifestContent.DATA is_delete_file = manifest.content == ManifestContent.DELETES manifests.append({ - 'content': manifest.content, - 'path': manifest.manifest_path, - 'length': manifest.manifest_length, - 'partition_spec_id': manifest.partition_spec_id, - 'added_snapshot_id': manifest.added_snapshot_id, - 'added_data_files_count': manifest.added_files_count if is_data_file else 0, - 'existing_data_files_count': manifest.existing_files_count if is_data_file else 0, - 'deleted_data_files_count': manifest.deleted_files_count if is_data_file else 0, - 'added_delete_files_count': manifest.added_files_count if is_delete_file else 0, - 'existing_delete_files_count': manifest.existing_files_count if is_delete_file else 0, - 'deleted_delete_files_count': manifest.deleted_files_count if is_delete_file else 0, - 'partition_summaries': _partition_summaries_to_rows(specs[manifest.partition_spec_id], manifest.partitions) + "content": manifest.content, + "path": manifest.manifest_path, + "length": manifest.manifest_length, + "partition_spec_id": manifest.partition_spec_id, + "added_snapshot_id": manifest.added_snapshot_id, + "added_data_files_count": manifest.added_files_count if is_data_file else 0, + "existing_data_files_count": manifest.existing_files_count if is_data_file else 0, + "deleted_data_files_count": manifest.deleted_files_count if is_data_file else 0, + "added_delete_files_count": manifest.added_files_count if is_delete_file else 0, + "existing_delete_files_count": manifest.existing_files_count if is_delete_file else 0, + "deleted_delete_files_count": manifest.deleted_files_count if is_delete_file else 0, + "partition_summaries": _partition_summaries_to_rows(specs[manifest.partition_spec_id], manifest.partitions) if manifest.partitions else [], }) @@ -3644,16 +3644,16 @@ class TablePartition: def _get_partition_sort_order(partition_columns: list[str], reverse: bool = False) -> dict[str, Any]: - order = 'ascending' if not reverse else 'descending' - null_placement = 'at_start' if reverse else 'at_end' - return {'sort_keys': [(column_name, order) for column_name in partition_columns], 'null_placement': null_placement} + order = "ascending" if not reverse else "descending" + null_placement = "at_start" if reverse else "at_end" + return {"sort_keys": [(column_name, order) for column_name in partition_columns], "null_placement": null_placement} def group_by_partition_scheme(arrow_table: pa.Table, partition_columns: list[str]) -> pa.Table: """Given a table, sort it by current partition scheme.""" # only works for identity for now sort_options = _get_partition_sort_order(partition_columns, reverse=False) - sorted_arrow_table = arrow_table.sort_by(sorting=sort_options['sort_keys'], null_placement=sort_options['null_placement']) + sorted_arrow_table = arrow_table.sort_by(sorting=sort_options["sort_keys"], null_placement=sort_options["null_placement"]) return sorted_arrow_table @@ -3676,7 +3676,7 @@ def _get_table_partitions( schema: Schema, slice_instructions: list[dict[str, Any]], ) -> list[TablePartition]: - sorted_slice_instructions = sorted(slice_instructions, key=lambda x: x['offset']) + 
sorted_slice_instructions = sorted(slice_instructions, key=lambda x: x["offset"]) partition_fields = partition_spec.fields diff --git a/pyiceberg/table/metadata.py b/pyiceberg/table/metadata.py index ba0c885758..8c3c389318 100644 --- a/pyiceberg/table/metadata.py +++ b/pyiceberg/table/metadata.py @@ -222,7 +222,7 @@ class TableMetadataCommonFields(IcebergBaseModel): current-snapshot-id even if the refs map is null.""" # validators - @field_validator('properties', mode='before') + @field_validator("properties", mode="before") def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]: return transform_dict_value_to_str(properties) @@ -305,7 +305,7 @@ def sort_order_by_id(self, sort_order_id: int) -> Optional[SortOrder]: """Get the sort order by sort_order_id.""" return next((sort_order for sort_order in self.sort_orders if sort_order.order_id == sort_order_id), None) - @field_serializer('current_snapshot_id') + @field_serializer("current_snapshot_id") def serialize_current_snapshot_id(self, current_snapshot_id: Optional[int]) -> Optional[int]: if current_snapshot_id is None and Config().get_bool("legacy-current-snapshot-id"): return -1 @@ -319,7 +319,7 @@ def _generate_snapshot_id() -> int: """ rnd_uuid = uuid.uuid4() snapshot_id = int.from_bytes( - bytes(lhs ^ rhs for lhs, rhs in zip(rnd_uuid.bytes[0:8], rnd_uuid.bytes[8:16])), byteorder='little', signed=True + bytes(lhs ^ rhs for lhs, rhs in zip(rnd_uuid.bytes[0:8], rnd_uuid.bytes[8:16])), byteorder="little", signed=True ) snapshot_id = snapshot_id if snapshot_id >= 0 else snapshot_id * -1 diff --git a/pyiceberg/table/name_mapping.py b/pyiceberg/table/name_mapping.py index baa15f168d..5a4e769003 100644 --- a/pyiceberg/table/name_mapping.py +++ b/pyiceberg/table/name_mapping.py @@ -40,12 +40,12 @@ class MappedField(IcebergBaseModel): names: List[str] = conlist(str, min_length=1) fields: List[MappedField] = Field(default_factory=list) - @field_validator('fields', mode='before') + @field_validator("fields", mode="before") @classmethod def convert_null_to_empty_List(cls, v: Any) -> Any: return v or [] - @field_validator('names', mode='after') + @field_validator("names", mode="after") @classmethod def check_at_least_one(cls, v: List[str]) -> Any: """ @@ -60,10 +60,10 @@ def check_at_least_one(cls, v: List[str]) -> Any: @model_serializer def ser_model(self) -> Dict[str, Any]: """Set custom serializer to leave out the field when it is empty.""" - fields = {'fields': self.fields} if len(self.fields) > 0 else {} + fields = {"fields": self.fields} if len(self.fields) > 0 else {} return { - 'field-id': self.field_id, - 'names': self.names, + "field-id": self.field_id, + "names": self.names, **fields, } @@ -87,7 +87,7 @@ def _field_by_name(self) -> Dict[str, MappedField]: return visit_name_mapping(self, _IndexByName()) def find(self, *names: str) -> MappedField: - name = '.'.join(names) + name = ".".join(names) try: return self._field_by_name[name] except KeyError as e: @@ -109,7 +109,7 @@ def __str__(self) -> str: return "[\n " + "\n ".join([str(e) for e in self.root]) + "\n]" -S = TypeVar('S') +S = TypeVar("S") T = TypeVar("T") diff --git a/pyiceberg/table/refs.py b/pyiceberg/table/refs.py index df18fadd31..d87a319a16 100644 --- a/pyiceberg/table/refs.py +++ b/pyiceberg/table/refs.py @@ -46,14 +46,14 @@ class SnapshotRef(IcebergBaseModel): max_snapshot_age_ms: Annotated[Optional[int], Field(alias="max-snapshot-age-ms", default=None, gt=0)] max_ref_age_ms: Annotated[Optional[int], Field(alias="max-ref-age-ms", 
default=None, gt=0)] - @model_validator(mode='after') - def check_min_snapshots_to_keep(self) -> 'SnapshotRef': + @model_validator(mode="after") + def check_min_snapshots_to_keep(self) -> "SnapshotRef": if self.min_snapshots_to_keep is not None and self.snapshot_ref_type == SnapshotRefType.TAG: raise ValidationError("Tags do not support setting minSnapshotsToKeep") return self - @model_validator(mode='after') - def check_max_snapshot_age_ms(self) -> 'SnapshotRef': + @model_validator(mode="after") + def check_max_snapshot_age_ms(self) -> "SnapshotRef": if self.max_snapshot_age_ms is not None and self.snapshot_ref_type == SnapshotRefType.TAG: raise ValidationError("Tags do not support setting maxSnapshotAgeMs") return self diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py index 79eb8b0b8a..e2ce3fe4f1 100644 --- a/pyiceberg/table/snapshots.py +++ b/pyiceberg/table/snapshots.py @@ -27,29 +27,29 @@ from pyiceberg.schema import Schema from pyiceberg.typedef import IcebergBaseModel -ADDED_DATA_FILES = 'added-data-files' -ADDED_DELETE_FILES = 'added-delete-files' -ADDED_EQUALITY_DELETES = 'added-equality-deletes' -ADDED_FILE_SIZE = 'added-files-size' -ADDED_POSITION_DELETES = 'added-position-deletes' -ADDED_POSITION_DELETE_FILES = 'added-position-delete-files' -ADDED_RECORDS = 'added-records' -DELETED_DATA_FILES = 'deleted-data-files' -DELETED_RECORDS = 'deleted-records' -ADDED_EQUALITY_DELETE_FILES = 'added-equality-delete-files' -REMOVED_DELETE_FILES = 'removed-delete-files' -REMOVED_EQUALITY_DELETES = 'removed-equality-deletes' -REMOVED_EQUALITY_DELETE_FILES = 'removed-equality-delete-files' -REMOVED_FILE_SIZE = 'removed-files-size' -REMOVED_POSITION_DELETES = 'removed-position-deletes' -REMOVED_POSITION_DELETE_FILES = 'removed-position-delete-files' -TOTAL_EQUALITY_DELETES = 'total-equality-deletes' -TOTAL_POSITION_DELETES = 'total-position-deletes' -TOTAL_DATA_FILES = 'total-data-files' -TOTAL_DELETE_FILES = 'total-delete-files' -TOTAL_RECORDS = 'total-records' -TOTAL_FILE_SIZE = 'total-files-size' -CHANGED_PARTITION_COUNT_PROP = 'changed-partition-count' +ADDED_DATA_FILES = "added-data-files" +ADDED_DELETE_FILES = "added-delete-files" +ADDED_EQUALITY_DELETES = "added-equality-deletes" +ADDED_FILE_SIZE = "added-files-size" +ADDED_POSITION_DELETES = "added-position-deletes" +ADDED_POSITION_DELETE_FILES = "added-position-delete-files" +ADDED_RECORDS = "added-records" +DELETED_DATA_FILES = "deleted-data-files" +DELETED_RECORDS = "deleted-records" +ADDED_EQUALITY_DELETE_FILES = "added-equality-delete-files" +REMOVED_DELETE_FILES = "removed-delete-files" +REMOVED_EQUALITY_DELETES = "removed-equality-deletes" +REMOVED_EQUALITY_DELETE_FILES = "removed-equality-delete-files" +REMOVED_FILE_SIZE = "removed-files-size" +REMOVED_POSITION_DELETES = "removed-position-deletes" +REMOVED_POSITION_DELETE_FILES = "removed-position-delete-files" +TOTAL_EQUALITY_DELETES = "total-equality-deletes" +TOTAL_POSITION_DELETES = "total-position-deletes" +TOTAL_DATA_FILES = "total-data-files" +TOTAL_DELETE_FILES = "total-delete-files" +TOTAL_RECORDS = "total-records" +TOTAL_FILE_SIZE = "total-files-size" +CHANGED_PARTITION_COUNT_PROP = "changed-partition-count" CHANGED_PARTITION_PREFIX = "partitions." 
OPERATION = "operation" @@ -181,14 +181,14 @@ def __init__(self, operation: Operation, **data: Any) -> None: def __getitem__(self, __key: str) -> Optional[Any]: # type: ignore """Return a key as it is a map.""" - if __key.lower() == 'operation': + if __key.lower() == "operation": return self.operation else: return self._additional_properties.get(__key) def __setitem__(self, key: str, value: Any) -> None: """Set a key as it is a map.""" - if key.lower() == 'operation': + if key.lower() == "operation": self.operation = value else: self._additional_properties[key] = value @@ -317,10 +317,10 @@ def _truncate_table_summary(summary: Summary, previous_summary: Mapping[str, str TOTAL_POSITION_DELETES, TOTAL_EQUALITY_DELETES, }: - summary[prop] = '0' + summary[prop] = "0" def get_prop(prop: str) -> int: - value = previous_summary.get(prop) or '0' + value = previous_summary.get(prop) or "0" try: return int(value) except ValueError as e: @@ -353,12 +353,12 @@ def update_snapshot_summaries( if not previous_summary: previous_summary = { - TOTAL_DATA_FILES: '0', - TOTAL_DELETE_FILES: '0', - TOTAL_RECORDS: '0', - TOTAL_FILE_SIZE: '0', - TOTAL_POSITION_DELETES: '0', - TOTAL_EQUALITY_DELETES: '0', + TOTAL_DATA_FILES: "0", + TOTAL_DELETE_FILES: "0", + TOTAL_RECORDS: "0", + TOTAL_FILE_SIZE: "0", + TOTAL_POSITION_DELETES: "0", + TOTAL_EQUALITY_DELETES: "0", } def _update_totals(total_property: str, added_property: str, removed_property: str) -> None: diff --git a/pyiceberg/typedef.py b/pyiceberg/typedef.py index 26f4d4d5ac..2ff123148b 100644 --- a/pyiceberg/typedef.py +++ b/pyiceberg/typedef.py @@ -52,7 +52,7 @@ def update(self, *args: Any, **kwargs: Any) -> None: raise AttributeError("FrozenDict does not support .update()") -UTF8 = 'utf-8' +UTF8 = "utf-8" EMPTY_DICT = FrozenDict() diff --git a/pyiceberg/utils/config.py b/pyiceberg/utils/config.py index 8b1b81d3a7..5eb9cfaa66 100644 --- a/pyiceberg/utils/config.py +++ b/pyiceberg/utils/config.py @@ -127,7 +127,7 @@ def set_property(_config: RecursiveDict, path: List[str], config_value: str) -> if env_var_lower.startswith(PYICEBERG.lower()): key = env_var_lower[len(PYICEBERG) :] parts = key.split("__", maxsplit=2) - parts_normalized = [part.replace('__', '.').replace("_", "-") for part in parts] + parts_normalized = [part.replace("__", ".").replace("_", "-") for part in parts] set_property(config, parts_normalized, config_value) return config diff --git a/ruff.toml b/ruff.toml index 92fb9a9c80..caaa108c84 100644 --- a/ruff.toml +++ b/ruff.toml @@ -80,4 +80,4 @@ known-first-party = ["pyiceberg", "tests"] section-order = ["future", "standard-library", "third-party", "first-party", "local-folder"] [format] -quote-style = "preserve" +quote-style = "double" diff --git a/tests/avro/test_file.py b/tests/avro/test_file.py index 0809f56fea..4df132304c 100644 --- a/tests/avro/test_file.py +++ b/tests/avro/test_file.py @@ -173,13 +173,13 @@ def test_write_manifest_entry_with_iceberg_read_with_fastavro_v1() -> None: v2_entry = todict(entry) # These are not written in V1 - del v2_entry['data_sequence_number'] - del v2_entry['file_sequence_number'] - del v2_entry['data_file']['content'] - del v2_entry['data_file']['equality_ids'] + del v2_entry["data_sequence_number"] + del v2_entry["file_sequence_number"] + del v2_entry["data_file"]["content"] + del v2_entry["data_file"]["equality_ids"] # Required in V1 - v2_entry['data_file']['block_size_in_bytes'] = DEFAULT_BLOCK_SIZE + v2_entry["data_file"]["block_size_in_bytes"] = DEFAULT_BLOCK_SIZE assert v2_entry == fa_entry diff 
--git a/tests/catalog/integration_test_glue.py b/tests/catalog/integration_test_glue.py index ee43779073..21c415212a 100644 --- a/tests/catalog/integration_test_glue.py +++ b/tests/catalog/integration_test_glue.py @@ -484,7 +484,7 @@ def test_commit_table_properties( updated_table_metadata = table.metadata assert MetastoreCatalog._parse_metadata_version(table.metadata_location) == 1 - assert updated_table_metadata.properties == {'Description': 'test_description', "test_a": "test_aa", "test_c": "test_c"} + assert updated_table_metadata.properties == {"Description": "test_description", "test_a": "test_aa", "test_c": "test_c"} table_info = glue.get_table( DatabaseName=database_name, diff --git a/tests/catalog/test_dynamodb.py b/tests/catalog/test_dynamodb.py index f4b16d343b..7ad1301d9d 100644 --- a/tests/catalog/test_dynamodb.py +++ b/tests/catalog/test_dynamodb.py @@ -569,10 +569,10 @@ def test_passing_provided_profile() -> None: } props = {"py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} props.update(session_props) # type: ignore - with mock.patch('boto3.Session', return_value=mock.Mock()) as mock_session: + with mock.patch("boto3.Session", return_value=mock.Mock()) as mock_session: mock_client = mock.Mock() mock_session.return_value.client.return_value = mock_client - mock_client.describe_table.return_value = {'Table': {'TableStatus': 'ACTIVE'}} + mock_client.describe_table.return_value = {"Table": {"TableStatus": "ACTIVE"}} test_catalog = DynamoDbCatalog(catalog_name, **props) assert test_catalog.dynamodb is mock_client mock_session.assert_called_with(**session_props) @@ -590,4 +590,4 @@ def test_table_exists( # Act and Assert for an existing table assert test_catalog.table_exists(identifier) is True # Act and Assert for an non-existing table - assert test_catalog.table_exists(('non', 'exist')) is False + assert test_catalog.table_exists(("non", "exist")) is False diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 1aea46d6ef..6b57f1dfe6 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -715,7 +715,7 @@ def test_commit_table_properties( updated_table_metadata = table.metadata assert test_catalog._parse_metadata_version(table.metadata_location) == 1 - assert updated_table_metadata.properties == {'Description': 'test_description', "test_a": "test_aa", "test_c": "test_c"} + assert updated_table_metadata.properties == {"Description": "test_description", "test_a": "test_aa", "test_c": "test_c"} table_info = _glue.get_table( DatabaseName=database_name, @@ -847,7 +847,7 @@ def test_table_exists( # Act and Assert for an existing table assert test_catalog.table_exists(identifier) is True # Act and Assert for a non-existing table - assert test_catalog.table_exists(('non', 'exist')) is False + assert test_catalog.table_exists(("non", "exist")) is False @mock_aws diff --git a/tests/catalog/test_hive.py b/tests/catalog/test_hive.py index ef662b3aff..96e95815be 100644 --- a/tests/catalog/test_hive.py +++ b/tests/catalog/test_hive.py @@ -234,27 +234,27 @@ def test_create_table( retention=None, sd=StorageDescriptor( cols=[ - FieldSchema(name='boolean', type='boolean', comment=None), - FieldSchema(name='integer', type='int', comment=None), - FieldSchema(name='long', type='bigint', comment=None), - FieldSchema(name='float', type='float', comment=None), - FieldSchema(name='double', type='double', comment=None), - FieldSchema(name='decimal', type='decimal(32,3)', comment=None), - FieldSchema(name='date', type='date', comment=None), - FieldSchema(name='time', 
type='string', comment=None), - FieldSchema(name='timestamp', type='timestamp', comment=None), + FieldSchema(name="boolean", type="boolean", comment=None), + FieldSchema(name="integer", type="int", comment=None), + FieldSchema(name="long", type="bigint", comment=None), + FieldSchema(name="float", type="float", comment=None), + FieldSchema(name="double", type="double", comment=None), + FieldSchema(name="decimal", type="decimal(32,3)", comment=None), + FieldSchema(name="date", type="date", comment=None), + FieldSchema(name="time", type="string", comment=None), + FieldSchema(name="timestamp", type="timestamp", comment=None), FieldSchema( - name='timestamptz', - type='timestamp' if hive2_compatible else 'timestamp with local time zone', + name="timestamptz", + type="timestamp" if hive2_compatible else "timestamp with local time zone", comment=None, ), - FieldSchema(name='string', type='string', comment=None), - FieldSchema(name='uuid', type='string', comment=None), - FieldSchema(name='fixed', type='binary', comment=None), - FieldSchema(name='binary', type='binary', comment=None), - FieldSchema(name='list', type='array', comment=None), - FieldSchema(name='map', type='map', comment=None), - FieldSchema(name='struct', type='struct', comment=None), + FieldSchema(name="string", type="string", comment=None), + FieldSchema(name="uuid", type="string", comment=None), + FieldSchema(name="fixed", type="binary", comment=None), + FieldSchema(name="binary", type="binary", comment=None), + FieldSchema(name="list", type="array", comment=None), + FieldSchema(name="map", type="map", comment=None), + FieldSchema(name="struct", type="struct", comment=None), ], location=f"{hive_database.locationUri}/table", inputFormat="org.apache.hadoop.mapred.FileInputFormat", @@ -314,40 +314,40 @@ def test_create_table( last_column_id=22, schemas=[ Schema( - NestedField(field_id=1, name='boolean', field_type=BooleanType(), required=True), - NestedField(field_id=2, name='integer', field_type=IntegerType(), required=True), - NestedField(field_id=3, name='long', field_type=LongType(), required=True), - NestedField(field_id=4, name='float', field_type=FloatType(), required=True), - NestedField(field_id=5, name='double', field_type=DoubleType(), required=True), - NestedField(field_id=6, name='decimal', field_type=DecimalType(precision=32, scale=3), required=True), - NestedField(field_id=7, name='date', field_type=DateType(), required=True), - NestedField(field_id=8, name='time', field_type=TimeType(), required=True), - NestedField(field_id=9, name='timestamp', field_type=TimestampType(), required=True), - NestedField(field_id=10, name='timestamptz', field_type=TimestamptzType(), required=True), - NestedField(field_id=11, name='string', field_type=StringType(), required=True), - NestedField(field_id=12, name='uuid', field_type=UUIDType(), required=True), - NestedField(field_id=13, name='fixed', field_type=FixedType(length=12), required=True), - NestedField(field_id=14, name='binary', field_type=BinaryType(), required=True), + NestedField(field_id=1, name="boolean", field_type=BooleanType(), required=True), + NestedField(field_id=2, name="integer", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="long", field_type=LongType(), required=True), + NestedField(field_id=4, name="float", field_type=FloatType(), required=True), + NestedField(field_id=5, name="double", field_type=DoubleType(), required=True), + NestedField(field_id=6, name="decimal", field_type=DecimalType(precision=32, scale=3), required=True), + 
NestedField(field_id=7, name="date", field_type=DateType(), required=True), + NestedField(field_id=8, name="time", field_type=TimeType(), required=True), + NestedField(field_id=9, name="timestamp", field_type=TimestampType(), required=True), + NestedField(field_id=10, name="timestamptz", field_type=TimestamptzType(), required=True), + NestedField(field_id=11, name="string", field_type=StringType(), required=True), + NestedField(field_id=12, name="uuid", field_type=UUIDType(), required=True), + NestedField(field_id=13, name="fixed", field_type=FixedType(length=12), required=True), + NestedField(field_id=14, name="binary", field_type=BinaryType(), required=True), NestedField( field_id=15, - name='list', - field_type=ListType(type='list', element_id=18, element_type=StringType(), element_required=True), + name="list", + field_type=ListType(type="list", element_id=18, element_type=StringType(), element_required=True), required=True, ), NestedField( field_id=16, - name='map', + name="map", field_type=MapType( - type='map', key_id=19, key_type=StringType(), value_id=20, value_type=IntegerType(), value_required=True + type="map", key_id=19, key_type=StringType(), value_id=20, value_type=IntegerType(), value_required=True ), required=True, ), NestedField( field_id=17, - name='struct', + name="struct", field_type=StructType( - NestedField(field_id=21, name='inner_string', field_type=StringType(), required=False), - NestedField(field_id=22, name='inner_int', field_type=IntegerType(), required=True), + NestedField(field_id=21, name="inner_string", field_type=StringType(), required=False), + NestedField(field_id=22, name="inner_int", field_type=IntegerType(), required=True), ), required=False, ), @@ -357,7 +357,7 @@ def test_create_table( ], current_schema_id=0, last_partition_id=999, - properties={"owner": "javaberg", 'write.parquet.compression-codec': 'zstd'}, + properties={"owner": "javaberg", "write.parquet.compression-codec": "zstd"}, partition_specs=[PartitionSpec()], default_spec_id=0, current_snapshot_id=None, @@ -409,27 +409,27 @@ def test_create_table_with_given_location_removes_trailing_slash( retention=None, sd=StorageDescriptor( cols=[ - FieldSchema(name='boolean', type='boolean', comment=None), - FieldSchema(name='integer', type='int', comment=None), - FieldSchema(name='long', type='bigint', comment=None), - FieldSchema(name='float', type='float', comment=None), - FieldSchema(name='double', type='double', comment=None), - FieldSchema(name='decimal', type='decimal(32,3)', comment=None), - FieldSchema(name='date', type='date', comment=None), - FieldSchema(name='time', type='string', comment=None), - FieldSchema(name='timestamp', type='timestamp', comment=None), + FieldSchema(name="boolean", type="boolean", comment=None), + FieldSchema(name="integer", type="int", comment=None), + FieldSchema(name="long", type="bigint", comment=None), + FieldSchema(name="float", type="float", comment=None), + FieldSchema(name="double", type="double", comment=None), + FieldSchema(name="decimal", type="decimal(32,3)", comment=None), + FieldSchema(name="date", type="date", comment=None), + FieldSchema(name="time", type="string", comment=None), + FieldSchema(name="timestamp", type="timestamp", comment=None), FieldSchema( - name='timestamptz', - type='timestamp' if hive2_compatible else 'timestamp with local time zone', + name="timestamptz", + type="timestamp" if hive2_compatible else "timestamp with local time zone", comment=None, ), - FieldSchema(name='string', type='string', comment=None), - 
FieldSchema(name='uuid', type='string', comment=None), - FieldSchema(name='fixed', type='binary', comment=None), - FieldSchema(name='binary', type='binary', comment=None), - FieldSchema(name='list', type='array', comment=None), - FieldSchema(name='map', type='map', comment=None), - FieldSchema(name='struct', type='struct', comment=None), + FieldSchema(name="string", type="string", comment=None), + FieldSchema(name="uuid", type="string", comment=None), + FieldSchema(name="fixed", type="binary", comment=None), + FieldSchema(name="binary", type="binary", comment=None), + FieldSchema(name="list", type="array", comment=None), + FieldSchema(name="map", type="map", comment=None), + FieldSchema(name="struct", type="struct", comment=None), ], location=f"{hive_database.locationUri}/table-given-location", inputFormat="org.apache.hadoop.mapred.FileInputFormat", @@ -489,40 +489,40 @@ def test_create_table_with_given_location_removes_trailing_slash( last_column_id=22, schemas=[ Schema( - NestedField(field_id=1, name='boolean', field_type=BooleanType(), required=True), - NestedField(field_id=2, name='integer', field_type=IntegerType(), required=True), - NestedField(field_id=3, name='long', field_type=LongType(), required=True), - NestedField(field_id=4, name='float', field_type=FloatType(), required=True), - NestedField(field_id=5, name='double', field_type=DoubleType(), required=True), - NestedField(field_id=6, name='decimal', field_type=DecimalType(precision=32, scale=3), required=True), - NestedField(field_id=7, name='date', field_type=DateType(), required=True), - NestedField(field_id=8, name='time', field_type=TimeType(), required=True), - NestedField(field_id=9, name='timestamp', field_type=TimestampType(), required=True), - NestedField(field_id=10, name='timestamptz', field_type=TimestamptzType(), required=True), - NestedField(field_id=11, name='string', field_type=StringType(), required=True), - NestedField(field_id=12, name='uuid', field_type=UUIDType(), required=True), - NestedField(field_id=13, name='fixed', field_type=FixedType(length=12), required=True), - NestedField(field_id=14, name='binary', field_type=BinaryType(), required=True), + NestedField(field_id=1, name="boolean", field_type=BooleanType(), required=True), + NestedField(field_id=2, name="integer", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="long", field_type=LongType(), required=True), + NestedField(field_id=4, name="float", field_type=FloatType(), required=True), + NestedField(field_id=5, name="double", field_type=DoubleType(), required=True), + NestedField(field_id=6, name="decimal", field_type=DecimalType(precision=32, scale=3), required=True), + NestedField(field_id=7, name="date", field_type=DateType(), required=True), + NestedField(field_id=8, name="time", field_type=TimeType(), required=True), + NestedField(field_id=9, name="timestamp", field_type=TimestampType(), required=True), + NestedField(field_id=10, name="timestamptz", field_type=TimestamptzType(), required=True), + NestedField(field_id=11, name="string", field_type=StringType(), required=True), + NestedField(field_id=12, name="uuid", field_type=UUIDType(), required=True), + NestedField(field_id=13, name="fixed", field_type=FixedType(length=12), required=True), + NestedField(field_id=14, name="binary", field_type=BinaryType(), required=True), NestedField( field_id=15, - name='list', - field_type=ListType(type='list', element_id=18, element_type=StringType(), element_required=True), + name="list", + field_type=ListType(type="list", 
element_id=18, element_type=StringType(), element_required=True), required=True, ), NestedField( field_id=16, - name='map', + name="map", field_type=MapType( - type='map', key_id=19, key_type=StringType(), value_id=20, value_type=IntegerType(), value_required=True + type="map", key_id=19, key_type=StringType(), value_id=20, value_type=IntegerType(), value_required=True ), required=True, ), NestedField( field_id=17, - name='struct', + name="struct", field_type=StructType( - NestedField(field_id=21, name='inner_string', field_type=StringType(), required=False), - NestedField(field_id=22, name='inner_int', field_type=IntegerType(), required=True), + NestedField(field_id=21, name="inner_string", field_type=StringType(), required=False), + NestedField(field_id=22, name="inner_int", field_type=IntegerType(), required=True), ), required=False, ), @@ -532,7 +532,7 @@ def test_create_table_with_given_location_removes_trailing_slash( ], current_schema_id=0, last_partition_id=999, - properties={"owner": "javaberg", 'write.parquet.compression-codec': 'zstd'}, + properties={"owner": "javaberg", "write.parquet.compression-codec": "zstd"}, partition_specs=[PartitionSpec()], default_spec_id=0, current_snapshot_id=None, diff --git a/tests/catalog/test_sql.py b/tests/catalog/test_sql.py index 285cfd9ab9..6dc498233e 100644 --- a/tests/catalog/test_sql.py +++ b/tests/catalog/test_sql.py @@ -169,10 +169,10 @@ def test_creation_with_unsupported_uri(catalog_name: str) -> None: @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) def test_create_tables_idempotency(catalog: SqlCatalog) -> None: @@ -182,10 +182,10 @@ def test_create_tables_idempotency(catalog: SqlCatalog) -> None: @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize( @@ -207,10 +207,10 @@ def test_create_table_default_sort_order(catalog: SqlCatalog, table_schema_neste @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize( @@ -234,10 +234,10 @@ def test_create_v1_table(catalog: SqlCatalog, table_schema_nested: Schema, table @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize( @@ -263,10 +263,10 @@ def test_create_table_with_pyarrow_schema( @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize( @@ -288,10 +288,10 @@ def test_write_pyarrow_schema(catalog: SqlCatalog, table_identifier: Identifier) pa.array([None, "A", "B", "C"]), # 'large' column ], schema=pa.schema([ - pa.field('foo', pa.string(), nullable=True), - pa.field('bar', pa.int32(), nullable=False), - pa.field('baz', pa.bool_(), nullable=True), - pa.field('large', pa.large_string(), nullable=True), + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("large", pa.large_string(), nullable=True), ]), ) 
table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) @@ -302,10 +302,10 @@ def test_write_pyarrow_schema(catalog: SqlCatalog, table_identifier: Identifier) @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize( @@ -332,10 +332,10 @@ def test_create_table_custom_sort_order(catalog: SqlCatalog, table_schema_nested @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize( @@ -361,10 +361,10 @@ def test_create_table_with_default_warehouse_location( @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize( @@ -393,10 +393,10 @@ def test_create_table_with_given_location_removes_trailing_slash( @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize( @@ -417,10 +417,10 @@ def test_create_duplicated_table(catalog: SqlCatalog, table_schema_nested: Schem @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize( @@ -443,10 +443,10 @@ def test_create_table_if_not_exists_duplicated_table( @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) def test_create_table_with_non_existing_namespace(catalog: SqlCatalog, table_schema_nested: Schema, table_name: str) -> None: @@ -456,10 +456,10 @@ def test_create_table_with_non_existing_namespace(catalog: SqlCatalog, table_sch @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) def test_create_table_without_namespace(catalog: SqlCatalog, table_schema_nested: Schema, table_name: str) -> None: @@ -468,10 +468,10 @@ def test_create_table_without_namespace(catalog: SqlCatalog, table_schema_nested @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize( @@ -494,10 +494,10 @@ def test_register_table(catalog: SqlCatalog, table_identifier: Identifier, metad @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize( @@ -518,10 +518,10 @@ def test_register_existing_table(catalog: SqlCatalog, table_identifier: Identifi @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) def test_register_table_with_non_existing_namespace(catalog: SqlCatalog, metadata_location: str, table_name: str) -> None: @@ 
-531,10 +531,10 @@ def test_register_table_with_non_existing_namespace(catalog: SqlCatalog, metadat @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) def test_register_table_without_namespace(catalog: SqlCatalog, metadata_location: str, table_name: str) -> None: @@ -543,10 +543,10 @@ def test_register_table_without_namespace(catalog: SqlCatalog, metadata_location @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize( @@ -569,10 +569,10 @@ def test_load_table(catalog: SqlCatalog, table_schema_nested: Schema, table_iden @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize( @@ -597,11 +597,11 @@ def test_load_table_from_self_identifier(catalog: SqlCatalog, table_schema_neste @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) @pytest.mark.parametrize( @@ -624,11 +624,11 @@ def test_drop_table(catalog: SqlCatalog, table_schema_nested: Schema, table_iden @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) @pytest.mark.parametrize( @@ -653,11 +653,11 @@ def test_drop_table_from_self_identifier(catalog: SqlCatalog, table_schema_neste @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) @pytest.mark.parametrize( @@ -674,11 +674,11 @@ def test_drop_table_that_does_not_exist(catalog: SqlCatalog, table_identifier: I @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) @pytest.mark.parametrize( @@ -717,11 +717,11 @@ def test_rename_table( @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) @pytest.mark.parametrize( @@ -762,11 +762,11 @@ def test_rename_table_from_self_identifier( @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) @pytest.mark.parametrize( @@ -803,11 +803,11 @@ def 
test_rename_table_to_existing_one( @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) @pytest.mark.parametrize( @@ -835,11 +835,11 @@ def test_rename_missing_table(catalog: SqlCatalog, from_table_identifier: Identi @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) @pytest.mark.parametrize( @@ -871,10 +871,10 @@ def test_rename_table_to_missing_namespace( @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize( @@ -914,10 +914,10 @@ def test_list_tables( @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) @@ -927,10 +927,10 @@ def test_list_tables_when_missing_namespace(catalog: SqlCatalog, namespace: str) @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) def test_create_namespace_if_not_exists(catalog: SqlCatalog, database_name: str) -> None: @@ -941,10 +941,10 @@ def test_create_namespace_if_not_exists(catalog: SqlCatalog, database_name: str) @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) @@ -954,10 +954,10 @@ def test_create_namespace(catalog: SqlCatalog, namespace: str) -> None: @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) @@ -968,10 +968,10 @@ def test_create_duplicate_namespace(catalog: SqlCatalog, namespace: str) -> None @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) @@ -982,10 +982,10 @@ def test_create_namespaces_sharing_same_prefix(catalog: SqlCatalog, namespace: s @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) @@ -1004,10 +1004,10 @@ def test_create_namespace_with_comment_and_location(catalog: 
SqlCatalog, namespa @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) @@ -1021,10 +1021,10 @@ def test_create_namespace_with_null_properties(catalog: SqlCatalog, namespace: s @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize("empty_namespace", ["", (), (""), ("", ""), " ", (" ")]) @@ -1034,10 +1034,10 @@ def test_create_namespace_with_empty_identifier(catalog: SqlCatalog, empty_names @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize("namespace_list", [lazy_fixture("database_list"), lazy_fixture("hierarchical_namespace_list")]) @@ -1054,10 +1054,10 @@ def test_list_namespaces(catalog: SqlCatalog, namespace_list: List[str]) -> None @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) def test_list_non_existing_namespaces(catalog: SqlCatalog) -> None: @@ -1066,10 +1066,10 @@ def test_list_non_existing_namespaces(catalog: SqlCatalog) -> None: @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize( @@ -1094,10 +1094,10 @@ def test_drop_namespace(catalog: SqlCatalog, table_schema_nested: Schema, table_ @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) @@ -1119,10 +1119,10 @@ def test_load_namespace_properties(catalog: SqlCatalog, namespace: str) -> None: @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) @@ -1133,10 +1133,10 @@ def test_load_empty_namespace_properties(catalog: SqlCatalog, namespace: str) -> @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) def test_load_namespace_properties_non_existing_namespace(catalog: SqlCatalog) -> None: @@ -1145,10 +1145,10 @@ def test_load_namespace_properties_non_existing_namespace(catalog: SqlCatalog) - @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) @@ -1176,11 +1176,11 @@ def test_update_namespace_properties(catalog: SqlCatalog, namespace: str) -> 
Non @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) @pytest.mark.parametrize( @@ -1218,12 +1218,12 @@ def test_commit_table(catalog: SqlCatalog, table_schema_nested: Schema, table_id @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), - lazy_fixture('catalog_sqlite_fsspec'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), + lazy_fixture("catalog_sqlite_fsspec"), ], ) @pytest.mark.parametrize( @@ -1258,21 +1258,21 @@ def test_append_table(catalog: SqlCatalog, table_schema_simple: Schema, table_id assert table.metadata.snapshots[0].sequence_number == 1 assert table.metadata.snapshots[0].summary is not None assert table.metadata.snapshots[0].summary.operation == Operation.APPEND - assert table.metadata.snapshots[0].summary['added-data-files'] == '1' - assert table.metadata.snapshots[0].summary['added-records'] == '1' - assert table.metadata.snapshots[0].summary['total-data-files'] == '1' - assert table.metadata.snapshots[0].summary['total-records'] == '1' + assert table.metadata.snapshots[0].summary["added-data-files"] == "1" + assert table.metadata.snapshots[0].summary["added-records"] == "1" + assert table.metadata.snapshots[0].summary["total-data-files"] == "1" + assert table.metadata.snapshots[0].summary["total-records"] == "1" # read back the data assert df == table.scan().to_arrow() @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) @pytest.mark.parametrize( @@ -1300,11 +1300,11 @@ def test_concurrent_commit_table(catalog: SqlCatalog, table_schema_simple: Schem @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) @pytest.mark.parametrize("format_version", [1, 2]) @@ -1323,7 +1323,7 @@ def test_write_and_evolve(catalog: SqlCatalog, format_version: int) -> None: pa_table = pa.Table.from_pydict( { - 'foo': ['a', None, 'z'], + "foo": ["a", None, "z"], }, schema=pa.schema([pa.field("foo", pa.string(), nullable=True)]), ) @@ -1332,8 +1332,8 @@ def test_write_and_evolve(catalog: SqlCatalog, format_version: int) -> None: pa_table_with_column = pa.Table.from_pydict( { - 'foo': ['a', None, 'z'], - 'bar': [19, None, 25], + "foo": ["a", None, "z"], + "bar": [19, None, 25], }, schema=pa.schema([ pa.field("foo", pa.string(), nullable=True), @@ -1351,11 +1351,11 @@ def test_write_and_evolve(catalog: SqlCatalog, format_version: int) -> None: @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) @pytest.mark.parametrize( @@ -1377,11 +1377,11 @@ def 
test_table_properties_int_value(catalog: SqlCatalog, table_schema_simple: Sc @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) @pytest.mark.parametrize( @@ -1405,10 +1405,10 @@ def test_table_properties_raise_for_none_value( @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) @pytest.mark.parametrize( @@ -1429,4 +1429,4 @@ def test_table_exists(catalog: SqlCatalog, table_schema_simple: Schema, table_id assert catalog.table_exists(existing_table) is True # Act and Assert for a non-existing table - assert catalog.table_exists(('non', 'exist')) is False + assert catalog.table_exists(("non", "exist")) is False diff --git a/tests/conftest.py b/tests/conftest.py index 4baefafef4..01915b7d82 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -324,9 +324,9 @@ def pyarrow_schema_simple_without_ids() -> "pa.Schema": import pyarrow as pa return pa.schema([ - pa.field('foo', pa.string(), nullable=True), - pa.field('bar', pa.int32(), nullable=False), - pa.field('baz', pa.bool_(), nullable=True), + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), ]) @@ -335,12 +335,12 @@ def pyarrow_schema_nested_without_ids() -> "pa.Schema": import pyarrow as pa return pa.schema([ - pa.field('foo', pa.string(), nullable=False), - pa.field('bar', pa.int32(), nullable=False), - pa.field('baz', pa.bool_(), nullable=True), - pa.field('qux', pa.list_(pa.string()), nullable=False), + pa.field("foo", pa.string(), nullable=False), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("qux", pa.list_(pa.string()), nullable=False), pa.field( - 'quux', + "quux", pa.map_( pa.string(), pa.map_(pa.string(), pa.int32()), @@ -348,20 +348,20 @@ def pyarrow_schema_nested_without_ids() -> "pa.Schema": nullable=False, ), pa.field( - 'location', + "location", pa.list_( pa.struct([ - pa.field('latitude', pa.float32(), nullable=False), - pa.field('longitude', pa.float32(), nullable=False), + pa.field("latitude", pa.float32(), nullable=False), + pa.field("longitude", pa.float32(), nullable=False), ]), ), nullable=False, ), pa.field( - 'person', + "person", pa.struct([ - pa.field('name', pa.string(), nullable=True), - pa.field('age', pa.int32(), nullable=False), + pa.field("name", pa.string(), nullable=True), + pa.field("age", pa.int32(), nullable=False), ]), nullable=True, ), @@ -2081,31 +2081,31 @@ def spark() -> "SparkSession": TEST_DATA_WITH_NULL = { - 'bool': [False, None, True], - 'string': ['a', None, 'z'], + "bool": [False, None, True], + "string": ["a", None, "z"], # Go over the 16 bytes to kick in truncation - 'string_long': ['a' * 22, None, 'z' * 22], - 'int': [1, None, 9], - 'long': [1, None, 9], - 'float': [0.0, None, 0.9], - 'double': [0.0, None, 0.9], + "string_long": ["a" * 22, None, "z" * 22], + "int": [1, None, 9], + "long": [1, None, 9], + "float": [0.0, None, 0.9], + "double": [0.0, None, 0.9], # 'time': [1_000_000, None, 3_000_000], # Example times: 1s, none, and 3s past midnight #Spark does not support time fields - 'timestamp': [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 
25, 00)], - 'timestamptz': [ + "timestamp": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)], + "timestamptz": [ datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), None, datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), ], - 'date': [date(2023, 1, 1), None, date(2023, 3, 1)], + "date": [date(2023, 1, 1), None, date(2023, 3, 1)], # Not supported by Spark # 'time': [time(1, 22, 0), None, time(19, 25, 0)], # Not natively supported by Arrow # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes], - 'binary': [b'\01', None, b'\22'], - 'fixed': [ - uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, + "binary": [b"\01", None, b"\22"], + "fixed": [ + uuid.UUID("00000000-0000-0000-0000-000000000000").bytes, None, - uuid.UUID('11111111-1111-1111-1111-111111111111').bytes, + uuid.UUID("11111111-1111-1111-1111-111111111111").bytes, ], } diff --git a/tests/expressions/test_expressions.py b/tests/expressions/test_expressions.py index f277672d87..87856a04f6 100644 --- a/tests/expressions/test_expressions.py +++ b/tests/expressions/test_expressions.py @@ -1152,11 +1152,11 @@ def test_above_long_bounds_greater_than_or_equal( def test_eq_bound_expression(bound_reference_str: BoundReference[str]) -> None: - assert BoundEqualTo(term=bound_reference_str, literal=literal('a')) != BoundGreaterThanOrEqual( - term=bound_reference_str, literal=literal('a') + assert BoundEqualTo(term=bound_reference_str, literal=literal("a")) != BoundGreaterThanOrEqual( + term=bound_reference_str, literal=literal("a") ) - assert BoundEqualTo(term=bound_reference_str, literal=literal('a')) == BoundEqualTo( - term=bound_reference_str, literal=literal('a') + assert BoundEqualTo(term=bound_reference_str, literal=literal("a")) == BoundEqualTo( + term=bound_reference_str, literal=literal("a") ) diff --git a/tests/integration/test_add_files.py b/tests/integration/test_add_files.py index 94c73918c8..84729fcca4 100644 --- a/tests/integration/test_add_files.py +++ b/tests/integration/test_add_files.py @@ -65,10 +65,10 @@ ) ARROW_SCHEMA_WITH_IDS = pa.schema([ - pa.field('foo', pa.bool_(), nullable=False, metadata={"PARQUET:field_id": "1"}), - pa.field('bar', pa.string(), nullable=False, metadata={"PARQUET:field_id": "2"}), - pa.field('baz', pa.int32(), nullable=False, metadata={"PARQUET:field_id": "3"}), - pa.field('qux', pa.date32(), nullable=False, metadata={"PARQUET:field_id": "4"}), + pa.field("foo", pa.bool_(), nullable=False, metadata={"PARQUET:field_id": "1"}), + pa.field("bar", pa.string(), nullable=False, metadata={"PARQUET:field_id": "2"}), + pa.field("baz", pa.int32(), nullable=False, metadata={"PARQUET:field_id": "3"}), + pa.field("qux", pa.date32(), nullable=False, metadata={"PARQUET:field_id": "4"}), ]) diff --git a/tests/integration/test_inspect_table.py b/tests/integration/test_inspect_table.py index 8665435e43..1f2b9a3ead 100644 --- a/tests/integration/test_inspect_table.py +++ b/tests/integration/test_inspect_table.py @@ -88,45 +88,45 @@ def test_inspect_snapshots( df = tbl.inspect.snapshots() assert df.column_names == [ - 'committed_at', - 'snapshot_id', - 'parent_id', - 'operation', - 'manifest_list', - 'summary', + "committed_at", + "snapshot_id", + "parent_id", + "operation", + "manifest_list", + "summary", ] - for committed_at in df['committed_at']: + for committed_at in df["committed_at"]: assert isinstance(committed_at.as_py(), datetime) - for snapshot_id in df['snapshot_id']: + for snapshot_id in 
df["snapshot_id"]: assert isinstance(snapshot_id.as_py(), int) - assert df['parent_id'][0].as_py() is None - assert df['parent_id'][1:] == df['snapshot_id'][:2] + assert df["parent_id"][0].as_py() is None + assert df["parent_id"][1:] == df["snapshot_id"][:2] - assert [operation.as_py() for operation in df['operation']] == ['append', 'overwrite', 'append'] + assert [operation.as_py() for operation in df["operation"]] == ["append", "overwrite", "append"] - for manifest_list in df['manifest_list']: + for manifest_list in df["manifest_list"]: assert manifest_list.as_py().startswith("s3://") - assert df['summary'][0].as_py() == [ - ('added-files-size', '5459'), - ('added-data-files', '1'), - ('added-records', '3'), - ('total-data-files', '1'), - ('total-delete-files', '0'), - ('total-records', '3'), - ('total-files-size', '5459'), - ('total-position-deletes', '0'), - ('total-equality-deletes', '0'), + assert df["summary"][0].as_py() == [ + ("added-files-size", "5459"), + ("added-data-files", "1"), + ("added-records", "3"), + ("total-data-files", "1"), + ("total-delete-files", "0"), + ("total-records", "3"), + ("total-files-size", "5459"), + ("total-position-deletes", "0"), + ("total-equality-deletes", "0"), ] lhs = spark.table(f"{identifier}.snapshots").toPandas() rhs = df.to_pandas() for column in df.column_names: for left, right in zip(lhs[column].to_list(), rhs[column].to_list()): - if column == 'summary': + if column == "summary": # Arrow returns a list of tuples, instead of a dict right = dict(right) @@ -150,29 +150,29 @@ def test_inspect_entries( def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> None: assert df.column_names == [ - 'status', - 'snapshot_id', - 'sequence_number', - 'file_sequence_number', - 'data_file', - 'readable_metrics', + "status", + "snapshot_id", + "sequence_number", + "file_sequence_number", + "data_file", + "readable_metrics", ] # Make sure that they are filled properly - for int_column in ['status', 'snapshot_id', 'sequence_number', 'file_sequence_number']: + for int_column in ["status", "snapshot_id", "sequence_number", "file_sequence_number"]: for value in df[int_column]: assert isinstance(value.as_py(), int) - for snapshot_id in df['snapshot_id']: + for snapshot_id in df["snapshot_id"]: assert isinstance(snapshot_id.as_py(), int) lhs = df.to_pandas() rhs = spark_df.toPandas() for column in df.column_names: for left, right in zip(lhs[column].to_list(), rhs[column].to_list()): - if column == 'data_file': + if column == "data_file": for df_column in left.keys(): - if df_column == 'partition': + if df_column == "partition": # Spark leaves out the partition if the table is unpartitioned continue @@ -183,20 +183,20 @@ def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> Non df_lhs = dict(df_lhs) assert df_lhs == df_rhs, f"Difference in data_file column {df_column}: {df_lhs} != {df_rhs}" - elif column == 'readable_metrics': + elif column == "readable_metrics": assert list(left.keys()) == [ - 'bool', - 'string', - 'string_long', - 'int', - 'long', - 'float', - 'double', - 'timestamp', - 'timestamptz', - 'date', - 'binary', - 'fixed', + "bool", + "string", + "string_long", + "int", + "long", + "float", + "double", + "timestamp", + "timestamptz", + "date", + "binary", + "fixed", ] assert left.keys() == right.keys() @@ -205,18 +205,18 @@ def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> Non rm_lhs = left[rm_column] rm_rhs = right[rm_column] - assert rm_lhs['column_size'] == rm_rhs['column_size'] 
- assert rm_lhs['value_count'] == rm_rhs['value_count'] - assert rm_lhs['null_value_count'] == rm_rhs['null_value_count'] - assert rm_lhs['nan_value_count'] == rm_rhs['nan_value_count'] + assert rm_lhs["column_size"] == rm_rhs["column_size"] + assert rm_lhs["value_count"] == rm_rhs["value_count"] + assert rm_lhs["null_value_count"] == rm_rhs["null_value_count"] + assert rm_lhs["nan_value_count"] == rm_rhs["nan_value_count"] - if rm_column == 'timestamptz': + if rm_column == "timestamptz": # PySpark does not correctly set the timestamptz - rm_rhs['lower_bound'] = rm_rhs['lower_bound'].replace(tzinfo=pytz.utc) - rm_rhs['upper_bound'] = rm_rhs['upper_bound'].replace(tzinfo=pytz.utc) + rm_rhs["lower_bound"] = rm_rhs["lower_bound"].replace(tzinfo=pytz.utc) + rm_rhs["upper_bound"] = rm_rhs["upper_bound"].replace(tzinfo=pytz.utc) - assert rm_lhs['lower_bound'] == rm_rhs['lower_bound'] - assert rm_lhs['upper_bound'] == rm_rhs['upper_bound'] + assert rm_lhs["lower_bound"] == rm_rhs["lower_bound"] + assert rm_lhs["upper_bound"] == rm_rhs["upper_bound"] else: assert left == right, f"Difference in column {column}: {left} != {right}" @@ -265,8 +265,8 @@ def test_inspect_entries_partitioned(spark: SparkSession, session_catalog: Catal df = session_catalog.load_table(identifier).inspect.entries() - assert df.to_pydict()['data_file'][0]['partition'] == {'dt_day': date(2021, 2, 1), 'dt_month': None} - assert df.to_pydict()['data_file'][1]['partition'] == {'dt_day': None, 'dt_month': 612} + assert df.to_pydict()["data_file"][0]["partition"] == {"dt_day": date(2021, 2, 1), "dt_month": None} + assert df.to_pydict()["data_file"][1]["partition"] == {"dt_day": None, "dt_month": 612} @pytest.mark.integration @@ -301,21 +301,21 @@ def test_inspect_refs( df = tbl.refresh().inspect.refs() assert df.column_names == [ - 'name', - 'type', - 'snapshot_id', - 'max_reference_age_in_ms', - 'min_snapshots_to_keep', - 'max_snapshot_age_in_ms', + "name", + "type", + "snapshot_id", + "max_reference_age_in_ms", + "min_snapshots_to_keep", + "max_snapshot_age_in_ms", ] - assert [name.as_py() for name in df['name']] == ['testBranch', 'main', 'testTag'] - assert [ref_type.as_py() for ref_type in df['type']] == ['BRANCH', 'BRANCH', 'TAG'] + assert [name.as_py() for name in df["name"]] == ["testBranch", "main", "testTag"] + assert [ref_type.as_py() for ref_type in df["type"]] == ["BRANCH", "BRANCH", "TAG"] - for snapshot_id in df['snapshot_id']: + for snapshot_id in df["snapshot_id"]: assert isinstance(snapshot_id.as_py(), int) - for int_column in ['max_reference_age_in_ms', 'min_snapshots_to_keep', 'max_snapshot_age_in_ms']: + for int_column in ["max_reference_age_in_ms", "min_snapshots_to_keep", "max_snapshot_age_in_ms"]: for value in df[int_column]: assert isinstance(value.as_py(), int) or not value.as_py() @@ -343,28 +343,28 @@ def test_inspect_partitions_unpartitioned( df = tbl.inspect.partitions() assert df.column_names == [ - 'record_count', - 'file_count', - 'total_data_file_size_in_bytes', - 'position_delete_record_count', - 'position_delete_file_count', - 'equality_delete_record_count', - 'equality_delete_file_count', - 'last_updated_at', - 'last_updated_snapshot_id', + "record_count", + "file_count", + "total_data_file_size_in_bytes", + "position_delete_record_count", + "position_delete_file_count", + "equality_delete_record_count", + "equality_delete_file_count", + "last_updated_at", + "last_updated_snapshot_id", ] - for last_updated_at in df['last_updated_at']: + for last_updated_at in df["last_updated_at"]: assert
isinstance(last_updated_at.as_py(), datetime) int_cols = [ - 'record_count', - 'file_count', - 'total_data_file_size_in_bytes', - 'position_delete_record_count', - 'position_delete_file_count', - 'equality_delete_record_count', - 'equality_delete_file_count', - 'last_updated_snapshot_id', + "record_count", + "file_count", + "total_data_file_size_in_bytes", + "position_delete_record_count", + "position_delete_file_count", + "equality_delete_record_count", + "equality_delete_file_count", + "last_updated_snapshot_id", ] for column in int_cols: for value in df[column]: @@ -434,8 +434,8 @@ def test_inspect_partitions_partitioned(spark: SparkSession, session_catalog: Ca ) def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> None: - lhs = df.to_pandas().sort_values('spec_id') - rhs = spark_df.toPandas().sort_values('spec_id') + lhs = df.to_pandas().sort_values("spec_id") + rhs = spark_df.toPandas().sort_values("spec_id") for column in df.column_names: for left, right in zip(lhs[column].to_list(), rhs[column].to_list()): assert left == right, f"Difference in column {column}: {left} != {right}" @@ -481,31 +481,31 @@ def test_inspect_manifests(spark: SparkSession, session_catalog: Catalog, format df = session_catalog.load_table(identifier).inspect.manifests() assert df.column_names == [ - 'content', - 'path', - 'length', - 'partition_spec_id', - 'added_snapshot_id', - 'added_data_files_count', - 'existing_data_files_count', - 'deleted_data_files_count', - 'added_delete_files_count', - 'existing_delete_files_count', - 'deleted_delete_files_count', - 'partition_summaries', + "content", + "path", + "length", + "partition_spec_id", + "added_snapshot_id", + "added_data_files_count", + "existing_data_files_count", + "deleted_data_files_count", + "added_delete_files_count", + "existing_delete_files_count", + "deleted_delete_files_count", + "partition_summaries", ] int_cols = [ - 'content', - 'length', - 'partition_spec_id', - 'added_snapshot_id', - 'added_data_files_count', - 'existing_data_files_count', - 'deleted_data_files_count', - 'added_delete_files_count', - 'existing_delete_files_count', - 'deleted_delete_files_count', + "content", + "length", + "partition_spec_id", + "added_snapshot_id", + "added_data_files_count", + "existing_data_files_count", + "deleted_data_files_count", + "added_delete_files_count", + "existing_delete_files_count", + "deleted_delete_files_count", ] for column in int_cols: diff --git a/tests/integration/test_partition_evolution.py b/tests/integration/test_partition_evolution.py index 785b34b82c..5cc7512f4a 100644 --- a/tests/integration/test_partition_evolution.py +++ b/tests/integration/test_partition_evolution.py @@ -73,7 +73,7 @@ def _create_table_with_schema(catalog: Catalog, schema: Schema, format_version: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_identity_partition(catalog: Catalog, table_schema_simple: Schema) -> None: simple_table = _simple_table(catalog, table_schema_simple) simple_table.update_spec().add_identity("foo").commit() @@ -85,7 +85,7 @@ def test_add_identity_partition(catalog: Catalog, table_schema_simple: Schema) - @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) 
+@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_year(catalog: Catalog) -> None: table = _table(catalog) table.update_spec().add_field("event_ts", YearTransform(), "year_transform").commit() @@ -93,7 +93,7 @@ def test_add_year(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_month(catalog: Catalog) -> None: table = _table(catalog) table.update_spec().add_field("event_ts", MonthTransform(), "month_transform").commit() @@ -101,7 +101,7 @@ def test_add_month(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_day(catalog: Catalog) -> None: table = _table(catalog) table.update_spec().add_field("event_ts", DayTransform(), "day_transform").commit() @@ -109,7 +109,7 @@ def test_add_day(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_hour(catalog: Catalog) -> None: table = _table(catalog) table.update_spec().add_field("event_ts", HourTransform(), "hour_transform").commit() @@ -117,7 +117,7 @@ def test_add_hour(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_bucket(catalog: Catalog, table_schema_simple: Schema) -> None: simple_table = _create_table_with_schema(catalog, table_schema_simple, "1") simple_table.update_spec().add_field("foo", BucketTransform(12), "bucket_transform").commit() @@ -125,7 +125,7 @@ def test_add_bucket(catalog: Catalog, table_schema_simple: Schema) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_truncate(catalog: Catalog, table_schema_simple: Schema) -> None: simple_table = _create_table_with_schema(catalog, table_schema_simple, "1") simple_table.update_spec().add_field("foo", TruncateTransform(1), "truncate_transform").commit() @@ -135,7 +135,7 @@ def test_add_truncate(catalog: Catalog, table_schema_simple: Schema) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_multiple_adds(catalog: Catalog) -> None: table = _table(catalog) table.update_spec().add_identity("id").add_field("event_ts", HourTransform(), "hourly_partitioned").add_field( @@ -153,7 +153,7 @@ def 
test_multiple_adds(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_hour_to_day(catalog: Catalog) -> None: table = _table(catalog) table.update_spec().add_field("event_ts", DayTransform(), "daily_partitioned").commit() @@ -169,7 +169,7 @@ def test_add_hour_to_day(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_multiple_buckets(catalog: Catalog) -> None: table = _table(catalog) table.update_spec().add_field("id", BucketTransform(16)).add_field("id", BucketTransform(4)).commit() @@ -184,7 +184,7 @@ def test_add_multiple_buckets(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_remove_identity(catalog: Catalog) -> None: table = _table(catalog) table.update_spec().add_identity("id").commit() @@ -192,12 +192,12 @@ def test_remove_identity(catalog: Catalog) -> None: assert len(table.specs()) == 3 assert table.spec().spec_id == 2 assert table.spec() == PartitionSpec( - PartitionField(source_id=1, field_id=1000, transform=VoidTransform(), name='id'), spec_id=2 + PartitionField(source_id=1, field_id=1000, transform=VoidTransform(), name="id"), spec_id=2 ) @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_remove_identity_v2(catalog: Catalog) -> None: table_v2 = _table_v2(catalog) table_v2.update_spec().add_identity("id").commit() @@ -208,7 +208,7 @@ def test_remove_identity_v2(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_remove_bucket(catalog: Catalog) -> None: table = _table(catalog) with table.update_spec() as update: @@ -223,13 +223,13 @@ def test_remove_bucket(catalog: Catalog) -> None: 1001, 2, 1001, - PartitionField(source_id=1, field_id=1000, transform=VoidTransform(), name='bucketed_id'), - PartitionField(source_id=2, field_id=1001, transform=DayTransform(), name='day_ts'), + PartitionField(source_id=1, field_id=1000, transform=VoidTransform(), name="bucketed_id"), + PartitionField(source_id=2, field_id=1001, transform=DayTransform(), name="day_ts"), ) @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_remove_bucket_v2(catalog: Catalog) -> None: table_v2 = _table_v2(catalog) with table_v2.update_spec() as 
update: @@ -239,12 +239,12 @@ def test_remove_bucket_v2(catalog: Catalog) -> None: remove.remove_field("bucketed_id") assert len(table_v2.specs()) == 3 _validate_new_partition_fields( - table_v2, 1001, 2, 1001, PartitionField(source_id=2, field_id=1001, transform=DayTransform(), name='day_ts') + table_v2, 1001, 2, 1001, PartitionField(source_id=2, field_id=1001, transform=DayTransform(), name="day_ts") ) @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_remove_day(catalog: Catalog) -> None: table = _table(catalog) with table.update_spec() as update: @@ -259,13 +259,13 @@ def test_remove_day(catalog: Catalog) -> None: 1001, 2, 1001, - PartitionField(source_id=1, field_id=1000, transform=BucketTransform(16), name='bucketed_id'), - PartitionField(source_id=2, field_id=1001, transform=VoidTransform(), name='day_ts'), + PartitionField(source_id=1, field_id=1000, transform=BucketTransform(16), name="bucketed_id"), + PartitionField(source_id=2, field_id=1001, transform=VoidTransform(), name="day_ts"), ) @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_remove_day_v2(catalog: Catalog) -> None: table_v2 = _table_v2(catalog) with table_v2.update_spec() as update: @@ -275,12 +275,12 @@ def test_remove_day_v2(catalog: Catalog) -> None: remove.remove_field("day_ts") assert len(table_v2.specs()) == 3 _validate_new_partition_fields( - table_v2, 1000, 2, 1001, PartitionField(source_id=1, field_id=1000, transform=BucketTransform(16), name='bucketed_id') + table_v2, 1000, 2, 1001, PartitionField(source_id=1, field_id=1000, transform=BucketTransform(16), name="bucketed_id") ) @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_rename(catalog: Catalog) -> None: table = _table(catalog) table.update_spec().add_identity("id").commit() @@ -291,7 +291,7 @@ def test_rename(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_cannot_add_and_remove(catalog: Catalog) -> None: table = _table(catalog) with pytest.raises(ValueError) as exc_info: @@ -300,7 +300,7 @@ def test_cannot_add_and_remove(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_cannot_add_redundant_time_partition(catalog: Catalog) -> None: table = _table(catalog) with pytest.raises(ValueError) as exc_info: @@ -311,7 +311,7 @@ def test_cannot_add_redundant_time_partition(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', 
[pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_cannot_delete_and_rename(catalog: Catalog) -> None: table = _table(catalog) with pytest.raises(ValueError) as exc_info: @@ -321,7 +321,7 @@ def test_cannot_delete_and_rename(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_cannot_rename_and_delete(catalog: Catalog) -> None: table = _table(catalog) with pytest.raises(ValueError) as exc_info: @@ -331,7 +331,7 @@ def test_cannot_rename_and_delete(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_cannot_add_same_tranform_for_same_field(catalog: Catalog) -> None: table = _table(catalog) with pytest.raises(ValueError) as exc_info: @@ -342,7 +342,7 @@ def test_cannot_add_same_tranform_for_same_field(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_cannot_add_same_field_multiple_times(catalog: Catalog) -> None: table = _table(catalog) with pytest.raises(ValueError) as exc_info: @@ -353,7 +353,7 @@ def test_cannot_add_same_field_multiple_times(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_cannot_add_multiple_specs_same_name(catalog: Catalog) -> None: table = _table(catalog) with pytest.raises(ValueError) as exc_info: @@ -364,7 +364,7 @@ def test_cannot_add_multiple_specs_same_name(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_change_specs_and_schema_transaction(catalog: Catalog) -> None: table = _table(catalog) with table.transaction() as transaction: @@ -387,17 +387,17 @@ def test_change_specs_and_schema_transaction(catalog: Catalog) -> None: ) assert table.schema() == Schema( - NestedField(field_id=1, name='id', field_type=LongType(), required=False), - NestedField(field_id=2, name='event_ts', field_type=TimestampType(), required=False), - NestedField(field_id=3, name='str', field_type=StringType(), required=False), - NestedField(field_id=4, name='col_string', field_type=StringType(), required=False), + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="event_ts", field_type=TimestampType(), required=False), + NestedField(field_id=3, name="str", field_type=StringType(), required=False), + 
NestedField(field_id=4, name="col_string", field_type=StringType(), required=False), identifier_field_ids=[], ) assert table.schema().schema_id == 1 @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_multiple_adds_and_remove_v1(catalog: Catalog) -> None: table = _table(catalog) with table.update_spec() as update: @@ -419,7 +419,7 @@ def test_multiple_adds_and_remove_v1(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_multiple_adds_and_remove_v2(catalog: Catalog) -> None: table_v2 = _table_v2(catalog) with table_v2.update_spec() as update: @@ -433,7 +433,7 @@ def test_multiple_adds_and_remove_v2(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_multiple_remove_and_add_reuses_v2(catalog: Catalog) -> None: table_v2 = _table_v2(catalog) with table_v2.update_spec() as update: diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index d89ecaf202..29f664909c 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -328,8 +328,8 @@ ), ( [PartitionField(source_id=11, field_id=1001, transform=IdentityTransform(), name="binary_field")], - [b'example'], - Record(binary_field=b'example'), + [b"example"], + Record(binary_field=b"example"), "binary_field=ZXhhbXBsZQ%3D%3D", f"""CREATE TABLE {identifier} ( binary_field binary, @@ -347,8 +347,8 @@ ), ( [PartitionField(source_id=13, field_id=1001, transform=IdentityTransform(), name="decimal_field")], - [Decimal('123.45')], - Record(decimal_field=Decimal('123.45')), + [Decimal("123.45")], + Record(decimal_field=Decimal("123.45")), "decimal_field=123.45", f"""CREATE TABLE {identifier} ( decimal_field decimal(5,2), @@ -638,8 +638,8 @@ ), ( [PartitionField(source_id=13, field_id=1001, transform=TruncateTransform(width=5), name="decimal_field_trunc")], - [Decimal('678.93')], - Record(decimal_field_trunc=Decimal('678.90')), + [Decimal("678.93")], + Record(decimal_field_trunc=Decimal("678.90")), "decimal_field_trunc=678.90", # Assuming truncation width of 1 leads to truncating to 670 f"""CREATE TABLE {identifier} ( decimal_field decimal(5,2), @@ -657,8 +657,8 @@ ), ( [PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(10), name="binary_field_trunc")], - [b'HELLOICEBERG'], - Record(binary_field_trunc=b'HELLOICEBE'), + [b"HELLOICEBERG"], + Record(binary_field_trunc=b"HELLOICEBE"), "binary_field_trunc=SEVMTE9JQ0VCRQ%3D%3D", f"""CREATE TABLE {identifier} ( binary_field binary, diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index 2a10e37ba9..80a6f18632 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -51,7 +51,7 @@ ) from pyiceberg.utils.concurrent import ExecutorFactory -DEFAULT_PROPERTIES = {'write.parquet.compression-codec': 'zstd'} 
+DEFAULT_PROPERTIES = {"write.parquet.compression-codec": "zstd"} TABLE_NAME = ("default", "t1") @@ -74,7 +74,7 @@ def create_table(catalog: Catalog) -> Table: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_table_properties(catalog: Catalog) -> None: table = create_table(catalog) @@ -104,7 +104,7 @@ def test_table_properties(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_table_properties_dict(catalog: Catalog) -> None: table = create_table(catalog) @@ -134,7 +134,7 @@ def test_table_properties_dict(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_table_properties_error(catalog: Catalog) -> None: table = create_table(catalog) properties = {"abc": "def"} @@ -144,7 +144,7 @@ def test_table_properties_error(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_pyarrow_nan(catalog: Catalog) -> None: table_test_null_nan = catalog.load_table("default.test_null_nan") arrow_table = table_test_null_nan.scan(row_filter=IsNaN("col_numeric"), selected_fields=("idx", "col_numeric")).to_arrow() @@ -154,7 +154,7 @@ def test_pyarrow_nan(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_pyarrow_nan_rewritten(catalog: Catalog) -> None: table_test_null_nan_rewritten = catalog.load_table("default.test_null_nan_rewritten") arrow_table = table_test_null_nan_rewritten.scan( @@ -166,7 +166,7 @@ def test_pyarrow_nan_rewritten(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) @pytest.mark.skip(reason="Fixing issues with NaN's: https://github.com/apache/arrow/issues/34162") def test_pyarrow_not_nan_count(catalog: Catalog) -> None: table_test_null_nan = catalog.load_table("default.test_null_nan") @@ -175,7 +175,7 @@ def test_pyarrow_not_nan_count(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_duckdb_nan(catalog: Catalog) -> None: table_test_null_nan_rewritten = 
catalog.load_table("default.test_null_nan_rewritten") con = table_test_null_nan_rewritten.scan().to_duckdb("table_test_null_nan") @@ -185,7 +185,7 @@ def test_duckdb_nan(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_pyarrow_limit(catalog: Catalog) -> None: table_test_limit = catalog.load_table("default.test_limit") limited_result = table_test_limit.scan(selected_fields=("idx",), limit=1).to_arrow() @@ -200,7 +200,7 @@ def test_pyarrow_limit(catalog: Catalog) -> None: @pytest.mark.integration @pytest.mark.filterwarnings("ignore") -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_daft_nan(catalog: Catalog) -> None: table_test_null_nan_rewritten = catalog.load_table("default.test_null_nan_rewritten") df = table_test_null_nan_rewritten.to_daft() @@ -209,7 +209,7 @@ def test_daft_nan(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_daft_nan_rewritten(catalog: Catalog) -> None: table_test_null_nan_rewritten = catalog.load_table("default.test_null_nan_rewritten") df = table_test_null_nan_rewritten.to_daft() @@ -222,7 +222,7 @@ def test_daft_nan_rewritten(catalog: Catalog) -> None: @pytest.mark.integration @pytest.mark.filterwarnings("ignore") -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_ray_nan(catalog: Catalog) -> None: table_test_null_nan_rewritten = catalog.load_table("default.test_null_nan_rewritten") ray_dataset = table_test_null_nan_rewritten.scan().to_ray() @@ -231,7 +231,7 @@ def test_ray_nan(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_ray_nan_rewritten(catalog: Catalog) -> None: table_test_null_nan_rewritten = catalog.load_table("default.test_null_nan_rewritten") ray_dataset = table_test_null_nan_rewritten.scan( @@ -243,7 +243,7 @@ def test_ray_nan_rewritten(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) @pytest.mark.skip(reason="Fixing issues with NaN's: https://github.com/apache/arrow/issues/34162") def test_ray_not_nan_count(catalog: Catalog) -> None: table_test_null_nan_rewritten = catalog.load_table("default.test_null_nan_rewritten") @@ -252,7 +252,7 @@ def test_ray_not_nan_count(catalog: Catalog) -> None: @pytest.mark.integration 
-@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_ray_all_types(catalog: Catalog) -> None: table_test_all_types = catalog.load_table("default.test_all_types") ray_dataset = table_test_all_types.scan().to_ray() @@ -262,7 +262,7 @@ def test_ray_all_types(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_pyarrow_to_iceberg_all_types(catalog: Catalog) -> None: table_test_all_types = catalog.load_table("default.test_all_types") fs = S3FileSystem( @@ -281,7 +281,7 @@ def test_pyarrow_to_iceberg_all_types(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_pyarrow_deletes(catalog: Catalog) -> None: # number, letter # (1, 'a'), @@ -318,7 +318,7 @@ def test_pyarrow_deletes(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_pyarrow_deletes_double(catalog: Catalog) -> None: # number, letter # (1, 'a'), @@ -355,7 +355,7 @@ def test_pyarrow_deletes_double(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_partitioned_tables(catalog: Catalog) -> None: for table_name, predicate in [ ("test_partitioned_by_identity", "ts >= '2023-03-05T00:00:00+00:00'"), @@ -372,7 +372,7 @@ def test_partitioned_tables(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_unpartitioned_uuid_table(catalog: Catalog) -> None: unpartitioned_uuid = catalog.load_table("default.test_uuid_and_fixed_unpartitioned") arrow_table_eq = unpartitioned_uuid.scan(row_filter="uuid_col == '102cb62f-e6f8-4eb0-9973-d9b012ff0967'").to_arrow() @@ -389,7 +389,7 @@ def test_unpartitioned_uuid_table(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_unpartitioned_fixed_table(catalog: Catalog) -> None: fixed_table = catalog.load_table("default.test_uuid_and_fixed_unpartitioned") arrow_table_eq = fixed_table.scan(row_filter=EqualTo("fixed_col", b"1234567890123456789012345")).to_arrow() @@ -408,7 +408,7 @@ def 
test_unpartitioned_fixed_table(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_scan_tag(catalog: Catalog) -> None: test_positional_mor_deletes = catalog.load_table("default.test_positional_mor_deletes") arrow_table = test_positional_mor_deletes.scan().use_ref("tag_12").to_arrow() @@ -416,7 +416,7 @@ def test_scan_tag(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_scan_branch(catalog: Catalog) -> None: test_positional_mor_deletes = catalog.load_table("default.test_positional_mor_deletes") arrow_table = test_positional_mor_deletes.scan().use_ref("without_5").to_arrow() @@ -424,21 +424,21 @@ def test_scan_branch(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_filter_on_new_column(catalog: Catalog) -> None: test_table_add_column = catalog.load_table("default.test_table_add_column") arrow_table = test_table_add_column.scan(row_filter="b == '2'").to_arrow() - assert arrow_table["b"].to_pylist() == ['2'] + assert arrow_table["b"].to_pylist() == ["2"] arrow_table = test_table_add_column.scan(row_filter="b is not null").to_arrow() - assert arrow_table["b"].to_pylist() == ['2'] + assert arrow_table["b"].to_pylist() == ["2"] arrow_table = test_table_add_column.scan(row_filter="b is null").to_arrow() assert arrow_table["b"].to_pylist() == [None] @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_upgrade_table_version(catalog: Catalog) -> None: table_test_table_version = catalog.load_table("default.test_table_version") @@ -466,7 +466,7 @@ def test_upgrade_table_version(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_sanitize_character(catalog: Catalog) -> None: table_test_table_sanitized_character = catalog.load_table("default.test_table_sanitized_character") arrow_table = table_test_table_sanitized_character.scan().to_arrow() @@ -476,7 +476,7 @@ def test_sanitize_character(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_null_list_and_map(catalog: Catalog) -> None: table_test_empty_list_and_map = catalog.load_table("default.test_table_empty_list_and_map") arrow_table = 
table_test_empty_list_and_map.scan().to_arrow() @@ -485,7 +485,7 @@ def test_null_list_and_map(catalog: Catalog) -> None: # This should be: # assert arrow_table["col_list_with_struct"].to_pylist() == [None, [{'test': 1}]] # Once https://github.com/apache/arrow/issues/38809 has been fixed - assert arrow_table["col_list_with_struct"].to_pylist() == [[], [{'test': 1}]] + assert arrow_table["col_list_with_struct"].to_pylist() == [[], [{"test": 1}]] @pytest.mark.integration diff --git a/tests/integration/test_rest_manifest.py b/tests/integration/test_rest_manifest.py index 0e768c6e68..82c41cfd93 100644 --- a/tests/integration/test_rest_manifest.py +++ b/tests/integration/test_rest_manifest.py @@ -104,7 +104,7 @@ def test_write_sample_manifest(table_test_all_types: Table) -> None: wrapped_entry_v2.data_file = wrapped_data_file_v2_debug wrapped_entry_v2_dict = todict(wrapped_entry_v2) # This one should not be written - del wrapped_entry_v2_dict['data_file']['spec_id'] + del wrapped_entry_v2_dict["data_file"]["spec_id"] with TemporaryDirectory() as tmpdir: tmp_avro_file = tmpdir + "/test_write_manifest.avro" diff --git a/tests/integration/test_rest_schema.py b/tests/integration/test_rest_schema.py index ac5d1ce050..f4ab98a883 100644 --- a/tests/integration/test_rest_schema.py +++ b/tests/integration/test_rest_schema.py @@ -358,16 +358,16 @@ def test_revert_changes(simple_table: Table, table_schema_simple: Schema) -> Non assert simple_table.schemas() == { 0: Schema( - NestedField(field_id=1, name='foo', field_type=StringType(), required=False), - NestedField(field_id=2, name='bar', field_type=IntegerType(), required=True), - NestedField(field_id=3, name='baz', field_type=BooleanType(), required=False), + NestedField(field_id=1, name="foo", field_type=StringType(), required=False), + NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False), identifier_field_ids=[2], ), 1: Schema( - NestedField(field_id=1, name='foo', field_type=StringType(), required=False), - NestedField(field_id=2, name='bar', field_type=IntegerType(), required=True), - NestedField(field_id=3, name='baz', field_type=BooleanType(), required=False), - NestedField(field_id=4, name='data', field_type=IntegerType(), required=False), + NestedField(field_id=1, name="foo", field_type=StringType(), required=False), + NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False), + NestedField(field_id=4, name="data", field_type=IntegerType(), required=False), identifier_field_ids=[2], ), } @@ -685,9 +685,9 @@ def test_rename_simple(simple_table: Table) -> None: # Check that the name mapping gets updated assert simple_table.name_mapping() == NameMapping([ - MappedField(field_id=1, names=['foo', 'vo']), - MappedField(field_id=2, names=['bar', 'var']), - MappedField(field_id=3, names=['baz']), + MappedField(field_id=1, names=["foo", "vo"]), + MappedField(field_id=2, names=["bar", "var"]), + MappedField(field_id=3, names=["baz"]), ]) @@ -719,7 +719,7 @@ def test_rename_simple_nested(catalog: Catalog) -> None: # Check that the name mapping gets updated assert tbl.name_mapping() == NameMapping([ - MappedField(field_id=1, names=['foo'], fields=[MappedField(field_id=2, names=['bar', 'vo'])]), + MappedField(field_id=1, names=["foo"], fields=[MappedField(field_id=2, names=["bar", "vo"])]), ]) diff --git 
a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index d84b9745a7..5cb03e59d8 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -38,7 +38,7 @@ @pytest.mark.integration @pytest.mark.parametrize( - "part_col", ['int', 'bool', 'string', "string_long", "long", "float", "double", "date", 'timestamp', 'timestamptz', 'binary'] + "part_col", ["int", "bool", "string", "string_long", "long", "float", "double", "date", "timestamp", "timestamptz", "binary"] ) @pytest.mark.parametrize("format_version", [1, 2]) def test_query_filter_null_partitioned( @@ -71,7 +71,7 @@ def test_query_filter_null_partitioned( @pytest.mark.integration @pytest.mark.parametrize( - "part_col", ['int', 'bool', 'string', "string_long", "long", "float", "double", "date", 'timestamp', 'timestamptz', 'binary'] + "part_col", ["int", "bool", "string", "string_long", "long", "float", "double", "date", "timestamp", "timestamptz", "binary"] ) @pytest.mark.parametrize("format_version", [1, 2]) def test_query_filter_without_data_partitioned( @@ -103,7 +103,7 @@ def test_query_filter_without_data_partitioned( @pytest.mark.integration @pytest.mark.parametrize( - "part_col", ['int', 'bool', 'string', "string_long", "long", "float", "double", "date", 'timestamp', 'timestamptz', 'binary'] + "part_col", ["int", "bool", "string", "string_long", "long", "float", "double", "date", "timestamp", "timestamptz", "binary"] ) @pytest.mark.parametrize("format_version", [1, 2]) def test_query_filter_only_nulls_partitioned( @@ -135,7 +135,7 @@ def test_query_filter_only_nulls_partitioned( @pytest.mark.integration @pytest.mark.parametrize( - "part_col", ['int', 'bool', 'string', "string_long", "long", "float", "double", "date", "timestamptz", "timestamp", "binary"] + "part_col", ["int", "bool", "string", "string_long", "long", "float", "double", "date", "timestamptz", "timestamp", "binary"] ) @pytest.mark.parametrize("format_version", [1, 2]) def test_query_filter_appended_null_partitioned( @@ -174,7 +174,7 @@ def test_query_filter_appended_null_partitioned( @pytest.mark.integration @pytest.mark.parametrize( - "part_col", ['int', 'bool', 'string', "string_long", "long", "float", "double", "date", "timestamptz", "timestamp", "binary"] + "part_col", ["int", "bool", "string", "string_long", "long", "float", "double", "date", "timestamptz", "timestamp", "binary"] ) def test_query_filter_v1_v2_append_null( session_catalog: Catalog, spark: SparkSession, arrow_table_with_null: pa.Table, part_col: str @@ -225,7 +225,7 @@ def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arro identifier=identifier, schema=TABLE_SCHEMA, partition_spec=PartitionSpec(PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int")), - properties={'format-version': '2'}, + properties={"format-version": "2"}, ) tbl.append(arrow_table_with_null) @@ -240,33 +240,33 @@ def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arro ).collect() operations = [row.operation for row in rows] - assert operations == ['append', 'append'] + assert operations == ["append", "append"] summaries = [row.summary for row in rows] assert summaries[0] == { - 'changed-partition-count': '3', - 'added-data-files': '3', - 'added-files-size': '15029', - 'added-records': '3', - 'total-data-files': '3', - 'total-delete-files': '0', - 'total-equality-deletes': '0', - 'total-files-size': '15029', - 
'total-position-deletes': '0', - 'total-records': '3', + "changed-partition-count": "3", + "added-data-files": "3", + "added-files-size": "15029", + "added-records": "3", + "total-data-files": "3", + "total-delete-files": "0", + "total-equality-deletes": "0", + "total-files-size": "15029", + "total-position-deletes": "0", + "total-records": "3", } assert summaries[1] == { - 'changed-partition-count': '3', - 'added-data-files': '3', - 'added-files-size': '15029', - 'added-records': '3', - 'total-data-files': '6', - 'total-delete-files': '0', - 'total-equality-deletes': '0', - 'total-files-size': '30058', - 'total-position-deletes': '0', - 'total-records': '6', + "changed-partition-count": "3", + "added-data-files": "3", + "added-files-size": "15029", + "added-records": "3", + "total-data-files": "6", + "total-delete-files": "0", + "total-equality-deletes": "0", + "total-files-size": "30058", + "total-position-deletes": "0", + "total-records": "6", } @@ -284,7 +284,7 @@ def test_data_files_with_table_partitioned_with_null( identifier=identifier, schema=TABLE_SCHEMA, partition_spec=PartitionSpec(PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int")), - properties={'format-version': '1'}, + properties={"format-version": "1"}, ) tbl.append(arrow_table_with_null) @@ -320,7 +320,7 @@ def test_invalid_arguments(spark: SparkSession, session_catalog: Catalog) -> Non identifier=identifier, schema=TABLE_SCHEMA, partition_spec=PartitionSpec(PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int")), - properties={'format-version': '1'}, + properties={"format-version": "1"}, ) with pytest.raises(ValueError, match="Expected PyArrow table, got: not a df"): @@ -379,7 +379,7 @@ def test_unsupported_transform( identifier=identifier, schema=TABLE_SCHEMA, partition_spec=spec, - properties={'format-version': '1'}, + properties={"format-version": "1"}, ) with pytest.raises(ValueError, match="All transforms are not supported.*"): diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py index 74b6857dce..0941b35850 100644 --- a/tests/integration/test_writes/test_writes.py +++ b/tests/integration/test_writes/test_writes.py @@ -186,47 +186,47 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi ).collect() operations = [row.operation for row in rows] - assert operations == ['append', 'append', 'overwrite'] + assert operations == ["append", "append", "overwrite"] summaries = [row.summary for row in rows] assert summaries[0] == { - 'added-data-files': '1', - 'added-files-size': '5459', - 'added-records': '3', - 'total-data-files': '1', - 'total-delete-files': '0', - 'total-equality-deletes': '0', - 'total-files-size': '5459', - 'total-position-deletes': '0', - 'total-records': '3', + "added-data-files": "1", + "added-files-size": "5459", + "added-records": "3", + "total-data-files": "1", + "total-delete-files": "0", + "total-equality-deletes": "0", + "total-files-size": "5459", + "total-position-deletes": "0", + "total-records": "3", } assert summaries[1] == { - 'added-data-files': '1', - 'added-files-size': '5459', - 'added-records': '3', - 'total-data-files': '2', - 'total-delete-files': '0', - 'total-equality-deletes': '0', - 'total-files-size': '10918', - 'total-position-deletes': '0', - 'total-records': '6', + "added-data-files": "1", + "added-files-size": "5459", + "added-records": "3", + "total-data-files": "2", + "total-delete-files": "0", + "total-equality-deletes": "0", + 
"total-files-size": "10918", + "total-position-deletes": "0", + "total-records": "6", } assert summaries[2] == { - 'added-data-files': '1', - 'added-files-size': '5459', - 'added-records': '3', - 'deleted-data-files': '2', - 'deleted-records': '6', - 'removed-files-size': '10918', - 'total-data-files': '1', - 'total-delete-files': '0', - 'total-equality-deletes': '0', - 'total-files-size': '5459', - 'total-position-deletes': '0', - 'total-records': '3', + "added-data-files": "1", + "added-files-size": "5459", + "added-records": "3", + "deleted-data-files": "2", + "deleted-records": "6", + "removed-files-size": "10918", + "total-data-files": "1", + "total-delete-files": "0", + "total-equality-deletes": "0", + "total-files-size": "5459", + "total-position-deletes": "0", + "total-records": "3", } @@ -283,25 +283,25 @@ def test_python_writes_special_character_column_with_spark_reads( identifier = "default.python_writes_special_character_column_with_spark_reads" column_name_with_special_character = "letter/abc" TEST_DATA_WITH_SPECIAL_CHARACTER_COLUMN = { - column_name_with_special_character: ['a', None, 'z'], - 'id': [1, 2, 3], - 'name': ['AB', 'CD', 'EF'], - 'address': [ - {'street': '123', 'city': 'SFO', 'zip': 12345, column_name_with_special_character: 'a'}, - {'street': '456', 'city': 'SW', 'zip': 67890, column_name_with_special_character: 'b'}, - {'street': '789', 'city': 'Random', 'zip': 10112, column_name_with_special_character: 'c'}, + column_name_with_special_character: ["a", None, "z"], + "id": [1, 2, 3], + "name": ["AB", "CD", "EF"], + "address": [ + {"street": "123", "city": "SFO", "zip": 12345, column_name_with_special_character: "a"}, + {"street": "456", "city": "SW", "zip": 67890, column_name_with_special_character: "b"}, + {"street": "789", "city": "Random", "zip": 10112, column_name_with_special_character: "c"}, ], } pa_schema = pa.schema([ pa.field(column_name_with_special_character, pa.string()), - pa.field('id', pa.int32()), - pa.field('name', pa.string()), + pa.field("id", pa.int32()), + pa.field("name", pa.string()), pa.field( - 'address', + "address", pa.struct([ - pa.field('street', pa.string()), - pa.field('city', pa.string()), - pa.field('zip', pa.int32()), + pa.field("street", pa.string()), + pa.field("city", pa.string()), + pa.field("zip", pa.int32()), pa.field(column_name_with_special_character, pa.string()), ]), ), @@ -322,12 +322,12 @@ def test_python_writes_dictionary_encoded_column_with_spark_reads( ) -> None: identifier = "default.python_writes_dictionary_encoded_column_with_spark_reads" TEST_DATA = { - 'id': [1, 2, 3, 1, 1], - 'name': ['AB', 'CD', 'EF', 'CD', 'EF'], + "id": [1, 2, 3, 1, 1], + "name": ["AB", "CD", "EF", "CD", "EF"], } pa_schema = pa.schema([ - pa.field('id', pa.dictionary(pa.int32(), pa.int32(), False)), - pa.field('name', pa.dictionary(pa.int32(), pa.string(), False)), + pa.field("id", pa.dictionary(pa.int32(), pa.int32(), False)), + pa.field("name", pa.dictionary(pa.int32(), pa.string(), False)), ]) arrow_table = pa.Table.from_pydict(TEST_DATA, schema=pa_schema) @@ -473,7 +473,7 @@ def test_write_parquet_unsupported_properties( @pytest.mark.integration def test_invalid_arguments(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None: identifier = "default.arrow_data_files" - tbl = _create_table(session_catalog, identifier, {'format-version': '1'}, []) + tbl = _create_table(session_catalog, identifier, {"format-version": "1"}, []) with pytest.raises(ValueError, match="Expected PyArrow table, got: not a df"): 
tbl.overwrite("not a df") @@ -488,7 +488,7 @@ def test_summaries_with_only_nulls( ) -> None: identifier = "default.arrow_table_summaries_with_only_nulls" tbl = _create_table( - session_catalog, identifier, {'format-version': '1'}, [arrow_table_without_data, arrow_table_with_only_nulls] + session_catalog, identifier, {"format-version": "1"}, [arrow_table_without_data, arrow_table_with_only_nulls] ) tbl.overwrite(arrow_table_without_data) @@ -501,49 +501,49 @@ def test_summaries_with_only_nulls( ).collect() operations = [row.operation for row in rows] - assert operations == ['append', 'append', 'overwrite'] + assert operations == ["append", "append", "overwrite"] summaries = [row.summary for row in rows] assert summaries[0] == { - 'total-data-files': '0', - 'total-delete-files': '0', - 'total-equality-deletes': '0', - 'total-files-size': '0', - 'total-position-deletes': '0', - 'total-records': '0', + "total-data-files": "0", + "total-delete-files": "0", + "total-equality-deletes": "0", + "total-files-size": "0", + "total-position-deletes": "0", + "total-records": "0", } assert summaries[1] == { - 'added-data-files': '1', - 'added-files-size': '4239', - 'added-records': '2', - 'total-data-files': '1', - 'total-delete-files': '0', - 'total-equality-deletes': '0', - 'total-files-size': '4239', - 'total-position-deletes': '0', - 'total-records': '2', + "added-data-files": "1", + "added-files-size": "4239", + "added-records": "2", + "total-data-files": "1", + "total-delete-files": "0", + "total-equality-deletes": "0", + "total-files-size": "4239", + "total-position-deletes": "0", + "total-records": "2", } assert summaries[2] == { - 'removed-files-size': '4239', - 'total-equality-deletes': '0', - 'total-position-deletes': '0', - 'deleted-data-files': '1', - 'total-delete-files': '0', - 'total-files-size': '0', - 'deleted-records': '2', - 'total-data-files': '0', - 'total-records': '0', + "removed-files-size": "4239", + "total-equality-deletes": "0", + "total-position-deletes": "0", + "deleted-data-files": "1", + "total-delete-files": "0", + "total-files-size": "0", + "deleted-records": "2", + "total-data-files": "0", + "total-records": "0", } @pytest.mark.integration def test_duckdb_url_import(warehouse: Path, arrow_table_with_null: pa.Table) -> None: - os.environ['TZ'] = 'Etc/UTC' + os.environ["TZ"] = "Etc/UTC" time.tzset() - tz = pytz.timezone(os.environ['TZ']) + tz = pytz.timezone(os.environ["TZ"]) catalog = SqlCatalog("test_sql_catalog", uri="sqlite:///:memory:", warehouse=f"/{warehouse}") catalog.create_namespace("default") @@ -554,7 +554,7 @@ def test_duckdb_url_import(warehouse: Path, arrow_table_with_null: pa.Table) -> import duckdb - duckdb.sql('INSTALL iceberg; LOAD iceberg;') + duckdb.sql("INSTALL iceberg; LOAD iceberg;") result = duckdb.sql( f""" SELECT * @@ -565,8 +565,8 @@ def test_duckdb_url_import(warehouse: Path, arrow_table_with_null: pa.Table) -> assert result == [ ( False, - 'a', - 'aaaaaaaaaaaaaaaaaaaaaa', + "a", + "aaaaaaaaaaaaaaaaaaaaaa", 1, 1, 0.0, @@ -574,14 +574,14 @@ def test_duckdb_url_import(warehouse: Path, arrow_table_with_null: pa.Table) -> datetime(2023, 1, 1, 19, 25), datetime(2023, 1, 1, 19, 25, tzinfo=tz), date(2023, 1, 1), - b'\x01', - b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + b"\x01", + b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", ), (None, None, None, None, None, None, None, None, None, None, None, None), ( True, - 'z', - 'zzzzzzzzzzzzzzzzzzzzzz', + "z", + "zzzzzzzzzzzzzzzzzzzzzz", 9, 9, 0.8999999761581421, 
@@ -589,8 +589,8 @@ def test_duckdb_url_import(warehouse: Path, arrow_table_with_null: pa.Table) -> datetime(2023, 3, 1, 19, 25), datetime(2023, 3, 1, 19, 25, tzinfo=tz), date(2023, 3, 1), - b'\x12', - b'\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11', + b"\x12", + b"\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11", ), ] @@ -607,7 +607,7 @@ def test_write_and_evolve(session_catalog: Catalog, format_version: int) -> None pa_table = pa.Table.from_pydict( { - 'foo': ['a', None, 'z'], + "foo": ["a", None, "z"], }, schema=pa.schema([pa.field("foo", pa.string(), nullable=True)]), ) @@ -618,8 +618,8 @@ def test_write_and_evolve(session_catalog: Catalog, format_version: int) -> None pa_table_with_column = pa.Table.from_pydict( { - 'foo': ['a', None, 'z'], - 'bar': [19, None, 25], + "foo": ["a", None, "z"], + "bar": [19, None, 25], }, schema=pa.schema([ pa.field("foo", pa.string(), nullable=True), @@ -653,15 +653,15 @@ def test_create_table_transaction(session_catalog: Catalog, format_version: int) pa_table = pa.Table.from_pydict( { - 'foo': ['a', None, 'z'], + "foo": ["a", None, "z"], }, schema=pa.schema([pa.field("foo", pa.string(), nullable=True)]), ) pa_table_with_column = pa.Table.from_pydict( { - 'foo': ['a', None, 'z'], - 'bar': [19, None, 25], + "foo": ["a", None, "z"], + "bar": [19, None, 25], }, schema=pa.schema([ pa.field("foo", pa.string(), nullable=True), @@ -741,45 +741,45 @@ def test_inspect_snapshots( df = tbl.inspect.snapshots() assert df.column_names == [ - 'committed_at', - 'snapshot_id', - 'parent_id', - 'operation', - 'manifest_list', - 'summary', + "committed_at", + "snapshot_id", + "parent_id", + "operation", + "manifest_list", + "summary", ] - for committed_at in df['committed_at']: + for committed_at in df["committed_at"]: assert isinstance(committed_at.as_py(), datetime) - for snapshot_id in df['snapshot_id']: + for snapshot_id in df["snapshot_id"]: assert isinstance(snapshot_id.as_py(), int) - assert df['parent_id'][0].as_py() is None - assert df['parent_id'][1:] == df['snapshot_id'][:2] + assert df["parent_id"][0].as_py() is None + assert df["parent_id"][1:] == df["snapshot_id"][:2] - assert [operation.as_py() for operation in df['operation']] == ['append', 'overwrite', 'append'] + assert [operation.as_py() for operation in df["operation"]] == ["append", "overwrite", "append"] - for manifest_list in df['manifest_list']: + for manifest_list in df["manifest_list"]: assert manifest_list.as_py().startswith("s3://") - assert df['summary'][0].as_py() == [ - ('added-files-size', '5459'), - ('added-data-files', '1'), - ('added-records', '3'), - ('total-data-files', '1'), - ('total-delete-files', '0'), - ('total-records', '3'), - ('total-files-size', '5459'), - ('total-position-deletes', '0'), - ('total-equality-deletes', '0'), + assert df["summary"][0].as_py() == [ + ("added-files-size", "5459"), + ("added-data-files", "1"), + ("added-records", "3"), + ("total-data-files", "1"), + ("total-delete-files", "0"), + ("total-records", "3"), + ("total-files-size", "5459"), + ("total-position-deletes", "0"), + ("total-equality-deletes", "0"), ] lhs = spark.table(f"{identifier}.snapshots").toPandas() rhs = df.to_pandas() for column in df.column_names: for left, right in zip(lhs[column].to_list(), rhs[column].to_list()): - if column == 'summary': + if column == "summary": # Arrow returns a list of tuples, instead of a dict right = dict(right) @@ -838,7 +838,7 @@ def test_hive_catalog_storage_descriptor( @pytest.mark.integration 
-@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_sanitize_character_partitioned(catalog: Catalog) -> None: table_name = "default.test_table_partitioned_sanitized_character" try: diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 90f5b08bf0..ec511f959d 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -1215,13 +1215,13 @@ def test_projection_list_of_structs(schema_list_of_structs: Schema, file_list_of results = [row.as_py() for row in result_table.columns[0]] assert results == [ [ - {'latitude': 52.371807, 'longitude': 4.896029, 'altitude': None}, - {'latitude': 52.387386, 'longitude': 4.646219, 'altitude': None}, + {"latitude": 52.371807, "longitude": 4.896029, "altitude": None}, + {"latitude": 52.387386, "longitude": 4.646219, "altitude": None}, ], [], [ - {'latitude': 52.078663, 'longitude': 4.288788, 'altitude': None}, - {'latitude': 52.387386, 'longitude': 4.646219, 'altitude': None}, + {"latitude": 52.078663, "longitude": 4.288788, "altitude": None}, + {"latitude": 52.387386, "longitude": 4.646219, "altitude": None}, ], ] assert ( diff --git a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py index 46ad331aa0..c8571dacf1 100644 --- a/tests/io/test_pyarrow_visitor.py +++ b/tests/io/test_pyarrow_visitor.py @@ -329,7 +329,7 @@ def test_round_schema_large_string() -> None: def test_simple_schema_has_missing_ids() -> None: schema = pa.schema([ - pa.field('foo', pa.string(), nullable=False), + pa.field("foo", pa.string(), nullable=False), ]) visitor = _HasIds() has_ids = visit_pyarrow(schema, visitor) @@ -338,8 +338,8 @@ def test_simple_schema_has_missing_ids() -> None: def test_simple_schema_has_missing_ids_partial() -> None: schema = pa.schema([ - pa.field('foo', pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), - pa.field('bar', pa.int32(), nullable=False), + pa.field("foo", pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), + pa.field("bar", pa.int32(), nullable=False), ]) visitor = _HasIds() has_ids = visit_pyarrow(schema, visitor) @@ -348,9 +348,9 @@ def test_simple_schema_has_missing_ids_partial() -> None: def test_nested_schema_has_missing_ids() -> None: schema = pa.schema([ - pa.field('foo', pa.string(), nullable=False), + pa.field("foo", pa.string(), nullable=False), pa.field( - 'quux', + "quux", pa.map_( pa.string(), pa.map_(pa.string(), pa.int32()), @@ -365,16 +365,16 @@ def test_nested_schema_has_missing_ids() -> None: def test_nested_schema_has_ids() -> None: schema = pa.schema([ - pa.field('foo', pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), + pa.field("foo", pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), pa.field( - 'quux', + "quux", pa.map_( pa.field("key", pa.string(), nullable=False, metadata={"PARQUET:field_id": "7"}), pa.field( "value", pa.map_( - pa.field('key', pa.string(), nullable=False, metadata={"PARQUET:field_id": "9"}), - pa.field('value', pa.int32(), metadata={"PARQUET:field_id": "10"}), + pa.field("key", pa.string(), nullable=False, metadata={"PARQUET:field_id": "9"}), + pa.field("value", pa.int32(), metadata={"PARQUET:field_id": "10"}), ), nullable=False, metadata={"PARQUET:field_id": "8"}, @@ -391,14 +391,14 @@ def test_nested_schema_has_ids() -> None: def 
test_nested_schema_has_partial_missing_ids() -> None: schema = pa.schema([ - pa.field('foo', pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), + pa.field("foo", pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), pa.field( - 'quux', + "quux", pa.map_( pa.field("key", pa.string(), nullable=False, metadata={"PARQUET:field_id": "7"}), pa.field( "value", - pa.map_(pa.field('key', pa.string(), nullable=False), pa.field('value', pa.int32())), + pa.map_(pa.field("key", pa.string(), nullable=False), pa.field("value", pa.int32())), nullable=False, ), ), @@ -426,9 +426,9 @@ def test_simple_pyarrow_schema_to_schema_missing_ids_using_name_mapping( ) -> None: schema = pyarrow_schema_simple_without_ids name_mapping = NameMapping([ - MappedField(field_id=1, names=['foo']), - MappedField(field_id=2, names=['bar']), - MappedField(field_id=3, names=['baz']), + MappedField(field_id=1, names=["foo"]), + MappedField(field_id=2, names=["bar"]), + MappedField(field_id=3, names=["baz"]), ]) assert pyarrow_to_schema(schema, name_mapping) == iceberg_schema_simple @@ -439,7 +439,7 @@ def test_simple_pyarrow_schema_to_schema_missing_ids_using_name_mapping_partial_ ) -> None: schema = pyarrow_schema_simple_without_ids name_mapping = NameMapping([ - MappedField(field_id=1, names=['foo']), + MappedField(field_id=1, names=["foo"]), ]) with pytest.raises(ValueError) as exc_info: _ = pyarrow_to_schema(schema, name_mapping) @@ -452,45 +452,45 @@ def test_nested_pyarrow_schema_to_schema_missing_ids_using_name_mapping( schema = pyarrow_schema_nested_without_ids name_mapping = NameMapping([ - MappedField(field_id=1, names=['foo']), - MappedField(field_id=2, names=['bar']), - MappedField(field_id=3, names=['baz']), - MappedField(field_id=4, names=['qux'], fields=[MappedField(field_id=5, names=['element'])]), + MappedField(field_id=1, names=["foo"]), + MappedField(field_id=2, names=["bar"]), + MappedField(field_id=3, names=["baz"]), + MappedField(field_id=4, names=["qux"], fields=[MappedField(field_id=5, names=["element"])]), MappedField( field_id=6, - names=['quux'], + names=["quux"], fields=[ - MappedField(field_id=7, names=['key']), + MappedField(field_id=7, names=["key"]), MappedField( field_id=8, - names=['value'], + names=["value"], fields=[ - MappedField(field_id=9, names=['key']), - MappedField(field_id=10, names=['value']), + MappedField(field_id=9, names=["key"]), + MappedField(field_id=10, names=["value"]), ], ), ], ), MappedField( field_id=11, - names=['location'], + names=["location"], fields=[ MappedField( field_id=12, - names=['element'], + names=["element"], fields=[ - MappedField(field_id=13, names=['latitude']), - MappedField(field_id=14, names=['longitude']), + MappedField(field_id=13, names=["latitude"]), + MappedField(field_id=14, names=["longitude"]), ], ) ], ), MappedField( field_id=15, - names=['person'], + names=["person"], fields=[ - MappedField(field_id=16, names=['name']), - MappedField(field_id=17, names=['age']), + MappedField(field_id=16, names=["name"]), + MappedField(field_id=17, names=["age"]), ], ), ]) @@ -500,9 +500,9 @@ def test_nested_pyarrow_schema_to_schema_missing_ids_using_name_mapping( def test_pyarrow_schema_to_schema_missing_ids_using_name_mapping_nested_missing_id() -> None: schema = pa.schema([ - pa.field('foo', pa.string(), nullable=False), + pa.field("foo", pa.string(), nullable=False), pa.field( - 'quux', + "quux", pa.map_( pa.string(), pa.map_(pa.string(), pa.int32()), @@ -512,17 +512,17 @@ def 
test_pyarrow_schema_to_schema_missing_ids_using_name_mapping_nested_missing_ ]) name_mapping = NameMapping([ - MappedField(field_id=1, names=['foo']), + MappedField(field_id=1, names=["foo"]), MappedField( field_id=6, - names=['quux'], + names=["quux"], fields=[ - MappedField(field_id=7, names=['key']), + MappedField(field_id=7, names=["key"]), MappedField( field_id=8, - names=['value'], + names=["value"], fields=[ - MappedField(field_id=10, names=['value']), + MappedField(field_id=10, names=["value"]), ], ), ], diff --git a/tests/table/test_init.py b/tests/table/test_init.py index 2bc78f3197..11d50db8a5 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -995,9 +995,9 @@ def test_correct_schema() -> None: # Should use the current schema, instead the one from the snapshot projection_schema = t.scan().projection() assert projection_schema == Schema( - NestedField(field_id=1, name='x', field_type=LongType(), required=True), - NestedField(field_id=2, name='y', field_type=LongType(), required=True), - NestedField(field_id=3, name='z', field_type=LongType(), required=True), + NestedField(field_id=1, name="x", field_type=LongType(), required=True), + NestedField(field_id=2, name="y", field_type=LongType(), required=True), + NestedField(field_id=3, name="z", field_type=LongType(), required=True), identifier_field_ids=[1, 2], ) assert projection_schema.schema_id == 1 @@ -1005,7 +1005,7 @@ def test_correct_schema() -> None: # When we explicitly filter on the commit, we want to have the schema that's linked to the snapshot projection_schema = t.scan(snapshot_id=123).projection() assert projection_schema == Schema( - NestedField(field_id=1, name='x', field_type=LongType(), required=True), + NestedField(field_id=1, name="x", field_type=LongType(), required=True), identifier_field_ids=[], ) assert projection_schema.schema_id == 0 @@ -1138,8 +1138,8 @@ def test_table_properties_raise_for_none_value(example_table_metadata_v2: Dict[s def test_serialize_commit_table_request() -> None: request = CommitTableRequest( - requirements=(AssertTableUUID(uuid='4bfd18a3-74c6-478e-98b1-71c4c32f4163'),), - identifier=TableIdentifier(namespace=['a'], name='b'), + requirements=(AssertTableUUID(uuid="4bfd18a3-74c6-478e-98b1-71c4c32f4163"),), + identifier=TableIdentifier(namespace=["a"], name="b"), ) deserialized_request = CommitTableRequest.model_validate_json(request.model_dump_json()) @@ -1149,17 +1149,17 @@ def test_serialize_commit_table_request() -> None: def test_partition_for_demo() -> None: import pyarrow as pa - test_pa_schema = pa.schema([('year', pa.int64()), ("n_legs", pa.int64()), ("animal", pa.string())]) + test_pa_schema = pa.schema([("year", pa.int64()), ("n_legs", pa.int64()), ("animal", pa.string())]) test_schema = Schema( - NestedField(field_id=1, name='year', field_type=StringType(), required=False), - NestedField(field_id=2, name='n_legs', field_type=IntegerType(), required=True), - NestedField(field_id=3, name='animal', field_type=StringType(), required=False), + NestedField(field_id=1, name="year", field_type=StringType(), required=False), + NestedField(field_id=2, name="n_legs", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="animal", field_type=StringType(), required=False), schema_id=1, ) test_data = { - 'year': [2020, 2022, 2022, 2022, 2021, 2022, 2022, 2019, 2021], - 'n_legs': [2, 2, 2, 4, 4, 4, 4, 5, 100], - 'animal': ["Flamingo", "Parrot", "Parrot", "Horse", "Dog", "Horse", "Horse", "Brittle stars", "Centipede"], + "year": [2020, 2022, 2022, 
2022, 2021, 2022, 2022, 2019, 2021], + "n_legs": [2, 2, 2, 4, 4, 4, 4, 5, 100], + "animal": ["Flamingo", "Parrot", "Parrot", "Horse", "Dog", "Horse", "Horse", "Brittle stars", "Centipede"], } arrow_table = pa.Table.from_pydict(test_data, schema=test_pa_schema) partition_spec = PartitionSpec( @@ -1183,11 +1183,11 @@ def test_partition_for_demo() -> None: def test_identity_partition_on_multi_columns() -> None: import pyarrow as pa - test_pa_schema = pa.schema([('born_year', pa.int64()), ("n_legs", pa.int64()), ("animal", pa.string())]) + test_pa_schema = pa.schema([("born_year", pa.int64()), ("n_legs", pa.int64()), ("animal", pa.string())]) test_schema = Schema( - NestedField(field_id=1, name='born_year', field_type=StringType(), required=False), - NestedField(field_id=2, name='n_legs', field_type=IntegerType(), required=True), - NestedField(field_id=3, name='animal', field_type=StringType(), required=False), + NestedField(field_id=1, name="born_year", field_type=StringType(), required=False), + NestedField(field_id=2, name="n_legs", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="animal", field_type=StringType(), required=False), schema_id=1, ) # 5 partitions, 6 unique row values, 12 rows @@ -1210,9 +1210,9 @@ def test_identity_partition_on_multi_columns() -> None: for _ in range(1000): random.shuffle(test_rows) test_data = { - 'born_year': [row[0] for row in test_rows], - 'n_legs': [row[1] for row in test_rows], - 'animal': [row[2] for row in test_rows], + "born_year": [row[0] for row in test_rows], + "n_legs": [row[1] for row in test_rows], + "animal": [row[2] for row in test_rows], } arrow_table = pa.Table.from_pydict(test_data, schema=test_pa_schema) @@ -1222,7 +1222,7 @@ def test_identity_partition_on_multi_columns() -> None: concatenated_arrow_table = pa.concat_tables([table_partition.arrow_table_partition for table_partition in result]) assert concatenated_arrow_table.num_rows == arrow_table.num_rows assert concatenated_arrow_table.sort_by([ - ('born_year', 'ascending'), - ('n_legs', 'ascending'), - ('animal', 'ascending'), - ]) == arrow_table.sort_by([('born_year', 'ascending'), ('n_legs', 'ascending'), ('animal', 'ascending')]) + ("born_year", "ascending"), + ("n_legs", "ascending"), + ("animal", "ascending"), + ]) == arrow_table.sort_by([("born_year", "ascending"), ("n_legs", "ascending"), ("animal", "ascending")]) diff --git a/tests/table/test_metadata.py b/tests/table/test_metadata.py index b4e30a6b84..0e2b91f24b 100644 --- a/tests/table/test_metadata.py +++ b/tests/table/test_metadata.py @@ -220,7 +220,7 @@ def test_new_table_metadata_with_explicit_v1_format() -> None: partition_spec=partition_spec, sort_order=sort_order, location="s3://some_v1_location/", - properties={'format-version': "1"}, + properties={"format-version": "1"}, ) expected_schema = Schema( diff --git a/tests/table/test_name_mapping.py b/tests/table/test_name_mapping.py index e039415ce3..d4a2bf6c41 100644 --- a/tests/table/test_name_mapping.py +++ b/tests/table/test_name_mapping.py @@ -30,45 +30,45 @@ @pytest.fixture(scope="session") def table_name_mapping_nested() -> NameMapping: return NameMapping([ - MappedField(field_id=1, names=['foo']), - MappedField(field_id=2, names=['bar']), - MappedField(field_id=3, names=['baz']), - MappedField(field_id=4, names=['qux'], fields=[MappedField(field_id=5, names=['element'])]), + MappedField(field_id=1, names=["foo"]), + MappedField(field_id=2, names=["bar"]), + MappedField(field_id=3, names=["baz"]), + MappedField(field_id=4, names=["qux"], 
fields=[MappedField(field_id=5, names=["element"])]), MappedField( field_id=6, - names=['quux'], + names=["quux"], fields=[ - MappedField(field_id=7, names=['key']), + MappedField(field_id=7, names=["key"]), MappedField( field_id=8, - names=['value'], + names=["value"], fields=[ - MappedField(field_id=9, names=['key']), - MappedField(field_id=10, names=['value']), + MappedField(field_id=9, names=["key"]), + MappedField(field_id=10, names=["value"]), ], ), ], ), MappedField( field_id=11, - names=['location'], + names=["location"], fields=[ MappedField( field_id=12, - names=['element'], + names=["element"], fields=[ - MappedField(field_id=13, names=['latitude']), - MappedField(field_id=14, names=['longitude']), + MappedField(field_id=13, names=["latitude"]), + MappedField(field_id=14, names=["longitude"]), ], ) ], ), MappedField( field_id=15, - names=['person'], + names=["person"], fields=[ - MappedField(field_id=16, names=['name']), - MappedField(field_id=17, names=['age']), + MappedField(field_id=16, names=["name"]), + MappedField(field_id=17, names=["age"]), ], ), ]) @@ -80,7 +80,7 @@ def test_json_mapped_field_deserialization() -> None: "names": ["id", "record_id"] } """ - assert MappedField(field_id=1, names=['id', 'record_id']) == MappedField.model_validate_json(mapped_field) + assert MappedField(field_id=1, names=["id", "record_id"]) == MappedField.model_validate_json(mapped_field) mapped_field_with_null_fields = """{ "field-id": 1, @@ -88,7 +88,7 @@ def test_json_mapped_field_deserialization() -> None: "fields": null } """ - assert MappedField(field_id=1, names=['id', 'record_id']) == MappedField.model_validate_json(mapped_field_with_null_fields) + assert MappedField(field_id=1, names=["id", "record_id"]) == MappedField.model_validate_json(mapped_field_with_null_fields) def test_json_name_mapping_deserialization() -> None: @@ -133,14 +133,14 @@ def test_json_name_mapping_deserialization() -> None: """ assert parse_mapping_from_json(name_mapping) == NameMapping([ - MappedField(field_id=1, names=['id', 'record_id']), - MappedField(field_id=2, names=['data']), + MappedField(field_id=1, names=["id", "record_id"]), + MappedField(field_id=2, names=["data"]), MappedField( - names=['location'], + names=["location"], field_id=3, fields=[ - MappedField(field_id=4, names=['latitude', 'lat']), - MappedField(field_id=5, names=['longitude', 'long']), + MappedField(field_id=4, names=["latitude", "lat"]), + MappedField(field_id=5, names=["longitude", "long"]), ], ), ]) @@ -155,14 +155,14 @@ def test_json_serialization(table_name_mapping_nested: NameMapping) -> None: def test_name_mapping_to_string() -> None: nm = NameMapping([ - MappedField(field_id=1, names=['id', 'record_id']), - MappedField(field_id=2, names=['data']), + MappedField(field_id=1, names=["id", "record_id"]), + MappedField(field_id=2, names=["data"]), MappedField( - names=['location'], + names=["location"], field_id=3, fields=[ - MappedField(field_id=4, names=['lat', 'latitude']), - MappedField(field_id=5, names=['long', 'longitude']), + MappedField(field_id=4, names=["lat", "latitude"]), + MappedField(field_id=5, names=["long", "longitude"]), ], ), ]) @@ -184,64 +184,64 @@ def test_mapping_from_schema(table_schema_nested: Schema, table_name_mapping_nes def test_mapping_by_name(table_name_mapping_nested: NameMapping) -> None: assert table_name_mapping_nested._field_by_name == { - 'person.age': MappedField(field_id=17, names=['age']), - 'person.name': MappedField(field_id=16, names=['name']), - 'person': MappedField( + "person.age": 
MappedField(field_id=17, names=["age"]), + "person.name": MappedField(field_id=16, names=["name"]), + "person": MappedField( field_id=15, - names=['person'], - fields=[MappedField(field_id=16, names=['name']), MappedField(field_id=17, names=['age'])], + names=["person"], + fields=[MappedField(field_id=16, names=["name"]), MappedField(field_id=17, names=["age"])], ), - 'location.element.longitude': MappedField(field_id=14, names=['longitude']), - 'location.element.latitude': MappedField(field_id=13, names=['latitude']), - 'location.element': MappedField( + "location.element.longitude": MappedField(field_id=14, names=["longitude"]), + "location.element.latitude": MappedField(field_id=13, names=["latitude"]), + "location.element": MappedField( field_id=12, - names=['element'], - fields=[MappedField(field_id=13, names=['latitude']), MappedField(field_id=14, names=['longitude'])], + names=["element"], + fields=[MappedField(field_id=13, names=["latitude"]), MappedField(field_id=14, names=["longitude"])], ), - 'location': MappedField( + "location": MappedField( field_id=11, - names=['location'], + names=["location"], fields=[ MappedField( field_id=12, - names=['element'], - fields=[MappedField(field_id=13, names=['latitude']), MappedField(field_id=14, names=['longitude'])], + names=["element"], + fields=[MappedField(field_id=13, names=["latitude"]), MappedField(field_id=14, names=["longitude"])], ) ], ), - 'quux.value.value': MappedField(field_id=10, names=['value']), - 'quux.value.key': MappedField(field_id=9, names=['key']), - 'quux.value': MappedField( + "quux.value.value": MappedField(field_id=10, names=["value"]), + "quux.value.key": MappedField(field_id=9, names=["key"]), + "quux.value": MappedField( field_id=8, - names=['value'], - fields=[MappedField(field_id=9, names=['key']), MappedField(field_id=10, names=['value'])], + names=["value"], + fields=[MappedField(field_id=9, names=["key"]), MappedField(field_id=10, names=["value"])], ), - 'quux.key': MappedField(field_id=7, names=['key']), - 'quux': MappedField( + "quux.key": MappedField(field_id=7, names=["key"]), + "quux": MappedField( field_id=6, - names=['quux'], + names=["quux"], fields=[ - MappedField(field_id=7, names=['key']), + MappedField(field_id=7, names=["key"]), MappedField( field_id=8, - names=['value'], - fields=[MappedField(field_id=9, names=['key']), MappedField(field_id=10, names=['value'])], + names=["value"], + fields=[MappedField(field_id=9, names=["key"]), MappedField(field_id=10, names=["value"])], ), ], ), - 'qux.element': MappedField(field_id=5, names=['element']), - 'qux': MappedField(field_id=4, names=['qux'], fields=[MappedField(field_id=5, names=['element'])]), - 'baz': MappedField(field_id=3, names=['baz']), - 'bar': MappedField(field_id=2, names=['bar']), - 'foo': MappedField(field_id=1, names=['foo']), + "qux.element": MappedField(field_id=5, names=["element"]), + "qux": MappedField(field_id=4, names=["qux"], fields=[MappedField(field_id=5, names=["element"])]), + "baz": MappedField(field_id=3, names=["baz"]), + "bar": MappedField(field_id=2, names=["bar"]), + "foo": MappedField(field_id=1, names=["foo"]), } def test_mapping_lookup_by_name(table_name_mapping_nested: NameMapping) -> None: - assert table_name_mapping_nested.find("foo") == MappedField(field_id=1, names=['foo']) - assert table_name_mapping_nested.find("location.element.latitude") == MappedField(field_id=13, names=['latitude']) - assert table_name_mapping_nested.find("location", "element", "latitude") == MappedField(field_id=13, 
names=['latitude']) - assert table_name_mapping_nested.find(*["location", "element", "latitude"]) == MappedField(field_id=13, names=['latitude']) + assert table_name_mapping_nested.find("foo") == MappedField(field_id=1, names=["foo"]) + assert table_name_mapping_nested.find("location.element.latitude") == MappedField(field_id=13, names=["latitude"]) + assert table_name_mapping_nested.find("location", "element", "latitude") == MappedField(field_id=13, names=["latitude"]) + assert table_name_mapping_nested.find(*["location", "element", "latitude"]) == MappedField(field_id=13, names=["latitude"]) with pytest.raises(ValueError, match="Could not find field with name: boom"): table_name_mapping_nested.find("boom") @@ -264,48 +264,48 @@ def test_update_mapping(table_name_mapping_nested: NameMapping) -> None: } expected = NameMapping([ - MappedField(field_id=1, names=['foo', 'foo_update']), - MappedField(field_id=2, names=['bar']), - MappedField(field_id=3, names=['baz']), - MappedField(field_id=4, names=['qux'], fields=[MappedField(field_id=5, names=['element'])]), + MappedField(field_id=1, names=["foo", "foo_update"]), + MappedField(field_id=2, names=["bar"]), + MappedField(field_id=3, names=["baz"]), + MappedField(field_id=4, names=["qux"], fields=[MappedField(field_id=5, names=["element"])]), MappedField( field_id=6, - names=['quux'], + names=["quux"], fields=[ - MappedField(field_id=7, names=['key']), + MappedField(field_id=7, names=["key"]), MappedField( field_id=8, - names=['value'], + names=["value"], fields=[ - MappedField(field_id=9, names=['key']), - MappedField(field_id=10, names=['value']), + MappedField(field_id=9, names=["key"]), + MappedField(field_id=10, names=["value"]), ], ), ], ), MappedField( field_id=11, - names=['location'], + names=["location"], fields=[ MappedField( field_id=12, - names=['element'], + names=["element"], fields=[ - MappedField(field_id=13, names=['latitude']), - MappedField(field_id=14, names=['longitude']), + MappedField(field_id=13, names=["latitude"]), + MappedField(field_id=14, names=["longitude"]), ], ) ], ), MappedField( field_id=15, - names=['person'], + names=["person"], fields=[ - MappedField(field_id=17, names=['age']), - MappedField(field_id=19, names=['name']), - MappedField(field_id=20, names=['add_20']), + MappedField(field_id=17, names=["age"]), + MappedField(field_id=19, names=["name"]), + MappedField(field_id=20, names=["add_20"]), ], ), - MappedField(field_id=18, names=['add_18']), + MappedField(field_id=18, names=["add_18"]), ]) assert update_mapping(table_name_mapping_nested, updates, adds) == expected diff --git a/tests/table/test_snapshots.py b/tests/table/test_snapshots.py index e85ecce506..2569a11dc2 100644 --- a/tests/table/test_snapshots.py +++ b/tests/table/test_snapshots.py @@ -156,9 +156,9 @@ def test_snapshot_summary_collector(table_schema_simple: Schema) -> None: ssc.add_file(data_file, schema=table_schema_simple) assert ssc.build() == { - 'added-data-files': '1', - 'added-files-size': '1234', - 'added-records': '100', + "added-data-files": "1", + "added-files-size": "1234", + "added-records": "100", } @@ -174,7 +174,7 @@ def test_snapshot_summary_collector_with_partition() -> None: NestedField(field_id=2, name="string_field", field_type=StringType(), required=False), NestedField(field_id=3, name="int_field", field_type=IntegerType(), required=False), ) - spec = PartitionSpec(PartitionField(source_id=3, field_id=1001, transform=IdentityTransform(), name='int_field')) + spec = PartitionSpec(PartitionField(source_id=3, 
field_id=1001, transform=IdentityTransform(), name="int_field")) data_file_1 = DataFile(content=DataFileContent.DATA, record_count=100, file_size_in_bytes=1234, partition=Record(int_field=1)) data_file_2 = DataFile(content=DataFileContent.DATA, record_count=200, file_size_in_bytes=4321, partition=Record(int_field=2)) # When @@ -184,13 +184,13 @@ def test_snapshot_summary_collector_with_partition() -> None: # Then assert ssc.build() == { - 'added-files-size': '1234', - 'removed-files-size': '5555', - 'added-data-files': '1', - 'deleted-data-files': '2', - 'added-records': '100', - 'deleted-records': '300', - 'changed-partition-count': '2', + "added-files-size": "1234", + "removed-files-size": "5555", + "added-data-files": "1", + "deleted-data-files": "2", + "added-records": "100", + "deleted-records": "300", + "changed-partition-count": "2", } # When @@ -198,15 +198,15 @@ def test_snapshot_summary_collector_with_partition() -> None: # Then assert ssc.build() == { - 'added-files-size': '1234', - 'removed-files-size': '5555', - 'added-data-files': '1', - 'deleted-data-files': '2', - 'added-records': '100', - 'deleted-records': '300', - 'changed-partition-count': '2', - 'partitions.int_field=1': 'added-files-size=1234,removed-files-size=1234,added-data-files=1,deleted-data-files=1,added-records=100,deleted-records=100', - 'partitions.int_field=2': 'removed-files-size=4321,deleted-data-files=1,deleted-records=200', + "added-files-size": "1234", + "removed-files-size": "5555", + "added-data-files": "1", + "deleted-data-files": "2", + "added-records": "100", + "deleted-records": "300", + "changed-partition-count": "2", + "partitions.int_field=1": "added-files-size=1234,removed-files-size=1234,added-data-files=1,deleted-data-files=1,added-records=100,deleted-records=100", + "partitions.int_field=2": "removed-files-size=4321,deleted-data-files=1,deleted-records=200", } @@ -214,12 +214,12 @@ def test_merge_snapshot_summaries_empty() -> None: assert update_snapshot_summaries(Summary(Operation.APPEND)) == Summary( operation=Operation.APPEND, **{ - 'total-data-files': '0', - 'total-delete-files': '0', - 'total-records': '0', - 'total-files-size': '0', - 'total-position-deletes': '0', - 'total-equality-deletes': '0', + "total-data-files": "0", + "total-delete-files": "0", + "total-records": "0", + "total-files-size": "0", + "total-position-deletes": "0", + "total-equality-deletes": "0", }, ) @@ -229,12 +229,12 @@ def test_merge_snapshot_summaries_new_summary() -> None: summary=Summary( operation=Operation.APPEND, **{ - 'added-data-files': '1', - 'added-delete-files': '2', - 'added-equality-deletes': '3', - 'added-files-size': '4', - 'added-position-deletes': '5', - 'added-records': '6', + "added-data-files": "1", + "added-delete-files": "2", + "added-equality-deletes": "3", + "added-files-size": "4", + "added-position-deletes": "5", + "added-records": "6", }, ) ) @@ -242,18 +242,18 @@ def test_merge_snapshot_summaries_new_summary() -> None: expected = Summary( operation=Operation.APPEND, **{ - 'added-data-files': '1', - 'added-delete-files': '2', - 'added-equality-deletes': '3', - 'added-files-size': '4', - 'added-position-deletes': '5', - 'added-records': '6', - 'total-data-files': '1', - 'total-delete-files': '2', - 'total-records': '6', - 'total-files-size': '4', - 'total-position-deletes': '5', - 'total-equality-deletes': '3', + "added-data-files": "1", + "added-delete-files": "2", + "added-equality-deletes": "3", + "added-files-size": "4", + "added-position-deletes": "5", + "added-records": "6", + 
"total-data-files": "1", + "total-delete-files": "2", + "total-records": "6", + "total-files-size": "4", + "total-position-deletes": "5", + "total-equality-deletes": "3", }, ) @@ -265,44 +265,44 @@ def test_merge_snapshot_summaries_overwrite_summary() -> None: summary=Summary( operation=Operation.OVERWRITE, **{ - 'added-data-files': '1', - 'added-delete-files': '2', - 'added-equality-deletes': '3', - 'added-files-size': '4', - 'added-position-deletes': '5', - 'added-records': '6', + "added-data-files": "1", + "added-delete-files": "2", + "added-equality-deletes": "3", + "added-files-size": "4", + "added-position-deletes": "5", + "added-records": "6", }, ), previous_summary={ - 'total-data-files': '1', - 'total-delete-files': '1', - 'total-equality-deletes': '1', - 'total-files-size': '1', - 'total-position-deletes': '1', - 'total-records': '1', + "total-data-files": "1", + "total-delete-files": "1", + "total-equality-deletes": "1", + "total-files-size": "1", + "total-position-deletes": "1", + "total-records": "1", }, truncate_full_table=True, ) expected = { - 'added-data-files': '1', - 'added-delete-files': '2', - 'added-equality-deletes': '3', - 'added-files-size': '4', - 'added-position-deletes': '5', - 'added-records': '6', - 'total-data-files': '1', - 'total-records': '6', - 'total-delete-files': '2', - 'total-equality-deletes': '3', - 'total-files-size': '4', - 'total-position-deletes': '5', - 'deleted-data-files': '1', - 'removed-delete-files': '1', - 'deleted-records': '1', - 'removed-files-size': '1', - 'removed-position-deletes': '1', - 'removed-equality-deletes': '1', + "added-data-files": "1", + "added-delete-files": "2", + "added-equality-deletes": "3", + "added-files-size": "4", + "added-position-deletes": "5", + "added-records": "6", + "total-data-files": "1", + "total-records": "6", + "total-delete-files": "2", + "total-equality-deletes": "3", + "total-files-size": "4", + "total-position-deletes": "5", + "deleted-data-files": "1", + "removed-delete-files": "1", + "deleted-records": "1", + "removed-files-size": "1", + "removed-position-deletes": "1", + "removed-equality-deletes": "1", } assert actual.additional_properties == expected @@ -324,15 +324,15 @@ def test_invalid_type() -> None: summary=Summary( operation=Operation.OVERWRITE, **{ - 'added-data-files': '1', - 'added-delete-files': '2', - 'added-equality-deletes': '3', - 'added-files-size': '4', - 'added-position-deletes': '5', - 'added-records': '6', + "added-data-files": "1", + "added-delete-files": "2", + "added-equality-deletes": "3", + "added-files-size": "4", + "added-position-deletes": "5", + "added-records": "6", }, ), - previous_summary={'total-data-files': 'abc'}, # should be a number + previous_summary={"total-data-files": "abc"}, # should be a number truncate_full_table=True, ) diff --git a/tests/test_serializers.py b/tests/test_serializers.py index 140db02700..ad40ea08e0 100644 --- a/tests/test_serializers.py +++ b/tests/test_serializers.py @@ -44,7 +44,7 @@ def test_legacy_current_snapshot_id( ToOutputFile.table_metadata(metadata, PyArrowFileIO().new_output(location=metadata_location), overwrite=True) with PyArrowFileIO().new_input(location=metadata_location).open() as input_stream: metadata_json_bytes = input_stream.read() - assert json.loads(metadata_json_bytes)['current-snapshot-id'] == -1 + assert json.loads(metadata_json_bytes)["current-snapshot-id"] == -1 backwards_compatible_static_table = StaticTable.from_metadata(metadata_location) assert 
backwards_compatible_static_table.metadata.current_snapshot_id is None assert backwards_compatible_static_table.metadata == static_table.metadata diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 4dc3d9819f..b8bef4b998 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -1550,7 +1550,7 @@ def test_strict_bucket_bytes(bound_reference_binary: BoundReference[int]) -> Non def test_strict_bucket_uuid(bound_reference_uuid: BoundReference[int]) -> None: - value = literal(UUID('12345678123456781234567812345678')) + value = literal(UUID("12345678123456781234567812345678")) transform: Transform[Any, int] = BucketTransform(num_buckets=10) _test_projection( lhs=transform.strict_project(name="name", pred=BoundNotEqualTo(term=bound_reference_uuid, literal=value)), @@ -1575,14 +1575,14 @@ def test_strict_bucket_uuid(bound_reference_uuid: BoundReference[int]) -> None: _test_projection( lhs=transform.strict_project( name="name", - pred=BoundNotIn(term=bound_reference_uuid, literals={value, literal(UUID('12345678123456781234567812345679'))}), + pred=BoundNotIn(term=bound_reference_uuid, literals={value, literal(UUID("12345678123456781234567812345679"))}), ), rhs=NotIn(term=Reference("name"), literals={1, 4}), ) _test_projection( lhs=transform.strict_project( name="name", - pred=BoundIn(term=bound_reference_uuid, literals={value, literal(UUID('12345678123456781234567812345679'))}), + pred=BoundIn(term=bound_reference_uuid, literals={value, literal(UUID("12345678123456781234567812345679"))}), ), rhs=None, ) diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py index 2f15bb56d8..066e7d7cc0 100644 --- a/tests/utils/test_config.py +++ b/tests/utils/test_config.py @@ -50,8 +50,8 @@ def test_from_environment_variables_uppercase() -> None: ) def test_fix_nested_objects_from_environment_variables() -> None: assert Config().get_catalog_config("PRODUCTION") == { - 's3.region': 'eu-north-1', - 's3.access-key-id': 'username', + "s3.region": "eu-north-1", + "s3.access-key-id": "username", } diff --git a/tests/utils/test_decimal.py b/tests/utils/test_decimal.py index 419cf05916..3e67bf691a 100644 --- a/tests/utils/test_decimal.py +++ b/tests/utils/test_decimal.py @@ -45,5 +45,5 @@ def test_decimal_required_bytes() -> None: def test_decimal_to_bytes() -> None: # Check the boundary between 2 and 3 bytes. # 2 bytes has a minimum of -32,768 and a maximum value of 32,767 (inclusive). - assert decimal_to_bytes(Decimal('32767.')) == b'\x7f\xff' - assert decimal_to_bytes(Decimal('32768.')) == b'\x00\x80\x00' + assert decimal_to_bytes(Decimal("32767.")) == b"\x7f\xff" + assert decimal_to_bytes(Decimal("32768.")) == b"\x00\x80\x00" From 91973f25a240f4e9c35b917fb7124e8ac12a6ecb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 31 May 2024 06:44:02 +0200 Subject: [PATCH 65/80] Bump moto from 5.0.8 to 5.0.9 (#783) Bumps [moto](https://github.com/getmoto/moto) from 5.0.8 to 5.0.9. - [Release notes](https://github.com/getmoto/moto/releases) - [Changelog](https://github.com/getmoto/moto/blob/master/CHANGELOG.md) - [Commits](https://github.com/getmoto/moto/compare/5.0.8...5.0.9) --- updated-dependencies: - dependency-name: moto dependency-type: direct:development update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7931ee0e38..adacbf7179 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2212,13 +2212,13 @@ test = ["mypy (>=1.0)", "pytest (>=7.0.0)"] [[package]] name = "moto" -version = "5.0.8" +version = "5.0.9" description = "" optional = false python-versions = ">=3.8" files = [ - {file = "moto-5.0.8-py2.py3-none-any.whl", hash = "sha256:7d1035e366434bfa9fcc0621f07d5aa724b6846408071d540137a0554c46f214"}, - {file = "moto-5.0.8.tar.gz", hash = "sha256:517fb808dc718bcbdda54c6ffeaca0adc34cf6e10821bfb01216ce420a31765c"}, + {file = "moto-5.0.9-py2.py3-none-any.whl", hash = "sha256:21a13e02f83d6a18cfcd99949c96abb2e889f4bd51c4c6a3ecc8b78765cb854e"}, + {file = "moto-5.0.9.tar.gz", hash = "sha256:eb71f1cba01c70fff1f16086acb24d6d9aeb32830d646d8989f98a29aeae24ba"}, ] [package.dependencies] From 0339e7fc82b71adc7d2a3de025b1b689ca9a2770 Mon Sep 17 00:00:00 2001 From: Honah J Date: Fri, 31 May 2024 00:27:13 -0700 Subject: [PATCH 66/80] Support CreateTableTransaction for SqlCatalog (#684) --- pyiceberg/catalog/sql.py | 104 +++++++++++++++++++++++--------------- tests/catalog/test_sql.py | 60 ++++++++++++++++++++++ 2 files changed, 124 insertions(+), 40 deletions(-) diff --git a/pyiceberg/catalog/sql.py b/pyiceberg/catalog/sql.py index 6c198767e7..ff7831d77f 100644 --- a/pyiceberg/catalog/sql.py +++ b/pyiceberg/catalog/sql.py @@ -60,7 +60,7 @@ from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.serializers import FromInputFile -from pyiceberg.table import CommitTableRequest, CommitTableResponse, Table, update_table_metadata +from pyiceberg.table import CommitTableRequest, CommitTableResponse, Table from pyiceberg.table.metadata import new_table_metadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties @@ -402,59 +402,83 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons identifier_tuple = self.identifier_to_tuple_without_catalog( tuple(table_request.identifier.namespace.root + [table_request.identifier.name]) ) - current_table = self.load_table(identifier_tuple) namespace_tuple = Catalog.namespace_from(identifier_tuple) namespace = Catalog.namespace_to_string(namespace_tuple) table_name = Catalog.table_name_from(identifier_tuple) - base_metadata = current_table.metadata - for requirement in table_request.requirements: - requirement.validate(base_metadata) - updated_metadata = update_table_metadata(base_metadata, table_request.updates) - if updated_metadata == base_metadata: - # no changes, do nothing - return CommitTableResponse(metadata=base_metadata, metadata_location=current_table.metadata_location) + current_table: Optional[Table] + try: + current_table = self.load_table(identifier_tuple) + except NoSuchTableError: + current_table = None - # write new metadata - new_metadata_version = self._parse_metadata_version(current_table.metadata_location) + 1 - new_metadata_location = self._get_metadata_location(current_table.metadata.location, new_metadata_version) - self._write_metadata(updated_metadata, current_table.io, new_metadata_location) + updated_staged_table = self._update_and_stage_table(current_table, table_request) + if current_table and updated_staged_table.metadata == 
current_table.metadata: + # no changes, do nothing + return CommitTableResponse(metadata=current_table.metadata, metadata_location=current_table.metadata_location) + self._write_metadata( + metadata=updated_staged_table.metadata, + io=updated_staged_table.io, + metadata_path=updated_staged_table.metadata_location, + ) with Session(self.engine) as session: - if self.engine.dialect.supports_sane_rowcount: - stmt = ( - update(IcebergTables) - .where( - IcebergTables.catalog_name == self.name, - IcebergTables.table_namespace == namespace, - IcebergTables.table_name == table_name, - IcebergTables.metadata_location == current_table.metadata_location, - ) - .values(metadata_location=new_metadata_location, previous_metadata_location=current_table.metadata_location) - ) - result = session.execute(stmt) - if result.rowcount < 1: - raise CommitFailedException(f"Table has been updated by another process: {namespace}.{table_name}") - else: - try: - tbl = ( - session.query(IcebergTables) - .with_for_update(of=IcebergTables) - .filter( + if current_table: + # table exists, update it + if self.engine.dialect.supports_sane_rowcount: + stmt = ( + update(IcebergTables) + .where( IcebergTables.catalog_name == self.name, IcebergTables.table_namespace == namespace, IcebergTables.table_name == table_name, IcebergTables.metadata_location == current_table.metadata_location, ) - .one() + .values( + metadata_location=updated_staged_table.metadata_location, + previous_metadata_location=current_table.metadata_location, + ) ) - tbl.metadata_location = new_metadata_location - tbl.previous_metadata_location = current_table.metadata_location - except NoResultFound as e: - raise CommitFailedException(f"Table has been updated by another process: {namespace}.{table_name}") from e - session.commit() + result = session.execute(stmt) + if result.rowcount < 1: + raise CommitFailedException(f"Table has been updated by another process: {namespace}.{table_name}") + else: + try: + tbl = ( + session.query(IcebergTables) + .with_for_update(of=IcebergTables) + .filter( + IcebergTables.catalog_name == self.name, + IcebergTables.table_namespace == namespace, + IcebergTables.table_name == table_name, + IcebergTables.metadata_location == current_table.metadata_location, + ) + .one() + ) + tbl.metadata_location = updated_staged_table.metadata_location + tbl.previous_metadata_location = current_table.metadata_location + except NoResultFound as e: + raise CommitFailedException(f"Table has been updated by another process: {namespace}.{table_name}") from e + session.commit() + else: + # table does not exist, create it + try: + session.add( + IcebergTables( + catalog_name=self.name, + table_namespace=namespace, + table_name=table_name, + metadata_location=updated_staged_table.metadata_location, + previous_metadata_location=None, + ) + ) + session.commit() + except IntegrityError as e: + raise TableAlreadyExistsError(f"Table {namespace}.{table_name} already exists") from e - return CommitTableResponse(metadata=updated_metadata, metadata_location=new_metadata_location) + return CommitTableResponse( + metadata=updated_staged_table.metadata, metadata_location=updated_staged_table.metadata_location + ) def _namespace_exists(self, identifier: Union[str, Identifier]) -> bool: namespace_tuple = Catalog.identifier_to_tuple(identifier) diff --git a/tests/catalog/test_sql.py b/tests/catalog/test_sql.py index 6dc498233e..545916223a 100644 --- a/tests/catalog/test_sql.py +++ b/tests/catalog/test_sql.py @@ -1350,6 +1350,66 @@ def 
test_write_and_evolve(catalog: SqlCatalog, format_version: int) -> None: snapshot_update.append_data_file(data_file) +@pytest.mark.parametrize( + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), + ], +) +@pytest.mark.parametrize("format_version", [1, 2]) +def test_create_table_transaction(catalog: SqlCatalog, format_version: int) -> None: + identifier = f"default.arrow_create_table_transaction_{catalog.name}_{format_version}" + try: + catalog.create_namespace("default") + except NamespaceAlreadyExistsError: + pass + + try: + catalog.drop_table(identifier=identifier) + except NoSuchTableError: + pass + + pa_table = pa.Table.from_pydict( + { + "foo": ["a", None, "z"], + }, + schema=pa.schema([pa.field("foo", pa.string(), nullable=True)]), + ) + + pa_table_with_column = pa.Table.from_pydict( + { + "foo": ["a", None, "z"], + "bar": [19, None, 25], + }, + schema=pa.schema([ + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=True), + ]), + ) + + with catalog.create_table_transaction( + identifier=identifier, schema=pa_table.schema, properties={"format-version": str(format_version)} + ) as txn: + with txn.update_snapshot().fast_append() as snapshot_update: + for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=pa_table, io=txn._table.io): + snapshot_update.append_data_file(data_file) + + with txn.update_schema() as schema_txn: + schema_txn.union_by_name(pa_table_with_column.schema) + + with txn.update_snapshot().fast_append() as snapshot_update: + for data_file in _dataframe_to_data_files( + table_metadata=txn.table_metadata, df=pa_table_with_column, io=txn._table.io + ): + snapshot_update.append_data_file(data_file) + + tbl = catalog.load_table(identifier=identifier) + assert tbl.format_version == format_version + assert len(tbl.scan().to_arrow()) == 6 + + @pytest.mark.parametrize( "catalog", [ From 84a2c043870111937e2802132486d8eb5979570e Mon Sep 17 00:00:00 2001 From: Honah J Date: Fri, 31 May 2024 00:33:03 -0700 Subject: [PATCH 67/80] Support CreateTableTransaction for HiveCatalog (#683) --- pyiceberg/catalog/__init__.py | 2 +- pyiceberg/catalog/hive.py | 161 +++++++++++-------- tests/integration/test_writes/test_writes.py | 16 +- 3 files changed, 104 insertions(+), 75 deletions(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index ea2bc65760..9a951b5c8e 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -761,7 +761,7 @@ def _create_staged_table( metadata = new_table_metadata( location=location, schema=schema, partition_spec=partition_spec, sort_order=sort_order, properties=properties ) - io = load_file_io(properties=self.properties, location=metadata_location) + io = self._load_file_io(properties=properties, location=metadata_location) return StagedTable( identifier=(self.name, database_name, table_name), metadata=metadata, diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index 13b57b6ea9..83bbd50779 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -70,11 +70,11 @@ NamespaceNotEmptyError, NoSuchIcebergTableError, NoSuchNamespaceError, + NoSuchPropertyException, NoSuchTableError, TableAlreadyExistsError, WaitingForLockException, ) -from pyiceberg.io import FileIO, load_file_io from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema, SchemaVisitor, visit from pyiceberg.serializers import 
FromInputFile @@ -82,11 +82,10 @@ CommitTableRequest, CommitTableResponse, PropertyUtil, + StagedTable, Table, TableProperties, - update_table_metadata, ) -from pyiceberg.table.metadata import new_table_metadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties from pyiceberg.types import ( @@ -272,10 +271,12 @@ def __init__(self, name: str, **properties: str): DEFAULT_LOCK_CHECK_RETRIES, ) - def _convert_hive_into_iceberg(self, table: HiveTable, io: FileIO) -> Table: + def _convert_hive_into_iceberg(self, table: HiveTable) -> Table: properties: Dict[str, str] = table.parameters if TABLE_TYPE not in properties: - raise NoSuchTableError(f"Property table_type missing, could not determine type: {table.dbName}.{table.tableName}") + raise NoSuchPropertyException( + f"Property table_type missing, could not determine type: {table.dbName}.{table.tableName}" + ) table_type = properties[TABLE_TYPE] if table_type.lower() != ICEBERG: @@ -286,8 +287,9 @@ def _convert_hive_into_iceberg(self, table: HiveTable, io: FileIO) -> Table: if prop_metadata_location := properties.get(METADATA_LOCATION): metadata_location = prop_metadata_location else: - raise NoSuchTableError(f"Table property {METADATA_LOCATION} is missing") + raise NoSuchPropertyException(f"Table property {METADATA_LOCATION} is missing") + io = self._load_file_io(location=metadata_location) file = io.new_input(metadata_location) metadata = FromInputFile.table_metadata(file) return Table( @@ -298,6 +300,38 @@ def _convert_hive_into_iceberg(self, table: HiveTable, io: FileIO) -> Table: catalog=self, ) + def _convert_iceberg_into_hive(self, table: Table) -> HiveTable: + identifier_tuple = self.identifier_to_tuple_without_catalog(table.identifier) + database_name, table_name = self.identifier_to_database_and_table(identifier_tuple, NoSuchTableError) + current_time_millis = int(time.time() * 1000) + + return HiveTable( + dbName=database_name, + tableName=table_name, + owner=table.properties[OWNER] if table.properties and OWNER in table.properties else getpass.getuser(), + createTime=current_time_millis // 1000, + lastAccessTime=current_time_millis // 1000, + sd=_construct_hive_storage_descriptor( + table.schema(), + table.location(), + PropertyUtil.property_as_bool(self.properties, HIVE2_COMPATIBLE, HIVE2_COMPATIBLE_DEFAULT), + ), + tableType=EXTERNAL_TABLE, + parameters=_construct_parameters(table.metadata_location), + ) + + def _create_hive_table(self, open_client: Client, hive_table: HiveTable) -> None: + try: + open_client.create_table(hive_table) + except AlreadyExistsException as e: + raise TableAlreadyExistsError(f"Table {hive_table.dbName}.{hive_table.tableName} already exists") from e + + def _get_hive_table(self, open_client: Client, database_name: str, table_name: str) -> HiveTable: + try: + return open_client.get_table(dbname=database_name, tbl_name=table_name) + except NoSuchObjectException as e: + raise NoSuchTableError(f"Table does not exists: {table_name}") from e + def create_table( self, identifier: Union[str, Identifier], @@ -324,45 +358,25 @@ def create_table( AlreadyExistsError: If a table with the name already exists. ValueError: If the identifier is invalid. 
""" - schema: Schema = self._convert_schema_if_needed(schema) # type: ignore - properties = {**DEFAULT_PROPERTIES, **properties} - database_name, table_name = self.identifier_to_database_and_table(identifier) - current_time_millis = int(time.time() * 1000) - - location = self._resolve_table_location(location, database_name, table_name) - - metadata_location = self._get_metadata_location(location=location) - metadata = new_table_metadata( - location=location, + staged_table = self._create_staged_table( + identifier=identifier, schema=schema, + location=location, partition_spec=partition_spec, sort_order=sort_order, properties=properties, ) - io = load_file_io({**self.properties, **properties}, location=location) - self._write_metadata(metadata, io, metadata_location) + database_name, table_name = self.identifier_to_database_and_table(identifier) - tbl = HiveTable( - dbName=database_name, - tableName=table_name, - owner=properties[OWNER] if properties and OWNER in properties else getpass.getuser(), - createTime=current_time_millis // 1000, - lastAccessTime=current_time_millis // 1000, - sd=_construct_hive_storage_descriptor( - schema, location, PropertyUtil.property_as_bool(self.properties, HIVE2_COMPATIBLE, HIVE2_COMPATIBLE_DEFAULT) - ), - tableType=EXTERNAL_TABLE, - parameters=_construct_parameters(metadata_location), - ) - try: - with self._client as open_client: - open_client.create_table(tbl) - hive_table = open_client.get_table(dbname=database_name, tbl_name=table_name) - except AlreadyExistsException as e: - raise TableAlreadyExistsError(f"Table {database_name}.{table_name} already exists") from e + self._write_metadata(staged_table.metadata, staged_table.io, staged_table.metadata_location) + tbl = self._convert_iceberg_into_hive(staged_table) + + with self._client as open_client: + self._create_hive_table(open_client, tbl) + hive_table = open_client.get_table(dbname=database_name, tbl_name=table_name) - return self._convert_hive_into_iceberg(hive_table, io) + return self._convert_hive_into_iceberg(hive_table) def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table: """Register a new table using existing metadata. 
@@ -437,36 +451,52 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons else: raise CommitFailedException(f"Failed to acquire lock for {table_request.identifier}, state: {lock.state}") - hive_table = open_client.get_table(dbname=database_name, tbl_name=table_name) - io = load_file_io({**self.properties, **hive_table.parameters}, hive_table.sd.location) - current_table = self._convert_hive_into_iceberg(hive_table, io) - - base_metadata = current_table.metadata - for requirement in table_request.requirements: - requirement.validate(base_metadata) - - updated_metadata = update_table_metadata(base_metadata, table_request.updates) - if updated_metadata == base_metadata: + hive_table: Optional[HiveTable] + current_table: Optional[Table] + try: + hive_table = self._get_hive_table(open_client, database_name, table_name) + current_table = self._convert_hive_into_iceberg(hive_table) + except NoSuchTableError: + hive_table = None + current_table = None + + updated_staged_table = self._update_and_stage_table(current_table, table_request) + if current_table and updated_staged_table.metadata == current_table.metadata: # no changes, do nothing - return CommitTableResponse(metadata=base_metadata, metadata_location=current_table.metadata_location) - - # write new metadata - new_metadata_version = self._parse_metadata_version(current_table.metadata_location) + 1 - new_metadata_location = self._get_metadata_location(current_table.metadata.location, new_metadata_version) - self._write_metadata(updated_metadata, current_table.io, new_metadata_location) - - hive_table.parameters = _construct_parameters( - metadata_location=new_metadata_location, previous_metadata_location=current_table.metadata_location + return CommitTableResponse(metadata=current_table.metadata, metadata_location=current_table.metadata_location) + self._write_metadata( + metadata=updated_staged_table.metadata, + io=updated_staged_table.io, + metadata_path=updated_staged_table.metadata_location, ) - open_client.alter_table(dbname=database_name, tbl_name=table_name, new_tbl=hive_table) - except NoSuchObjectException as e: - raise NoSuchTableError(f"Table does not exist: {table_name}") from e + + if hive_table and current_table: + # Table exists, update it. + hive_table.parameters = _construct_parameters( + metadata_location=updated_staged_table.metadata_location, + previous_metadata_location=current_table.metadata_location, + ) + open_client.alter_table(dbname=database_name, tbl_name=table_name, new_tbl=hive_table) + else: + # Table does not exist, create it. + hive_table = self._convert_iceberg_into_hive( + StagedTable( + identifier=(self.name, database_name, table_name), + metadata=updated_staged_table.metadata, + metadata_location=updated_staged_table.metadata_location, + io=updated_staged_table.io, + catalog=self, + ) + ) + self._create_hive_table(open_client, hive_table) except WaitingForLockException as e: raise CommitFailedException(f"Failed to acquire lock for {table_request.identifier}, state: {lock.state}") from e finally: open_client.unlock(UnlockRequest(lockid=lock.lockid)) - return CommitTableResponse(metadata=updated_metadata, metadata_location=new_metadata_location) + return CommitTableResponse( + metadata=updated_staged_table.metadata, metadata_location=updated_staged_table.metadata_location + ) def load_table(self, identifier: Union[str, Identifier]) -> Table: """Load the table's metadata and return the table instance. 
@@ -485,14 +515,11 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table: """ identifier_tuple = self.identifier_to_tuple_without_catalog(identifier) database_name, table_name = self.identifier_to_database_and_table(identifier_tuple, NoSuchTableError) - try: - with self._client as open_client: - hive_table = open_client.get_table(dbname=database_name, tbl_name=table_name) - except NoSuchObjectException as e: - raise NoSuchTableError(f"Table does not exists: {table_name}") from e - io = load_file_io({**self.properties, **hive_table.parameters}, hive_table.sd.location) - return self._convert_hive_into_iceberg(hive_table, io) + with self._client as open_client: + hive_table = self._get_hive_table(open_client, database_name, table_name) + + return self._convert_hive_into_iceberg(hive_table) def drop_table(self, identifier: Union[str, Identifier]) -> None: """Drop a table. diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py index 0941b35850..e329adcd5c 100644 --- a/tests/integration/test_writes/test_writes.py +++ b/tests/integration/test_writes/test_writes.py @@ -34,6 +34,7 @@ from pyiceberg.catalog import Catalog from pyiceberg.catalog.hive import HiveCatalog +from pyiceberg.catalog.rest import RestCatalog from pyiceberg.catalog.sql import SqlCatalog from pyiceberg.exceptions import NoSuchTableError from pyiceberg.partitioning import PartitionField, PartitionSpec @@ -637,17 +638,18 @@ def test_write_and_evolve(session_catalog: Catalog, format_version: int) -> None @pytest.mark.integration -@pytest.mark.parametrize("format_version", [2]) -def test_create_table_transaction(session_catalog: Catalog, format_version: int) -> None: - if format_version == 1: +@pytest.mark.parametrize("format_version", [1, 2]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) +def test_create_table_transaction(catalog: Catalog, format_version: int) -> None: + if format_version == 1 and isinstance(catalog, RestCatalog): pytest.skip( "There is a bug in the REST catalog (maybe server side) that prevents create and commit a staged version 1 table" ) - identifier = f"default.arrow_create_table_transaction{format_version}" + identifier = f"default.arrow_create_table_transaction_{catalog.name}_{format_version}" try: - session_catalog.drop_table(identifier=identifier) + catalog.drop_table(identifier=identifier) except NoSuchTableError: pass @@ -669,7 +671,7 @@ def test_create_table_transaction(session_catalog: Catalog, format_version: int) ]), ) - with session_catalog.create_table_transaction( + with catalog.create_table_transaction( identifier=identifier, schema=pa_table.schema, properties={"format-version": str(format_version)} ) as txn: with txn.update_snapshot().fast_append() as snapshot_update: @@ -685,7 +687,7 @@ def test_create_table_transaction(session_catalog: Catalog, format_version: int) ): snapshot_update.append_data_file(data_file) - tbl = session_catalog.load_table(identifier=identifier) + tbl = catalog.load_table(identifier=identifier) assert tbl.format_version == format_version assert len(tbl.scan().to_arrow()) == 6 From 8d79664d3a6010a92468bfbee1a55283591d7800 Mon Sep 17 00:00:00 2001 From: Yothin M <689679+yothinix@users.noreply.github.com> Date: Fri, 31 May 2024 15:00:53 +0700 Subject: [PATCH 68/80] Support viewfs scheme along side with hdfs (#777) --- pyiceberg/io/__init__.py | 1 + pyiceberg/io/pyarrow.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) 
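The two patches above route both SqlCatalog and HiveCatalog through the same staged-table commit path, so a table created inside a transaction only becomes visible in the catalog once the transaction commits. As a rough usage sketch (not part of the patch: `catalog` stands for an already-configured SqlCatalog or HiveCatalog, the `default` namespace is assumed to exist, and `_dataframe_to_data_files` is assumed importable from `pyiceberg.table`, as the new tests do):

import pyarrow as pa
from pyiceberg.table import _dataframe_to_data_files  # helper used by the new tests; import location assumed

df = pa.Table.from_pydict(
    {"foo": ["a", None, "z"]},
    schema=pa.schema([pa.field("foo", pa.string(), nullable=True)]),
)

# Nothing is registered in the catalog until the context manager exits successfully.
with catalog.create_table_transaction("default.example_tbl", schema=df.schema) as txn:
    with txn.update_snapshot().fast_append() as append_files:
        for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=df, io=txn._table.io):
            append_files.append_data_file(data_file)

assert len(catalog.load_table("default.example_tbl").scan().to_arrow()) == 3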
diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index 1a78f306c6..9143cf6650 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -284,6 +284,7 @@ def delete(self, location: Union[str, InputFile, OutputFile]) -> None: "gs": [ARROW_FILE_IO], "file": [ARROW_FILE_IO, FSSPEC_FILE_IO], "hdfs": [ARROW_FILE_IO], + "viewfs": [ARROW_FILE_IO], "abfs": [FSSPEC_FILE_IO], "abfss": [FSSPEC_FILE_IO], } diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 9216c37f15..04f30ec63e 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -332,7 +332,7 @@ def parse_location(location: str) -> Tuple[str, str, str]: uri = urlparse(location) if not uri.scheme: return "file", uri.netloc, os.path.abspath(location) - elif uri.scheme == "hdfs": + elif uri.scheme in ("hdfs", "viewfs"): return uri.scheme, uri.netloc, uri.path else: return uri.scheme, uri.netloc, f"{uri.netloc}{uri.path}" @@ -356,12 +356,12 @@ def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSyste client_kwargs["connect_timeout"] = float(connect_timeout) return S3FileSystem(**client_kwargs) - elif scheme == "hdfs": + elif scheme in ("hdfs", "viewfs"): from pyarrow.fs import HadoopFileSystem hdfs_kwargs: Dict[str, Any] = {} if netloc: - return HadoopFileSystem.from_uri(f"hdfs://{netloc}") + return HadoopFileSystem.from_uri(f"{scheme}://{netloc}") if host := self.properties.get(HDFS_HOST): hdfs_kwargs["host"] = host if port := self.properties.get(HDFS_PORT): From 20f6afdf5f000ea5b167e804012f2000aa5b8573 Mon Sep 17 00:00:00 2001 From: Christian Date: Fri, 31 May 2024 17:19:40 +0200 Subject: [PATCH 69/80] Update `fsspec.py`to respect `s3.signer.uri property` (#741) * Update fsspec.py to respect s3.signer.uri property * Add S3_SIGNER_URI constant, add docs --------- Co-authored-by: Fokko Driesprong --- mkdocs/docs/configuration.md | 1 + pyiceberg/io/__init__.py | 1 + pyiceberg/io/fsspec.py | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index c0879b1d28..f8a69119c8 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -89,6 +89,7 @@ For the FileIO there are several configuration options available: | s3.access-key-id | admin | Configure the static secret access key used to access the FileIO. | | s3.secret-access-key | password | Configure the static session token used to access the FileIO. | | s3.signer | bearer | Configure the signature version of the FileIO. | +| s3.signer.uri | http://my.signer:8080/s3 | Configure the remote signing uri if it differs from the catalog uri. Remote signing is only implemented for `FsspecFileIO`. The final request is sent to `/v1/aws/s3/sign`. | | s3.region | us-west-2 | Sets the region of the bucket | | s3.proxy-uri | http://my.proxy.com:8080 | Configure the proxy server to be used by the FileIO. | | s3.connect-timeout | 60.0 | Configure socket connection timeout, in seconds. 
| diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index 9143cf6650..36c3e625c8 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -53,6 +53,7 @@ S3_REGION = "s3.region" S3_PROXY_URI = "s3.proxy-uri" S3_CONNECT_TIMEOUT = "s3.connect-timeout" +S3_SIGNER_URI = "s3.signer.uri" HDFS_HOST = "hdfs.host" HDFS_PORT = "hdfs.port" HDFS_USER = "hdfs.user" diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index 1089c9fe50..bb76f043c9 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -63,6 +63,7 @@ S3_REGION, S3_SECRET_ACCESS_KEY, S3_SESSION_TOKEN, + S3_SIGNER_URI, ADLFS_ClIENT_SECRET, FileIO, InputFile, @@ -79,7 +80,7 @@ def s3v4_rest_signer(properties: Properties, request: AWSRequest, **_: Any) -> A if TOKEN not in properties: raise SignError("Signer set, but token is not available") - signer_url = properties["uri"].rstrip("/") + signer_url = properties.get(S3_SIGNER_URI, properties["uri"]).rstrip("/") signer_headers = {"Authorization": f"Bearer {properties[TOKEN]}"} signer_body = { "method": request.method, From 5dd846d729662522409f0b06da923daefe7dfd97 Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Sat, 4 May 2024 02:20:39 +0000 Subject: [PATCH 70/80] checkpoint --- pyiceberg/transforms.py | 25 ++++++ .../test_writes/test_partitioned_writes.py | 89 +++++++++++++++++++ 2 files changed, 114 insertions(+) diff --git a/pyiceberg/transforms.py b/pyiceberg/transforms.py index 6dcae59e49..c75f7861c0 100644 --- a/pyiceberg/transforms.py +++ b/pyiceberg/transforms.py @@ -433,6 +433,31 @@ def __repr__(self) -> str: """Return the string representation of the MonthTransform class.""" return "MonthTransform()" + def pyarrow_transform(self, source: IcebergType) -> Callable: + import pyarrow as pa + import pyarrow.compute as pc + + if isinstance(source, DateType): + + def month_func(v: Any) -> int: + return pc.add( + pc.multiply(pc.years_between(pa.scalar(date(1970, 1, 1)), v), pa.scalar(12)), + pc.add(pc.month(v), pa.scalar(-1)), + ) + + elif isinstance(source, (TimestampType, TimestamptzType)): + + def month_func(v: Any) -> int: + return pc.add( + pc.multiply(pc.years_between(pa.scalar(datetime(1970, 1, 1)), pc.local_timestamp(v)), pa.scalar(12)), + pc.add(pc.month(v), pa.scalar(-1)), + ) + + else: + raise ValueError(f"Cannot apply month transform for type: {source}") + + return lambda v: month_func(v) if v is not None else None + class DayTransform(TimeTransform[S]): """Transforms a datetime value into a day value. diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index 5cb03e59d8..ddfb6b0f1d 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -16,6 +16,8 @@ # under the License. 
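The viewfs change a few hunks back is small but easy to verify in isolation: a `viewfs://` location is now parsed the same way as `hdfs://` and routed to pyarrow's HadoopFileSystem (which still requires a working Hadoop client at runtime). A minimal sketch of the parsing side only, with a placeholder nameservice:

from pyiceberg.io.pyarrow import PyArrowFileIO

scheme, netloc, path = PyArrowFileIO().parse_location("viewfs://nameservice1/warehouse/db/tbl/metadata/v1.metadata.json")
assert (scheme, netloc, path) == ("viewfs", "nameservice1", "/warehouse/db/tbl/metadata/v1.metadata.json")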
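For the `s3.signer.uri` property documented above, the value only changes where signing requests are sent; when it is unset, the signer keeps falling back to the catalog `uri` as before. A hedged configuration sketch (endpoints are placeholders, and the `S3V4RestSigner` name plus the explicit `py-io-impl` pin are assumptions about how the fsspec signer is selected, not something this patch introduces):

from pyiceberg.catalog import load_catalog

catalog = load_catalog(
    "default",
    **{
        "uri": "https://rest-catalog.example.com",          # catalog endpoint (the old fallback for signing)
        "py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO",    # remote signing is only implemented for FsspecFileIO
        "s3.signer": "S3V4RestSigner",
        "s3.signer.uri": "http://my.signer:8080/s3",         # sign requests now go to <this>/v1/aws/s3/sign
    },
)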
# pylint:disable=redefined-outer-name +from datetime import date, datetime, timezone + import pyarrow as pa import pytest from pyspark.sql import SparkSession @@ -36,6 +38,54 @@ from utils import TABLE_SCHEMA, _create_table +@pytest.fixture(scope="session") +def arrow_table_dates() -> pa.Table: + """Pyarrow table with only null values.""" + TEST_DATES = [date(2023, 12, 31), date(2024, 1, 1), date(2024, 1, 31), date(2024, 2, 1)] + return pa.Table.from_pydict( + {"dates": TEST_DATES}, + schema=pa.schema([ + ("dates", pa.date32()), + ]), + ) + + +@pytest.fixture(scope="session") +def arrow_table_timestamp() -> pa.Table: + """Pyarrow table with only null values.""" + TEST_DATETIMES = [ + datetime(2023, 12, 31, 0, 0, 0), + datetime(2024, 1, 1, 0, 0, 0), + datetime(2024, 1, 31, 0, 0, 0), + datetime(2024, 2, 1, 0, 0, 0), + datetime(2024, 2, 1, 6, 0, 0), + ] + return pa.Table.from_pydict( + {"dates": TEST_DATETIMES}, + schema=pa.schema([ + ("timestamp", pa.timestamp(unit="us")), + ]), + ) + + +@pytest.fixture(scope="session") +def arrow_table_timestamptz() -> pa.Table: + """Pyarrow table with only null values.""" + TEST_DATETIMES_WITH_TZ = [ + datetime(2023, 12, 31, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 1, 31, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 2, 1, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 2, 1, 6, 0, 0, tzinfo=timezone.utc), + ] + return pa.Table.from_pydict( + {"dates": TEST_DATETIMES_WITH_TZ}, + schema=pa.schema([ + ("timestamptz", pa.timestamp(unit="us", tz="UTC")), + ]), + ) + + @pytest.mark.integration @pytest.mark.parametrize( "part_col", ["int", "bool", "string", "string_long", "long", "float", "double", "date", "timestamp", "timestamptz", "binary"] @@ -384,3 +434,42 @@ def test_unsupported_transform( with pytest.raises(ValueError, match="All transforms are not supported.*"): tbl.append(arrow_table_with_null) + + +@pytest.mark.integration +@pytest.mark.parametrize( + "part_col", ['int', 'bool', 'string', "string_long", "long", "float", "double", "date", "timestamptz", "timestamp", "binary"] +) +@pytest.mark.parametrize("format_version", [1, 2]) +def test_append_time_transform_partitioned_table( + session_catalog: Catalog, spark: SparkSession, arrow_table_with_null: pa.Table, part_col: str, format_version: int +) -> None: + # Given + identifier = f"default.arrow_table_v{format_version}_appended_with_null_partitioned_on_col_{part_col}" + nested_field = TABLE_SCHEMA.find_field(part_col) + partition_spec = PartitionSpec( + PartitionField(source_id=nested_field.field_id, field_id=1001, transform=IdentityTransform(), name=part_col) + ) + + # When + tbl = _create_table( + session_catalog=session_catalog, + identifier=identifier, + properties={"format-version": str(format_version)}, + data=[], + partition_spec=partition_spec, + ) + # Append with arrow_table_1 with lines [A,B,C] and then arrow_table_2 with lines[A,B,C,A,B,C] + tbl.append(arrow_table_with_null) + tbl.append(pa.concat_tables([arrow_table_with_null, arrow_table_with_null])) + + # Then + assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}" + df = spark.table(identifier) + for col in TEST_DATA_WITH_NULL.keys(): + df = spark.table(identifier) + assert df.where(f"{col} is not null").count() == 6, f"Expected 6 non-null rows for {col}" + assert df.where(f"{col} is null").count() == 3, f"Expected 3 null rows for {col}" + # expecting 6 files: first append with [A], [B], [C], second append with [A, A], [B, B], 
[C, C] + rows = spark.sql(f"select partition from {identifier}.files").collect() + assert len(rows) == 6 From 6357193ee314e7443c4d9599856bb7a8fe3716fd Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Sun, 5 May 2024 16:27:57 +0000 Subject: [PATCH 71/80] checkpoint2 --- pyiceberg/transforms.py | 78 ++++++++++++++----- .../test_writes/test_partitioned_writes.py | 76 +++--------------- tests/test_transforms.py | 71 ++++++++++++++++- 3 files changed, 141 insertions(+), 84 deletions(-) diff --git a/pyiceberg/transforms.py b/pyiceberg/transforms.py index c75f7861c0..0cf26fe2a2 100644 --- a/pyiceberg/transforms.py +++ b/pyiceberg/transforms.py @@ -20,7 +20,7 @@ from abc import ABC, abstractmethod from enum import IntEnum from functools import singledispatch -from typing import Any, Callable, Generic, Optional, TypeVar +from typing import TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar from typing import Literal as LiteralType from uuid import UUID @@ -82,6 +82,9 @@ from pyiceberg.utils.parsing import ParseNumberFromBrackets from pyiceberg.utils.singleton import Singleton +if TYPE_CHECKING: + import pyarrow as pa + S = TypeVar("S") T = TypeVar("T") @@ -391,6 +394,21 @@ def __repr__(self) -> str: """Return the string representation of the YearTransform class.""" return "YearTransform()" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + import pyarrow as pa + import pyarrow.compute as pc + + if isinstance(source, DateType): + epoch = datetime.EPOCH_DATE + elif isinstance(source, TimestampType): + epoch = datetime.EPOCH_TIMESTAMP + elif isinstance(source, TimestamptzType): + epoch = datetime.EPOCH_TIMESTAMPTZ + else: + raise ValueError(f"Cannot apply year transform for type: {source}") + + return lambda v: pc.years_between(pa.scalar(epoch), v) if v is not None else None + class MonthTransform(TimeTransform[S]): """Transforms a datetime value into a month value. 
@@ -433,29 +451,25 @@ def __repr__(self) -> str: """Return the string representation of the MonthTransform class.""" return "MonthTransform()" - def pyarrow_transform(self, source: IcebergType) -> Callable: + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": import pyarrow as pa import pyarrow.compute as pc - - if isinstance(source, DateType): - - def month_func(v: Any) -> int: - return pc.add( - pc.multiply(pc.years_between(pa.scalar(date(1970, 1, 1)), v), pa.scalar(12)), - pc.add(pc.month(v), pa.scalar(-1)), - ) - - elif isinstance(source, (TimestampType, TimestamptzType)): - - def month_func(v: Any) -> int: - return pc.add( - pc.multiply(pc.years_between(pa.scalar(datetime(1970, 1, 1)), pc.local_timestamp(v)), pa.scalar(12)), - pc.add(pc.month(v), pa.scalar(-1)), - ) + if isinstance(source, DateType): + epoch = datetime.EPOCH_DATE + elif isinstance(source, TimestampType): + epoch = datetime.EPOCH_TIMESTAMP + elif isinstance(source, TimestamptzType): + epoch = datetime.EPOCH_TIMESTAMPTZ else: raise ValueError(f"Cannot apply month transform for type: {source}") + def month_func(v: pa.Array) -> pa.Array: + return pc.add( + pc.multiply(pc.years_between(pa.scalar(epoch), v), pa.scalar(12)), + pc.add(pc.month(v), pa.scalar(-1)), + ) + return lambda v: month_func(v) if v is not None else None @@ -503,6 +517,21 @@ def __repr__(self) -> str: """Return the string representation of the DayTransform class.""" return "DayTransform()" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + import pyarrow as pa + import pyarrow.compute as pc + + if isinstance(source, DateType): + epoch = datetime.EPOCH_DATE + elif isinstance(source, TimestampType): + epoch = datetime.EPOCH_TIMESTAMP + elif isinstance(source, TimestamptzType): + epoch = datetime.EPOCH_TIMESTAMPTZ + else: + raise ValueError(f"Cannot apply day transform for type: {source}") + + return lambda v: pc.days_between(pa.scalar(epoch), v) if v is not None else None + class HourTransform(TimeTransform[S]): """Transforms a datetime value into a hour value. @@ -540,6 +569,19 @@ def __repr__(self) -> str: """Return the string representation of the HourTransform class.""" return "HourTransform()" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + import pyarrow as pa + import pyarrow.compute as pc + + if isinstance(source, TimestampType): + epoch = datetime.EPOCH_TIMESTAMP + elif isinstance(source, TimestamptzType): + epoch = datetime.EPOCH_TIMESTAMPTZ + else: + raise ValueError(f"Cannot apply month transform for type: {source}") + + return lambda v: pc.hours_between(pa.scalar(epoch), v) if v is not None else None + def _base64encode(buffer: bytes) -> str: """Convert bytes to base64 string.""" diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index ddfb6b0f1d..f8335274ab 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -16,11 +16,11 @@ # under the License. 
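Since the hunk above reworks the vectorized month transform (and adds year/day/hour variants), a quick standalone check of what `pyarrow_transform` produces can help. The expected values below are simply the Iceberg month ordinal, months since 1970-01, which the new tests assert the pyarrow path must match:

from datetime import date

import pyarrow as pa

from pyiceberg.transforms import MonthTransform
from pyiceberg.types import DateType

arr = pa.array([date(2023, 12, 31), date(2024, 1, 1), None], type=pa.date32())
to_month = MonthTransform().pyarrow_transform(DateType())
# (2023 - 1970) * 12 + 11 = 647, (2024 - 1970) * 12 + 0 = 648; nulls stay null.
print(to_month(arr).to_pylist())  # expected: [647, 648, None]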
# pylint:disable=redefined-outer-name -from datetime import date, datetime, timezone import pyarrow as pa import pytest from pyspark.sql import SparkSession +from typing import Any from pyiceberg.catalog import Catalog from pyiceberg.exceptions import NoSuchTableError @@ -31,6 +31,7 @@ HourTransform, IdentityTransform, MonthTransform, + Transform, TruncateTransform, YearTransform, ) @@ -38,54 +39,6 @@ from utils import TABLE_SCHEMA, _create_table -@pytest.fixture(scope="session") -def arrow_table_dates() -> pa.Table: - """Pyarrow table with only null values.""" - TEST_DATES = [date(2023, 12, 31), date(2024, 1, 1), date(2024, 1, 31), date(2024, 2, 1)] - return pa.Table.from_pydict( - {"dates": TEST_DATES}, - schema=pa.schema([ - ("dates", pa.date32()), - ]), - ) - - -@pytest.fixture(scope="session") -def arrow_table_timestamp() -> pa.Table: - """Pyarrow table with only null values.""" - TEST_DATETIMES = [ - datetime(2023, 12, 31, 0, 0, 0), - datetime(2024, 1, 1, 0, 0, 0), - datetime(2024, 1, 31, 0, 0, 0), - datetime(2024, 2, 1, 0, 0, 0), - datetime(2024, 2, 1, 6, 0, 0), - ] - return pa.Table.from_pydict( - {"dates": TEST_DATETIMES}, - schema=pa.schema([ - ("timestamp", pa.timestamp(unit="us")), - ]), - ) - - -@pytest.fixture(scope="session") -def arrow_table_timestamptz() -> pa.Table: - """Pyarrow table with only null values.""" - TEST_DATETIMES_WITH_TZ = [ - datetime(2023, 12, 31, 0, 0, 0, tzinfo=timezone.utc), - datetime(2024, 1, 1, 0, 0, 0, tzinfo=timezone.utc), - datetime(2024, 1, 31, 0, 0, 0, tzinfo=timezone.utc), - datetime(2024, 2, 1, 0, 0, 0, tzinfo=timezone.utc), - datetime(2024, 2, 1, 6, 0, 0, tzinfo=timezone.utc), - ] - return pa.Table.from_pydict( - {"dates": TEST_DATETIMES_WITH_TZ}, - schema=pa.schema([ - ("timestamptz", pa.timestamp(unit="us", tz="UTC")), - ]), - ) - - @pytest.mark.integration @pytest.mark.parametrize( "part_col", ["int", "bool", "string", "string_long", "long", "float", "double", "date", "timestamp", "timestamptz", "binary"] @@ -437,18 +390,19 @@ def test_unsupported_transform( @pytest.mark.integration +@pytest.mark.parametrize('transform', [YearTransform(), MonthTransform(), DayTransform()]) @pytest.mark.parametrize( - "part_col", ['int', 'bool', 'string', "string_long", "long", "float", "double", "date", "timestamptz", "timestamp", "binary"] + "part_col", ["date", "timestamp", "timestamptz"] ) @pytest.mark.parametrize("format_version", [1, 2]) -def test_append_time_transform_partitioned_table( - session_catalog: Catalog, spark: SparkSession, arrow_table_with_null: pa.Table, part_col: str, format_version: int +def test_append_ymd_transform_partitioned( + session_catalog: Catalog, spark: SparkSession, arrow_table_with_null: pa.Table, transform: Transform[Any, Any], part_col: str, format_version: int ) -> None: # Given - identifier = f"default.arrow_table_v{format_version}_appended_with_null_partitioned_on_col_{part_col}" + identifier = f"default.arrow_table_v{format_version}_with_ymd_transform_partitioned_on_col_{part_col}" nested_field = TABLE_SCHEMA.find_field(part_col) partition_spec = PartitionSpec( - PartitionField(source_id=nested_field.field_id, field_id=1001, transform=IdentityTransform(), name=part_col) + PartitionField(source_id=nested_field.field_id, field_id=1001, transform=transform, name=part_col) ) # When @@ -456,20 +410,14 @@ def test_append_time_transform_partitioned_table( session_catalog=session_catalog, identifier=identifier, properties={"format-version": str(format_version)}, - data=[], + data=[arrow_table_with_null], 
partition_spec=partition_spec, ) - # Append with arrow_table_1 with lines [A,B,C] and then arrow_table_2 with lines[A,B,C,A,B,C] - tbl.append(arrow_table_with_null) - tbl.append(pa.concat_tables([arrow_table_with_null, arrow_table_with_null])) # Then assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}" df = spark.table(identifier) + assert df.count() == 3, f"Expected 3 total rows for {identifier}" for col in TEST_DATA_WITH_NULL.keys(): - df = spark.table(identifier) - assert df.where(f"{col} is not null").count() == 6, f"Expected 6 non-null rows for {col}" - assert df.where(f"{col} is null").count() == 3, f"Expected 3 null rows for {col}" - # expecting 6 files: first append with [A], [B], [C], second append with [A, A], [B, B], [C, C] - rows = spark.sql(f"select partition from {identifier}.files").collect() - assert len(rows) == 6 + assert df.where(f"{col} is not null").count() == 2, f"Expected 2 non-null rows for {col}" + assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null" \ No newline at end of file diff --git a/tests/test_transforms.py b/tests/test_transforms.py index b8bef4b998..4a1e066b1e 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -15,9 +15,9 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=eval-used,protected-access,redefined-outer-name -from datetime import date +from datetime import date, datetime, timezone from decimal import Decimal -from typing import Any, Callable, Optional +from typing import TYPE_CHECKING, Any, Callable, Optional from uuid import UUID import mmh3 as mmh3 @@ -69,6 +69,7 @@ TimestampLiteral, literal, ) +from pyiceberg.partitioning import _to_partition_representation from pyiceberg.schema import Accessor from pyiceberg.transforms import ( BucketTransform, @@ -111,6 +112,9 @@ timestamptz_to_micros, ) +if TYPE_CHECKING: + import pyarrow as pa + @pytest.mark.parametrize( "test_input,test_type,expected", @@ -1808,3 +1812,66 @@ def test_strict_binary(bound_reference_binary: BoundReference[str]) -> None: _test_projection( lhs=transform.strict_project(name="name", pred=BoundIn(term=bound_reference_binary, literals=set_of_literals)), rhs=None ) + + +@pytest.fixture(scope="session") +def arrow_table_date_timestamps() -> "pa.Table": + """Pyarrow table with only date, timestamp and timestamptz values.""" + import pyarrow as pa + + return pa.Table.from_pydict( + { + "date": [date(2023, 12, 31), date(2024, 1, 1), date(2024, 1, 31), date(2024, 2, 1), date(2024, 2, 1), None], + "timestamp": [ + datetime(2023, 12, 31, 0, 0, 0), + datetime(2024, 1, 1, 0, 0, 0), + datetime(2024, 1, 31, 0, 0, 0), + datetime(2024, 2, 1, 0, 0, 0), + datetime(2024, 2, 1, 6, 0, 0), + None, + ], + "timestamptz": [ + datetime(2023, 12, 31, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 1, 31, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 2, 1, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 2, 1, 6, 0, 0, tzinfo=timezone.utc), + None, + ], + }, + schema=pa.schema([ + ("date", pa.date32()), + ("timestamp", pa.timestamp(unit="us")), + ("timestamptz", pa.timestamp(unit="us", tz="UTC")), + ]), + ) + + +@pytest.mark.parametrize('transform', [YearTransform(), MonthTransform(), DayTransform()]) +@pytest.mark.parametrize( + "source_col, source_type", [("date", DateType()), ("timestamp", TimestampType()), ("timestamptz", TimestamptzType())] +) +def test_ymd_pyarrow_transforms( + 
arrow_table_date_timestamps: "pa.Table", + source_col: str, + source_type: PrimitiveType, + transform: Transform[Any, Any], +) -> None: + assert transform.pyarrow_transform(source_type)(arrow_table_date_timestamps[source_col]).to_pylist() == [ + transform.transform(source_type)(_to_partition_representation(source_type, v)) + for v in arrow_table_date_timestamps[source_col].to_pylist() + ] + + +@pytest.mark.parametrize("source_col, source_type", [("timestamp", TimestampType()), ("timestamptz", TimestamptzType())]) +def test_hour_pyarrow_transforms(arrow_table_date_timestamps: "pa.Table", source_col: str, source_type: PrimitiveType) -> None: + assert HourTransform().pyarrow_transform(source_type)(arrow_table_date_timestamps[source_col]).to_pylist() == [ + HourTransform().transform(source_type)(_to_partition_representation(source_type, v)) + for v in arrow_table_date_timestamps[source_col].to_pylist() + ] + + +def test_hour_pyarrow_transforms_throws_with_dates(arrow_table_date_timestamps: "pa.Table") -> None: + # HourTransform is not supported for DateType + with pytest.raises(ValueError): + HourTransform().pyarrow_transform(DateType())(arrow_table_date_timestamps["date"]) From c30a57cfe93aaf979df949f801929ecf10079601 Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Sun, 5 May 2024 19:51:42 +0000 Subject: [PATCH 72/80] todo: sort with pyarrow_transform vals --- pyiceberg/table/__init__.py | 7 +++-- pyiceberg/transforms.py | 18 +++++++++++ .../test_writes/test_partitioned_writes.py | 31 ++++++++++++++++--- tests/test_transforms.py | 28 ++++++----------- 4 files changed, 57 insertions(+), 27 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index aa108de08b..ea88312368 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -392,10 +392,11 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT) if not isinstance(df, pa.Table): raise ValueError(f"Expected PyArrow table, got: {df}") - supported_transforms = {IdentityTransform} - if not all(type(field.transform) in supported_transforms for field in self.table_metadata.spec().fields): + if unsupported_partitions := [ + field for field in self.table_metadata.spec().fields if not field.transform.supports_pyarrow_transform + ]: raise ValueError( - f"All transforms are not supported, expected: {supported_transforms}, but get: {[str(field) for field in self.table_metadata.spec().fields if field.transform not in supported_transforms]}." + f"Not all partition types are supported for writes. Following partitions cannot be written using pyarrow: {unsupported_partitions}." ) _check_schema_compatible(self._table.schema(), other_schema=df.schema) diff --git a/pyiceberg/transforms.py b/pyiceberg/transforms.py index 0cf26fe2a2..c8af97c301 100644 --- a/pyiceberg/transforms.py +++ b/pyiceberg/transforms.py @@ -178,6 +178,10 @@ def __eq__(self, other: Any) -> bool: return self.root == other.root return False + @property + def supports_pyarrow_transform(self) -> bool: + return False + class BucketTransform(Transform[S, int]): """Base Transform class to transform a value into a bucket partition value. @@ -352,6 +356,13 @@ def dedup_name(self) -> str: def preserves_order(self) -> bool: return True + @abstractmethod + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": ... 
+ + @property + def supports_pyarrow_transform(self) -> bool: + return True + class YearTransform(TimeTransform[S]): """Transforms a datetime value into a year value. @@ -652,6 +663,13 @@ def __repr__(self) -> str: """Return the string representation of the IdentityTransform class.""" return "IdentityTransform()" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + return lambda v: v + + @property + def supports_pyarrow_transform(self) -> bool: + return True + class TruncateTransform(Transform[S, S]): """A transform for truncating a value to a specified width. diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index f8335274ab..3a0e38d3f2 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -17,10 +17,11 @@ # pylint:disable=redefined-outer-name +from typing import Any + import pyarrow as pa import pytest from pyspark.sql import SparkSession -from typing import Any from pyiceberg.catalog import Catalog from pyiceberg.exceptions import NoSuchTableError @@ -390,13 +391,24 @@ def test_unsupported_transform( @pytest.mark.integration -@pytest.mark.parametrize('transform', [YearTransform(), MonthTransform(), DayTransform()]) @pytest.mark.parametrize( - "part_col", ["date", "timestamp", "timestamptz"] + "transform,expected_rows", + [ + pytest.param(YearTransform(), 2, id="year_transform"), + pytest.param(MonthTransform(), 3, id="month_transform"), + pytest.param(DayTransform(), 3, id="day_transform"), + ], ) +@pytest.mark.parametrize("part_col", ["date", "timestamp", "timestamptz"]) @pytest.mark.parametrize("format_version", [1, 2]) def test_append_ymd_transform_partitioned( - session_catalog: Catalog, spark: SparkSession, arrow_table_with_null: pa.Table, transform: Transform[Any, Any], part_col: str, format_version: int + session_catalog: Catalog, + spark: SparkSession, + arrow_table_with_null: pa.Table, + transform: Transform[Any, Any], + expected_rows: int, + part_col: str, + format_version: int, ) -> None: # Given identifier = f"default.arrow_table_v{format_version}_with_ymd_transform_partitioned_on_col_{part_col}" @@ -420,4 +432,13 @@ def test_append_ymd_transform_partitioned( assert df.count() == 3, f"Expected 3 total rows for {identifier}" for col in TEST_DATA_WITH_NULL.keys(): assert df.where(f"{col} is not null").count() == 2, f"Expected 2 non-null rows for {col}" - assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null" \ No newline at end of file + assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null" + + assert tbl.inspect.partitions().num_rows == expected_rows + files_df = spark.sql( + f""" + SELECT * + FROM {identifier}.files + """ + ) + assert files_df.count() == expected_rows diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 4a1e066b1e..3f1591c01c 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -1847,7 +1847,7 @@ def arrow_table_date_timestamps() -> "pa.Table": ) -@pytest.mark.parametrize('transform', [YearTransform(), MonthTransform(), DayTransform()]) +@pytest.mark.parametrize('transform', [YearTransform(), MonthTransform(), DayTransform(), HourTransform()]) @pytest.mark.parametrize( "source_col, source_type", [("date", DateType()), ("timestamp", TimestampType()), ("timestamptz", TimestamptzType())] ) @@ -1857,21 +1857,11 @@ def test_ymd_pyarrow_transforms( source_type: 
PrimitiveType, transform: Transform[Any, Any], ) -> None: - assert transform.pyarrow_transform(source_type)(arrow_table_date_timestamps[source_col]).to_pylist() == [ - transform.transform(source_type)(_to_partition_representation(source_type, v)) - for v in arrow_table_date_timestamps[source_col].to_pylist() - ] - - -@pytest.mark.parametrize("source_col, source_type", [("timestamp", TimestampType()), ("timestamptz", TimestamptzType())]) -def test_hour_pyarrow_transforms(arrow_table_date_timestamps: "pa.Table", source_col: str, source_type: PrimitiveType) -> None: - assert HourTransform().pyarrow_transform(source_type)(arrow_table_date_timestamps[source_col]).to_pylist() == [ - HourTransform().transform(source_type)(_to_partition_representation(source_type, v)) - for v in arrow_table_date_timestamps[source_col].to_pylist() - ] - - -def test_hour_pyarrow_transforms_throws_with_dates(arrow_table_date_timestamps: "pa.Table") -> None: - # HourTransform is not supported for DateType - with pytest.raises(ValueError): - HourTransform().pyarrow_transform(DateType())(arrow_table_date_timestamps["date"]) + if transform.can_transform(source_type): + assert transform.pyarrow_transform(source_type)(arrow_table_date_timestamps[source_col]).to_pylist() == [ + transform.transform(source_type)(_to_partition_representation(source_type, v)) + for v in arrow_table_date_timestamps[source_col].to_pylist() + ] + else: + with pytest.raises(ValueError): + transform.pyarrow_transform(DateType())(arrow_table_date_timestamps[source_col]) From 541655f16940998420688529c639e8481d178c93 Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Mon, 6 May 2024 02:05:44 +0000 Subject: [PATCH 73/80] checkpoint --- pyiceberg/table/__init__.py | 59 +++++++++++++++---------------------- 1 file changed, 24 insertions(+), 35 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index ea88312368..4040f9a616 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -3644,33 +3644,6 @@ class TablePartition: arrow_table_partition: pa.Table -def _get_partition_sort_order(partition_columns: list[str], reverse: bool = False) -> dict[str, Any]: - order = "ascending" if not reverse else "descending" - null_placement = "at_start" if reverse else "at_end" - return {"sort_keys": [(column_name, order) for column_name in partition_columns], "null_placement": null_placement} - - -def group_by_partition_scheme(arrow_table: pa.Table, partition_columns: list[str]) -> pa.Table: - """Given a table, sort it by current partition scheme.""" - # only works for identity for now - sort_options = _get_partition_sort_order(partition_columns, reverse=False) - sorted_arrow_table = arrow_table.sort_by(sorting=sort_options["sort_keys"], null_placement=sort_options["null_placement"]) - return sorted_arrow_table - - -def get_partition_columns( - spec: PartitionSpec, - schema: Schema, -) -> list[str]: - partition_cols = [] - for partition_field in spec.fields: - column_name = schema.find_column_name(partition_field.source_id) - if not column_name: - raise ValueError(f"{partition_field=} could not be found in {schema}.") - partition_cols.append(column_name) - return partition_cols - - def _get_table_partitions( arrow_table: pa.Table, partition_spec: PartitionSpec, @@ -3725,13 +3698,29 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T """ import pyarrow as pa - partition_columns = get_partition_columns(spec=spec, schema=schema) - arrow_table = 
group_by_partition_scheme(arrow_table, partition_columns) - - reversing_sort_order_options = _get_partition_sort_order(partition_columns, reverse=True) - reversed_indices = pa.compute.sort_indices(arrow_table, **reversing_sort_order_options).to_pylist() - - slice_instructions: list[dict[str, Any]] = [] + partition_columns: List[Tuple[PartitionField, NestedField]] = [ + (partition_field, schema.find_field(partition_field.source_id)) for partition_field in spec.fields + ] + partition_values_table = pa.table({ + str(partition.field_id): partition.pyarrow_transform(field.field_type)(arrow_table[field.name]) + for partition, field in partition_columns + }) + + # Sort by partitions + sort_indices = pa.compute.sort_indices( + partition_values_table, + sort_keys=[(col, "ascending") for col in partition_values_table.column_names], + null_placement="at_end", + ).to_pylist() + arrow_table = arrow_table.take(sort_indices) + + # Get slice_instructions to group by partitions + reversed_indices = pa.compute.sort_indices( + partition_values_table, + sort_keys=[(col, "descending") for col in partition_values_table.column_names], + null_placement="at_start", + ).to_pylist() + slice_instructions: List[Dict[str, Any]] = [] last = len(reversed_indices) reversed_indices_size = len(reversed_indices) ptr = 0 @@ -3742,6 +3731,6 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T last = reversed_indices[ptr] ptr = ptr + group_size - table_partitions: list[TablePartition] = _get_table_partitions(arrow_table, spec, schema, slice_instructions) + table_partitions: List[TablePartition] = _get_table_partitions(arrow_table, spec, schema, slice_instructions) return table_partitions From afe83b177a74039218a18cd0e49a80ff0513de1c Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Mon, 6 May 2024 02:09:32 +0000 Subject: [PATCH 74/80] checkpoint --- pyiceberg/table/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 4040f9a616..16482108a6 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -3702,7 +3702,7 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T (partition_field, schema.find_field(partition_field.source_id)) for partition_field in spec.fields ] partition_values_table = pa.table({ - str(partition.field_id): partition.pyarrow_transform(field.field_type)(arrow_table[field.name]) + str(partition.field_id): partition.transform.pyarrow_transform(field.field_type)(arrow_table[field.name]) for partition, field in partition_columns }) From 00ca5f04b2281a82ae3fc869d5b8a2b3cdd0e2b2 Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Mon, 6 May 2024 02:17:31 +0000 Subject: [PATCH 75/80] fix --- pyiceberg/table/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 16482108a6..f160ab2441 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -3715,6 +3715,7 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T arrow_table = arrow_table.take(sort_indices) # Get slice_instructions to group by partitions + partition_values_table = partition_values_table.take(sort_indices) reversed_indices = pa.compute.sort_indices( partition_values_table, sort_keys=[(col, "descending") for col in partition_values_table.column_names], From 
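The rewritten _determine_partitions above leans on one trick worth spelling out: after sorting the rows by their partition values, the indices of a descending sort give, group by group, the offset where each run of equal partition values starts, so the table can be sliced into partitions without a Python-level group-by. A toy illustration of just that slicing logic (standalone, not pyiceberg code; like the implementation above, it relies on pyarrow's sort_indices producing a stable sort):

import pyarrow as pa
import pyarrow.compute as pc

tbl = pa.table({"part": [2, 1, 2, 1, 3]})
asc = pc.sort_indices(tbl, sort_keys=[("part", "ascending")], null_placement="at_end").to_pylist()
tbl = tbl.take(asc)  # rows now grouped: [1, 1, 2, 2, 3]
desc = pc.sort_indices(tbl, sort_keys=[("part", "descending")], null_placement="at_start").to_pylist()

slices, last, ptr = [], len(desc), 0
while ptr < len(desc):
    group_size = last - desc[ptr]           # desc[ptr] is the first row of the group that ends at `last`
    slices.append((desc[ptr], group_size))  # (offset, length) of one partition group
    last = desc[ptr]
    ptr += group_size

print([tbl.slice(offset, length)["part"].to_pylist() for offset, length in slices])
# expected: [[3], [2, 2], [1, 1]]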
511e98824aafc9c79d8c27ddefc3431797019ebf Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Mon, 6 May 2024 14:42:47 +0000 Subject: [PATCH 76/80] tests --- tests/conftest.py | 43 +++++++++++ .../test_writes/test_partitioned_writes.py | 76 +++++++++++++++---- tests/test_transforms.py | 45 +++-------- 3 files changed, 115 insertions(+), 49 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 01915b7d82..d3f23689a2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2158,3 +2158,46 @@ def arrow_table_with_only_nulls(pa_schema: "pa.Schema") -> "pa.Table": import pyarrow as pa return pa.Table.from_pylist([{}, {}], schema=pa_schema) + + +@pytest.fixture(scope="session") +def arrow_table_date_timestamps() -> "pa.Table": + """Pyarrow table with only date, timestamp and timestamptz values.""" + import pyarrow as pa + + return pa.Table.from_pydict( + { + "date": [date(2023, 12, 31), date(2024, 1, 1), date(2024, 1, 31), date(2024, 2, 1), date(2024, 2, 1), None], + "timestamp": [ + datetime(2023, 12, 31, 0, 0, 0), + datetime(2024, 1, 1, 0, 0, 0), + datetime(2024, 1, 31, 0, 0, 0), + datetime(2024, 2, 1, 0, 0, 0), + datetime(2024, 2, 1, 6, 0, 0), + None, + ], + "timestamptz": [ + datetime(2023, 12, 31, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 1, 31, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 2, 1, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 2, 1, 6, 0, 0, tzinfo=timezone.utc), + None, + ], + }, + schema=pa.schema([ + ("date", pa.date32()), + ("timestamp", pa.timestamp(unit="us")), + ("timestamptz", pa.timestamp(unit="us", tz="UTC")), + ]), + ) + + +@pytest.fixture(scope="session") +def arrow_table_date_timestamps_schema() -> Schema: + """Pyarrow table Schema with only date, timestamp and timestamptz values.""" + return Schema( + NestedField(field_id=1, name="date", field_type=DateType(), required=False), + NestedField(field_id=2, name="timestamp", field_type=TimestampType(), required=False), + NestedField(field_id=3, name="timestamptz", field_type=TimestamptzType(), required=False), + ) diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index 3a0e38d3f2..9df2ec218e 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -26,6 +26,7 @@ from pyiceberg.catalog import Catalog from pyiceberg.exceptions import NoSuchTableError from pyiceberg.partitioning import PartitionField, PartitionSpec +from pyiceberg.schema import Schema from pyiceberg.transforms import ( BucketTransform, DayTransform, @@ -355,18 +356,6 @@ def test_invalid_arguments(spark: SparkSession, session_catalog: Catalog) -> Non (PartitionSpec(PartitionField(source_id=5, field_id=1001, transform=TruncateTransform(2), name="long_trunc"))), (PartitionSpec(PartitionField(source_id=2, field_id=1001, transform=TruncateTransform(2), name="string_trunc"))), (PartitionSpec(PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(2), name="binary_trunc"))), - (PartitionSpec(PartitionField(source_id=8, field_id=1001, transform=YearTransform(), name="timestamp_year"))), - (PartitionSpec(PartitionField(source_id=9, field_id=1001, transform=YearTransform(), name="timestamptz_year"))), - (PartitionSpec(PartitionField(source_id=10, field_id=1001, transform=YearTransform(), name="date_year"))), - (PartitionSpec(PartitionField(source_id=8, field_id=1001, 
transform=MonthTransform(), name="timestamp_month"))), - (PartitionSpec(PartitionField(source_id=9, field_id=1001, transform=MonthTransform(), name="timestamptz_month"))), - (PartitionSpec(PartitionField(source_id=10, field_id=1001, transform=MonthTransform(), name="date_month"))), - (PartitionSpec(PartitionField(source_id=8, field_id=1001, transform=DayTransform(), name="timestamp_day"))), - (PartitionSpec(PartitionField(source_id=9, field_id=1001, transform=DayTransform(), name="timestamptz_day"))), - (PartitionSpec(PartitionField(source_id=10, field_id=1001, transform=DayTransform(), name="date_day"))), - (PartitionSpec(PartitionField(source_id=8, field_id=1001, transform=HourTransform(), name="timestamp_hour"))), - (PartitionSpec(PartitionField(source_id=9, field_id=1001, transform=HourTransform(), name="timestamptz_hour"))), - (PartitionSpec(PartitionField(source_id=10, field_id=1001, transform=HourTransform(), name="date_hour"))), ], ) def test_unsupported_transform( @@ -386,7 +375,10 @@ def test_unsupported_transform( properties={"format-version": "1"}, ) - with pytest.raises(ValueError, match="All transforms are not supported.*"): + with pytest.raises( + ValueError, + match="Not all partition types are supported for writes. Following partitions cannot be written using pyarrow: *", + ): tbl.append(arrow_table_with_null) @@ -411,7 +403,7 @@ def test_append_ymd_transform_partitioned( format_version: int, ) -> None: # Given - identifier = f"default.arrow_table_v{format_version}_with_ymd_transform_partitioned_on_col_{part_col}" + identifier = f"default.arrow_table_v{format_version}_with_{str(transform)}_partition_on_col_{part_col}" nested_field = TABLE_SCHEMA.find_field(part_col) partition_spec = PartitionSpec( PartitionField(source_id=nested_field.field_id, field_id=1001, transform=transform, name=part_col) @@ -442,3 +434,59 @@ def test_append_ymd_transform_partitioned( """ ) assert files_df.count() == expected_rows + + +@pytest.mark.integration +@pytest.mark.parametrize( + "transform,expected_partitions", + [ + pytest.param(YearTransform(), 3, id="year_transform"), + pytest.param(MonthTransform(), 4, id="month_transform"), + pytest.param(DayTransform(), 5, id="day_transform"), + pytest.param(HourTransform(), 6, id="hour_transform"), + ], +) +@pytest.mark.parametrize("format_version", [1, 2]) +def test_append_transform_partition_verify_partitions_count( + session_catalog: Catalog, + spark: SparkSession, + arrow_table_date_timestamps: pa.Table, + arrow_table_date_timestamps_schema: Schema, + transform: Transform[Any, Any], + expected_partitions: int, + format_version: int, +) -> None: + # Given + part_col = "timestamptz" + identifier = f"default.arrow_table_v{format_version}_with_{str(transform)}_transform_partitioned_on_col_{part_col}" + nested_field = arrow_table_date_timestamps_schema.find_field(part_col) + partition_spec = PartitionSpec( + PartitionField(source_id=nested_field.field_id, field_id=1001, transform=transform, name=part_col) + ) + + # When + tbl = _create_table( + session_catalog=session_catalog, + identifier=identifier, + properties={"format-version": str(format_version)}, + data=[arrow_table_date_timestamps], + partition_spec=partition_spec, + schema=arrow_table_date_timestamps_schema, + ) + + # Then + assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}" + df = spark.table(identifier) + assert df.count() == 6, f"Expected 6 total rows for {identifier}" + for col in arrow_table_date_timestamps.column_names: + assert 
df.where(f"{col} is not null").count() == 5, f"Expected 2 non-null rows for {col}" + assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null" + + assert tbl.inspect.partitions().num_rows == expected_partitions + files_df = spark.sql( + f""" + SELECT * + FROM {identifier}.files + """ + ) + assert files_df.count() == expected_partitions diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 3f1591c01c..15ef7d0ea2 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=eval-used,protected-access,redefined-outer-name -from datetime import date, datetime, timezone +from datetime import date from decimal import Decimal from typing import TYPE_CHECKING, Any, Callable, Optional from uuid import UUID @@ -1814,40 +1814,15 @@ def test_strict_binary(bound_reference_binary: BoundReference[str]) -> None: ) -@pytest.fixture(scope="session") -def arrow_table_date_timestamps() -> "pa.Table": - """Pyarrow table with only date, timestamp and timestamptz values.""" - import pyarrow as pa - - return pa.Table.from_pydict( - { - "date": [date(2023, 12, 31), date(2024, 1, 1), date(2024, 1, 31), date(2024, 2, 1), date(2024, 2, 1), None], - "timestamp": [ - datetime(2023, 12, 31, 0, 0, 0), - datetime(2024, 1, 1, 0, 0, 0), - datetime(2024, 1, 31, 0, 0, 0), - datetime(2024, 2, 1, 0, 0, 0), - datetime(2024, 2, 1, 6, 0, 0), - None, - ], - "timestamptz": [ - datetime(2023, 12, 31, 0, 0, 0, tzinfo=timezone.utc), - datetime(2024, 1, 1, 0, 0, 0, tzinfo=timezone.utc), - datetime(2024, 1, 31, 0, 0, 0, tzinfo=timezone.utc), - datetime(2024, 2, 1, 0, 0, 0, tzinfo=timezone.utc), - datetime(2024, 2, 1, 6, 0, 0, tzinfo=timezone.utc), - None, - ], - }, - schema=pa.schema([ - ("date", pa.date32()), - ("timestamp", pa.timestamp(unit="us")), - ("timestamptz", pa.timestamp(unit="us", tz="UTC")), - ]), - ) - - -@pytest.mark.parametrize('transform', [YearTransform(), MonthTransform(), DayTransform(), HourTransform()]) +@pytest.mark.parametrize( + 'transform', + [ + pytest.param(YearTransform(), id="year_transform"), + pytest.param(MonthTransform(), id="month_transform"), + pytest.param(DayTransform(), id="day_transform"), + pytest.param(HourTransform(), id="hour_transform"), + ], +) @pytest.mark.parametrize( "source_col, source_type", [("date", DateType()), ("timestamp", TimestampType()), ("timestamptz", TimestamptzType())] ) From 3b784abf2aeba1bad07581bbdd1bf5eba6efc5c3 Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Mon, 6 May 2024 16:20:19 +0000 Subject: [PATCH 77/80] more tests --- Makefile | 2 +- pyiceberg/partitioning.py | 2 +- .../test_writes/test_partitioned_writes.py | 87 +++++++++++++++++-- 3 files changed, 80 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 35051be9c1..de50374cfb 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ test-integration: sleep 10 docker compose -f dev/docker-compose-integration.yml cp ./dev/provision.py spark-iceberg:/opt/spark/provision.py docker compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py - poetry run pytest tests/ -v -m integration ${PYTEST_ARGS} + poetry run pytest tests/integration/test_writes/test_partitioned_writes.py -v -m integration ${PYTEST_ARGS} test-integration-rebuild: docker compose -f dev/docker-compose-integration.yml kill diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py index 
481207db7a..da52d5df8e 100644 --- a/pyiceberg/partitioning.py +++ b/pyiceberg/partitioning.py @@ -387,7 +387,7 @@ def partition(self) -> Record: # partition key transformed with iceberg interna for raw_partition_field_value in self.raw_partition_field_values: partition_fields = self.partition_spec.source_id_to_fields_map[raw_partition_field_value.field.source_id] if len(partition_fields) != 1: - raise ValueError("partition_fields must contain exactly one field.") + raise ValueError(f"Cannot have redundant partitions: {partition_fields}") partition_field = partition_fields[0] iceberg_typed_key_values[partition_field.name] = partition_record_value( partition_field=partition_field, diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index 9df2ec218e..2f2aabc1fc 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -17,7 +17,8 @@ # pylint:disable=redefined-outer-name -from typing import Any +from datetime import date +from typing import Any, Set import pyarrow as pa import pytest @@ -440,10 +441,12 @@ def test_append_ymd_transform_partitioned( @pytest.mark.parametrize( "transform,expected_partitions", [ - pytest.param(YearTransform(), 3, id="year_transform"), - pytest.param(MonthTransform(), 4, id="month_transform"), - pytest.param(DayTransform(), 5, id="day_transform"), - pytest.param(HourTransform(), 6, id="hour_transform"), + pytest.param(YearTransform(), {53, 54, None}, id="year_transform"), + pytest.param(MonthTransform(), {647, 648, 649, None}, id="month_transform"), + pytest.param( + DayTransform(), {date(2023, 12, 31), date(2024, 1, 1), date(2024, 1, 31), date(2024, 2, 1), None}, id="day_transform" + ), + pytest.param(HourTransform(), {473328, 473352, 474072, 474096, 474102, None}, id="hour_transform"), ], ) @pytest.mark.parametrize("format_version", [1, 2]) @@ -453,7 +456,7 @@ def test_append_transform_partition_verify_partitions_count( arrow_table_date_timestamps: pa.Table, arrow_table_date_timestamps_schema: Schema, transform: Transform[Any, Any], - expected_partitions: int, + expected_partitions: Set[Any], format_version: int, ) -> None: # Given @@ -461,7 +464,7 @@ def test_append_transform_partition_verify_partitions_count( identifier = f"default.arrow_table_v{format_version}_with_{str(transform)}_transform_partitioned_on_col_{part_col}" nested_field = arrow_table_date_timestamps_schema.find_field(part_col) partition_spec = PartitionSpec( - PartitionField(source_id=nested_field.field_id, field_id=1001, transform=transform, name=part_col) + PartitionField(source_id=nested_field.field_id, field_id=1001, transform=transform, name=part_col), ) # When @@ -482,11 +485,77 @@ def test_append_transform_partition_verify_partitions_count( assert df.where(f"{col} is not null").count() == 5, f"Expected 2 non-null rows for {col}" assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null" - assert tbl.inspect.partitions().num_rows == expected_partitions + partitions_table = tbl.inspect.partitions() + assert partitions_table.num_rows == len(expected_partitions) + assert {part[part_col] for part in partitions_table['partition'].to_pylist()} == expected_partitions + files_df = spark.sql( + f""" + SELECT * + FROM {identifier}.files + """ + ) + assert files_df.count() == len(expected_partitions) + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_append_multiple_partitions( + 
session_catalog: Catalog, + spark: SparkSession, + arrow_table_date_timestamps: pa.Table, + arrow_table_date_timestamps_schema: Schema, + format_version: int, +) -> None: + # Given + identifier = f"default.arrow_table_v{format_version}_with_multiple_partitions" + partition_spec = PartitionSpec( + PartitionField( + source_id=arrow_table_date_timestamps_schema.find_field("date").field_id, + field_id=1001, + transform=YearTransform(), + name="date_year", + ), + PartitionField( + source_id=arrow_table_date_timestamps_schema.find_field("timestamptz").field_id, + field_id=1000, + transform=HourTransform(), + name="timestamptz_hour", + ), + ) + + # When + tbl = _create_table( + session_catalog=session_catalog, + identifier=identifier, + properties={"format-version": str(format_version)}, + data=[arrow_table_date_timestamps], + partition_spec=partition_spec, + schema=arrow_table_date_timestamps_schema, + ) + + # Then + assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}" + df = spark.table(identifier) + assert df.count() == 6, f"Expected 6 total rows for {identifier}" + for col in arrow_table_date_timestamps.column_names: + assert df.where(f"{col} is not null").count() == 5, f"Expected 2 non-null rows for {col}" + assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null" + + partitions_table = tbl.inspect.partitions() + assert partitions_table.num_rows == 6 + partitions = partitions_table['partition'].to_pylist() + assert {(part["date_year"], part["timestamptz_hour"]) for part in partitions} == { + (53, 473328), + (54, 473352), + (54, 474072), + (54, 474096), + (54, 474102), + (None, None), + } files_df = spark.sql( f""" SELECT * FROM {identifier}.files """ ) - assert files_df.count() == expected_partitions + assert files_df.count() == 6 From 3711b1b5eb9e9a04b4d596d919920b729cdfbb9b Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Tue, 7 May 2024 13:24:08 +0000 Subject: [PATCH 78/80] adopt review feedback --- Makefile | 2 +- pyiceberg/transforms.py | 18 +++++++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index de50374cfb..35051be9c1 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ test-integration: sleep 10 docker compose -f dev/docker-compose-integration.yml cp ./dev/provision.py spark-iceberg:/opt/spark/provision.py docker compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py - poetry run pytest tests/integration/test_writes/test_partitioned_writes.py -v -m integration ${PYTEST_ARGS} + poetry run pytest tests/ -v -m integration ${PYTEST_ARGS} test-integration-rebuild: docker compose -f dev/docker-compose-integration.yml kill diff --git a/pyiceberg/transforms.py b/pyiceberg/transforms.py index c8af97c301..f4d0640d43 100644 --- a/pyiceberg/transforms.py +++ b/pyiceberg/transforms.py @@ -182,6 +182,9 @@ def __eq__(self, other: Any) -> bool: def supports_pyarrow_transform(self) -> bool: return False + @abstractmethod + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": ... + class BucketTransform(Transform[S, int]): """Base Transform class to transform a value into a bucket partition value. 
@@ -297,6 +300,9 @@ def __repr__(self) -> str: """Return the string representation of the BucketTransform class.""" return f"BucketTransform(num_buckets={self._num_buckets})" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + raise NotImplementedError() + class TimeResolution(IntEnum): YEAR = 6 @@ -356,9 +362,6 @@ def dedup_name(self) -> str: def preserves_order(self) -> bool: return True - @abstractmethod - def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": ... - @property def supports_pyarrow_transform(self) -> bool: return True @@ -810,6 +813,9 @@ def __repr__(self) -> str: """Return the string representation of the TruncateTransform class.""" return f"TruncateTransform(width={self._width})" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + raise NotImplementedError() + @singledispatch def _human_string(value: Any, _type: IcebergType) -> str: @@ -892,6 +898,9 @@ def __repr__(self) -> str: """Return the string representation of the UnknownTransform class.""" return f"UnknownTransform(transform={repr(self._transform)})" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + raise NotImplementedError() + class VoidTransform(Transform[S, None], Singleton): """A transform that always returns None.""" @@ -920,6 +929,9 @@ def __repr__(self) -> str: """Return the string representation of the VoidTransform class.""" return "VoidTransform()" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + raise NotImplementedError() + def _truncate_number( name: str, pred: BoundLiteralPredicate[L], transform: Callable[[Optional[L]], Optional[L]] From f16d77880b8d835caed554bb06d3bf605190ba2b Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Wed, 8 May 2024 22:12:57 +0000 Subject: [PATCH 79/80] comment --- pyiceberg/transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/transforms.py b/pyiceberg/transforms.py index f4d0640d43..38cc6221a2 100644 --- a/pyiceberg/transforms.py +++ b/pyiceberg/transforms.py @@ -592,7 +592,7 @@ def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Arr elif isinstance(source, TimestamptzType): epoch = datetime.EPOCH_TIMESTAMPTZ else: - raise ValueError(f"Cannot apply month transform for type: {source}") + raise ValueError(f"Cannot apply hour transform for type: {source}") return lambda v: pc.hours_between(pa.scalar(epoch), v) if v is not None else None From 9f0a92bfb45b4d4c5af4400a1d485826dc4449c5 Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Fri, 31 May 2024 18:52:23 +0000 Subject: [PATCH 80/80] rebase --- tests/integration/test_writes/test_partitioned_writes.py | 4 ++-- tests/test_transforms.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index 2f2aabc1fc..76d559ca57 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -487,7 +487,7 @@ def test_append_transform_partition_verify_partitions_count( partitions_table = tbl.inspect.partitions() assert partitions_table.num_rows == len(expected_partitions) - assert {part[part_col] for part in partitions_table['partition'].to_pylist()} == expected_partitions + assert {part[part_col] for part in 
partitions_table["partition"].to_pylist()} == expected_partitions files_df = spark.sql( f""" SELECT * @@ -543,7 +543,7 @@ def test_append_multiple_partitions( partitions_table = tbl.inspect.partitions() assert partitions_table.num_rows == 6 - partitions = partitions_table['partition'].to_pylist() + partitions = partitions_table["partition"].to_pylist() assert {(part["date_year"], part["timestamptz_hour"]) for part in partitions} == { (53, 473328), (54, 473352), diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 15ef7d0ea2..3a9ffd6009 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -1815,7 +1815,7 @@ def test_strict_binary(bound_reference_binary: BoundReference[str]) -> None: @pytest.mark.parametrize( - 'transform', + "transform", [ pytest.param(YearTransform(), id="year_transform"), pytest.param(MonthTransform(), id="month_transform"),