diff --git a/poetry.lock b/poetry.lock index d9c43b8b34..8d439a8929 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1887,7 +1887,7 @@ description = "Lightweight in-process concurrent programming" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "(extra == \"sql-postgres\" or extra == \"sql-sqlite\") and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\") and python_version < \"3.14\"" +markers = "(extra == \"sql-postgres\" or extra == \"sql-sqlite\") and python_version < \"3.14\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")" files = [ {file = "greenlet-3.1.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:0bbae94a29c9e5c7e4a2b7f0aae5c17e8e90acbfd3bf6270eeba60c39fce3563"}, {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fde093fb93f35ca72a556cf72c92ea3ebfda3d79fc35bb19fbe685853869a83"}, @@ -3184,7 +3184,7 @@ description = "Fundamental package for array computing in Python" optional = true python-versions = ">=3.10" groups = ["main"] -markers = "extra == \"pandas\" and python_version == \"3.11\" or extra == \"pandas\" and python_version >= \"3.12\" or extra == \"ray\" and python_version == \"3.11\" or extra == \"ray\" and python_version >= \"3.12\"" +markers = "extra == \"pandas\" and python_version >= \"3.12\" or extra == \"pandas\" and python_version == \"3.11\" or extra == \"ray\" and python_version >= \"3.12\" or extra == \"ray\" and python_version == \"3.11\"" files = [ {file = "numpy-2.2.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8146f3550d627252269ac42ae660281d673eb6f8b32f113538e0cc2a9aed42b9"}, {file = "numpy-2.2.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e642d86b8f956098b564a45e6f6ce68a22c2c97a04f5acd3f221f57b8cb850ae"}, @@ -3381,8 +3381,8 @@ files = [ [package.dependencies] numpy = [ {version = ">=1.22.4", markers = "python_version < \"3.11\""}, - {version = ">=1.23.2", markers = "python_version == \"3.11\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -3676,22 +3676,22 @@ testing = ["google-api-core (>=1.31.5)"] [[package]] name = "protobuf" -version = "6.30.1" +version = "6.30.2" description = "" optional = true python-versions = ">=3.9" groups = ["main"] markers = "extra == \"gcsfs\"" files = [ - {file = "protobuf-6.30.1-cp310-abi3-win32.whl", hash = "sha256:ba0706f948d0195f5cac504da156d88174e03218d9364ab40d903788c1903d7e"}, - {file = "protobuf-6.30.1-cp310-abi3-win_amd64.whl", hash = "sha256:ed484f9ddd47f0f1bf0648806cccdb4fe2fb6b19820f9b79a5adf5dcfd1b8c5f"}, - {file = "protobuf-6.30.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:aa4f7dfaed0d840b03d08d14bfdb41348feaee06a828a8c455698234135b4075"}, - {file = "protobuf-6.30.1-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:47cd320b7db63e8c9ac35f5596ea1c1e61491d8a8eb6d8b45edc44760b53a4f6"}, - {file = "protobuf-6.30.1-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:e3083660225fa94748ac2e407f09a899e6a28bf9c0e70c75def8d15706bf85fc"}, - {file = "protobuf-6.30.1-cp39-cp39-win32.whl", hash = "sha256:554d7e61cce2aa4c63ca27328f757a9f3867bce8ec213bf09096a8d16bcdcb6a"}, - {file = "protobuf-6.30.1-cp39-cp39-win_amd64.whl", hash = "sha256:b510f55ce60f84dc7febc619b47215b900466e3555ab8cb1ba42deb4496d6cc0"}, - {file = "protobuf-6.30.1-py3-none-any.whl", hash = "sha256:3c25e51e1359f1f5fa3b298faa6016e650d148f214db2e47671131b9063c53be"}, - {file = "protobuf-6.30.1.tar.gz", hash = "sha256:535fb4e44d0236893d5cf1263a0f706f1160b689a7ab962e9da8a9ce4050b780"}, + {file = "protobuf-6.30.2-cp310-abi3-win32.whl", hash = "sha256:b12ef7df7b9329886e66404bef5e9ce6a26b54069d7f7436a0853ccdeb91c103"}, + {file = "protobuf-6.30.2-cp310-abi3-win_amd64.whl", hash = "sha256:7653c99774f73fe6b9301b87da52af0e69783a2e371e8b599b3e9cb4da4b12b9"}, + {file = "protobuf-6.30.2-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:0eb523c550a66a09a0c20f86dd554afbf4d32b02af34ae53d93268c1f73bc65b"}, + {file = "protobuf-6.30.2-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:50f32cc9fd9cb09c783ebc275611b4f19dfdfb68d1ee55d2f0c7fa040df96815"}, + {file = "protobuf-6.30.2-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:4f6c687ae8efae6cf6093389a596548214467778146b7245e886f35e1485315d"}, + {file = "protobuf-6.30.2-cp39-cp39-win32.whl", hash = "sha256:524afedc03b31b15586ca7f64d877a98b184f007180ce25183d1a5cb230ee72b"}, + {file = "protobuf-6.30.2-cp39-cp39-win_amd64.whl", hash = "sha256:acec579c39c88bd8fbbacab1b8052c793efe83a0a5bd99db4a31423a25c0a0e2"}, + {file = "protobuf-6.30.2-py3-none-any.whl", hash = "sha256:ae86b030e69a98e08c77beab574cbcb9fff6d031d57209f574a5aea1445f4b51"}, + {file = "protobuf-6.30.2.tar.gz", hash = "sha256:35c859ae076d8c56054c25b59e5e59638d86545ed6e2b6efac6be0b6ea3ba048"}, ] [[package]] @@ -4064,21 +4064,26 @@ windows-terminal = ["colorama (>=0.4.6)"] [[package]] name = "pyiceberg-core" -version = "0.4.0" +version = "0.4.0.dev20250326000154" description = "" optional = true -python-versions = "*" +python-versions = "~=3.9" groups = ["main"] markers = "extra == \"pyiceberg-core\"" files = [ - {file = "pyiceberg_core-0.4.0-cp39-abi3-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:5aec569271c96e18428d542f9b7007117a7232c06017f95cb239d42e952ad3b4"}, - {file = "pyiceberg_core-0.4.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e74773e58efa4df83aba6f6265cdd41e446fa66fa4e343ca86395fed9f209ae"}, - {file = "pyiceberg_core-0.4.0-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7675d21a54bf3753c740d8df78ad7efe33f438096844e479d4f3493f84830925"}, - {file = "pyiceberg_core-0.4.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7058ad935a40b1838e4cdc5febd768878c1a51f83dca005d5a52a7fa280a2489"}, - {file = "pyiceberg_core-0.4.0-cp39-abi3-win_amd64.whl", hash = "sha256:a83eb4c2307ae3dd321a9360828fb043a4add2cc9797bef0bafa20894488fb07"}, - {file = "pyiceberg_core-0.4.0.tar.gz", hash = "sha256:d2e6138707868477b806ed354aee9c476e437913a331cb9ad9ad46b4054cd11f"}, + {file = "pyiceberg_core-0.4.0.dev20250326000154-cp39-abi3-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0fb7b1b12a07170919c5f7b911eddd64eb3dd03340edbd06198c5b490ddca8cd"}, + {file = "pyiceberg_core-0.4.0.dev20250326000154-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:de884887a847f624878895ca8ca84a53683f50057954a3a8f3be5645443e6d65"}, + {file = "pyiceberg_core-0.4.0.dev20250326000154-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3133db2113d09c8dd256f7437e6687ece46c71efd53f8d3762556711e3882b74"}, + {file = "pyiceberg_core-0.4.0.dev20250326000154-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:49c8a4c68b0097f5f4eeb401f449fb77c9fcecf93544e49d0b2a61e24b9c65b4"}, + {file = "pyiceberg_core-0.4.0.dev20250326000154-cp39-abi3-win_amd64.whl", hash = "sha256:5a2159526bc56aa3e5e44bc76d979a5fe07d80c7d7d984b360f8a4f2c5ab7aec"}, + {file = "pyiceberg_core-0.4.0.dev20250326000154.tar.gz", hash = "sha256:5d4c04d11a99385c92f2572a7e9980df8e6109dde59f8b0ff016dca930e70bde"}, ] +[package.source] +type = "legacy" +url = "https://test.pypi.org/simple" +reference = "testpypi" + [[package]] name = "pyjwt" version = "2.10.1" @@ -5339,7 +5344,7 @@ description = "A lil' TOML parser" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version < \"3.11\"" +markers = "python_full_version <= \"3.11.0a6\"" files = [ {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, {file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"}, @@ -5916,4 +5921,4 @@ zstandard = ["zstandard"] [metadata] lock-version = "2.1" python-versions = "^3.9.2, !=3.9.7" -content-hash = "1772c4ef73bf4d04da928ecd2185db3716191f42e20d72fec2b44ba0a633c607" +content-hash = "358dc5aa888edf4a49a296b79c42d27fadc22b7d5e5e885a04920601be2f25b9" diff --git a/pyiceberg/transforms.py b/pyiceberg/transforms.py index fd90859203..ac2b4a40ee 100644 --- a/pyiceberg/transforms.py +++ b/pyiceberg/transforms.py @@ -134,6 +134,36 @@ def parse_transform(v: Any) -> Any: return v +def _pyiceberg_transform_wrapper( + transform_func: Callable[["ArrayLike", Any], "ArrayLike"], + *args: Any, + expected_type: Optional["pa.DataType"] = None, +) -> Callable[["ArrayLike"], "ArrayLike"]: + try: + import pyarrow as pa + except ModuleNotFoundError as e: + raise ModuleNotFoundError("For bucket/truncate transforms, PyArrow needs to be installed") from e + + def _transform(array: "ArrayLike") -> "ArrayLike": + def _cast_if_needed(arr: "ArrayLike") -> "ArrayLike": + if expected_type is not None: + return arr.cast(expected_type) + else: + return arr + + if isinstance(array, pa.Array): + return _cast_if_needed(transform_func(array, *args)) + elif isinstance(array, pa.ChunkedArray): + result_chunks = [] + for arr in array.iterchunks(): + result_chunks.append(_cast_if_needed(transform_func(arr, *args))) + return pa.chunked_array(result_chunks) + else: + raise ValueError(f"PyArrow array can only be of type pa.Array or pa.ChunkedArray, but found {type(array)}") + + return _transform + + class Transform(IcebergRootModel[str], ABC, Generic[S, T]): """Transform base class for concrete transforms. @@ -198,27 +228,6 @@ def supports_pyarrow_transform(self) -> bool: @abstractmethod def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": ... - def _pyiceberg_transform_wrapper( - self, transform_func: Callable[["ArrayLike", Any], "ArrayLike"], *args: Any - ) -> Callable[["ArrayLike"], "ArrayLike"]: - try: - import pyarrow as pa - except ModuleNotFoundError as e: - raise ModuleNotFoundError("For bucket/truncate transforms, PyArrow needs to be installed") from e - - def _transform(array: "ArrayLike") -> "ArrayLike": - if isinstance(array, pa.Array): - return transform_func(array, *args) - elif isinstance(array, pa.ChunkedArray): - result_chunks = [] - for arr in array.iterchunks(): - result_chunks.append(transform_func(arr, *args)) - return pa.chunked_array(result_chunks) - else: - raise ValueError(f"PyArrow array can only be of type pa.Array or pa.ChunkedArray, but found {type(array)}") - - return _transform - class BucketTransform(Transform[S, int]): """Base Transform class to transform a value into a bucket partition value. @@ -375,7 +384,7 @@ def __repr__(self) -> str: def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": from pyiceberg_core import transform as pyiceberg_core_transform - return self._pyiceberg_transform_wrapper(pyiceberg_core_transform.bucket, self._num_buckets) + return _pyiceberg_transform_wrapper(pyiceberg_core_transform.bucket, self._num_buckets) @property def supports_pyarrow_transform(self) -> bool: @@ -501,22 +510,9 @@ def __repr__(self) -> str: def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": import pyarrow as pa - import pyarrow.compute as pc - - if isinstance(source, DateType): - epoch = pa.scalar(datetime.EPOCH_DATE) - elif isinstance(source, TimestampType): - epoch = pa.scalar(datetime.EPOCH_TIMESTAMP) - elif isinstance(source, TimestamptzType): - epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ) - elif isinstance(source, TimestampNanoType): - epoch = pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns")) - elif isinstance(source, TimestamptzNanoType): - epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns")) - else: - raise ValueError(f"Cannot apply year transform for type: {source}") + from pyiceberg_core import transform as pyiceberg_core_transform - return lambda v: pc.years_between(epoch, v) if v is not None else None + return _pyiceberg_transform_wrapper(pyiceberg_core_transform.year, expected_type=pa.int32()) class MonthTransform(TimeTransform[S]): @@ -575,28 +571,9 @@ def __repr__(self) -> str: def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": import pyarrow as pa - import pyarrow.compute as pc - - if isinstance(source, DateType): - epoch = pa.scalar(datetime.EPOCH_DATE) - elif isinstance(source, TimestampType): - epoch = pa.scalar(datetime.EPOCH_TIMESTAMP) - elif isinstance(source, TimestamptzType): - epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ) - elif isinstance(source, TimestampNanoType): - epoch = pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns")) - elif isinstance(source, TimestamptzNanoType): - epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns")) - else: - raise ValueError(f"Cannot apply month transform for type: {source}") - - def month_func(v: pa.Array) -> pa.Array: - return pc.add( - pc.multiply(pc.years_between(epoch, v), pa.scalar(12)), - pc.add(pc.month(v), pa.scalar(-1)), - ) + from pyiceberg_core import transform as pyiceberg_core_transform - return lambda v: month_func(v) if v is not None else None + return _pyiceberg_transform_wrapper(pyiceberg_core_transform.month, expected_type=pa.int32()) class DayTransform(TimeTransform[S]): @@ -663,22 +640,9 @@ def __repr__(self) -> str: def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": import pyarrow as pa - import pyarrow.compute as pc - - if isinstance(source, DateType): - epoch = pa.scalar(datetime.EPOCH_DATE) - elif isinstance(source, TimestampType): - epoch = pa.scalar(datetime.EPOCH_TIMESTAMP) - elif isinstance(source, TimestamptzType): - epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ) - elif isinstance(source, TimestampNanoType): - epoch = pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns")) - elif isinstance(source, TimestamptzNanoType): - epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns")) - else: - raise ValueError(f"Cannot apply day transform for type: {source}") + from pyiceberg_core import transform as pyiceberg_core_transform - return lambda v: pc.days_between(epoch, v) if v is not None else None + return _pyiceberg_transform_wrapper(pyiceberg_core_transform.day, expected_type=pa.int32()) class HourTransform(TimeTransform[S]): @@ -728,21 +692,9 @@ def __repr__(self) -> str: return "HourTransform()" def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": - import pyarrow as pa - import pyarrow.compute as pc - - if isinstance(source, TimestampType): - epoch = pa.scalar(datetime.EPOCH_TIMESTAMP) - elif isinstance(source, TimestamptzType): - epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ) - elif isinstance(source, TimestampNanoType): - epoch = pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns")) - elif isinstance(source, TimestamptzNanoType): - epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns")) - else: - raise ValueError(f"Cannot apply hour transform for type: {source}") + from pyiceberg_core import transform as pyiceberg_core_transform - return lambda v: pc.hours_between(epoch, v) if v is not None else None + return _pyiceberg_transform_wrapper(pyiceberg_core_transform.hour) def _base64encode(buffer: bytes) -> str: @@ -965,7 +917,7 @@ def __repr__(self) -> str: def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": from pyiceberg_core import transform as pyiceberg_core_transform - return self._pyiceberg_transform_wrapper(pyiceberg_core_transform.truncate, self._width) + return _pyiceberg_transform_wrapper(pyiceberg_core_transform.truncate, self._width) @property def supports_pyarrow_transform(self) -> bool: diff --git a/pyproject.toml b/pyproject.toml index d167686be7..49b63059be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,7 +80,7 @@ psycopg2-binary = { version = ">=2.9.6", optional = true } sqlalchemy = { version = "^2.0.18", optional = true } getdaft = { version = ">=0.2.12", optional = true } cachetools = "^5.5.0" -pyiceberg-core = { version = "^0.4.0", optional = true } +pyiceberg-core = { version = "0.4.0.dev20250326000154", source="testpypi", optional = true } polars = { version = "^1.21.0", optional = true } thrift-sasl = { version = ">=0.4.3", optional = true } @@ -115,6 +115,14 @@ mkdocs-material = "9.6.9" mkdocs-material-extensions = "1.3.1" mkdocs-section-index = "0.3.9" +[[tool.poetry.source]] +name = "pypi" +priority = "primary" + +[[tool.poetry.source]] +name = "testpypi" +url = "https://test.pypi.org/simple/" + [[tool.mypy.overrides]] module = "pytest_mock.*" ignore_missing_imports = true diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py index edda6d3aa8..9c525c8604 100644 --- a/tests/table/test_partitioning.py +++ b/tests/table/test_partitioning.py @@ -186,8 +186,8 @@ def test_partition_type(table_schema_simple: Schema) -> None: (DecimalType(5, 9), Decimal(19.25)), (DateType(), datetime.date(1925, 5, 22)), (TimeType(), datetime.time(19, 25, 00)), - (TimestampType(), datetime.datetime(19, 5, 1, 22, 1, 1)), - (TimestamptzType(), datetime.datetime(19, 5, 1, 22, 1, 1, tzinfo=datetime.timezone.utc)), + (TimestampType(), datetime.datetime(2022, 5, 1, 22, 1, 1)), + (TimestamptzType(), datetime.datetime(2022, 5, 1, 22, 1, 1, tzinfo=datetime.timezone.utc)), (StringType(), "abc"), (UUIDType(), UUID("12345678-1234-5678-1234-567812345678").bytes), (FixedType(5), 'b"\x8e\xd1\x87\x01"'),