Skip to content

[backport 2.3.x] BUG: Fix copy semantics in __array__ (#60046) #60189

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ enhancement1
Other enhancements
^^^^^^^^^^^^^^^^^^

- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called
when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been
updated to work correctly with NumPy >= 2 (:issue:`57739`)
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
-

Expand Down
11 changes: 10 additions & 1 deletion pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -662,7 +662,16 @@ def __array__(
self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
"""Correctly construct numpy arrays when passed to `np.asarray()`."""
return self.to_numpy(dtype=dtype)
if copy is False:
# TODO: By using `zero_copy_only` it may be possible to implement this
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)
elif copy is None:
# `to_numpy(copy=False)` has the meaning of NumPy `copy=None`.
copy = False

return self.to_numpy(dtype=dtype, copy=copy)

def __invert__(self) -> Self:
# This is a bit wise op for integer types
Expand Down
33 changes: 25 additions & 8 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,11 +577,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
raise ValueError("Cannot convert float NaN to integer")

elif len(self.codes) == 0 or len(self.categories) == 0:
result = np.array(
self,
dtype=dtype,
copy=copy,
)
# For NumPy 1.x compatibility we cannot use copy=None. And
# `copy=False` has the meaning of `copy=None` here:
if not copy:
result = np.asarray(self, dtype=dtype)
else:
result = np.array(self, dtype=dtype)

else:
# GH8628 (PERF): astype category codes instead of astyping array
Expand Down Expand Up @@ -1642,6 +1643,17 @@ def __array__(
"""
The numpy array interface.

Users should not call this directly. Rather, it is invoked by
:func:`numpy.array` and :func:`numpy.asarray`.

Parameters
----------
dtype : np.dtype or None
Specifies the the dtype for the array.

copy : bool or None, optional
See :func:`numpy.asarray`.

Returns
-------
numpy.array
Expand All @@ -1659,13 +1671,18 @@ def __array__(
>>> np.asarray(cat)
array(['a', 'b'], dtype=object)
"""
if copy is False:
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)

ret = take_nd(self.categories._values, self._codes)
if dtype and np.dtype(dtype) != self.categories.dtype:
return np.asarray(ret, dtype)
# When we're a Categorical[ExtensionArray], like Interval,
# we need to ensure __array__ gets all the way to an
# ndarray.
return np.asarray(ret)

# `take_nd` should already make a copy, so don't force again.
return np.asarray(ret, dtype=dtype)

def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
# for binary ops, use our custom dunder methods
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,14 @@ def __array__(
) -> np.ndarray:
# used for Timedelta/DatetimeArray, overwritten by PeriodArray
if is_object_dtype(dtype):
if copy is False:
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)
return np.array(list(self), dtype=object)

if copy is True:
return np.array(self._ndarray, dtype=dtype)
return self._ndarray

@overload
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -1574,6 +1574,11 @@ def __array__(
Return the IntervalArray's data as a numpy array of Interval
objects (with dtype='object')
"""
if copy is False:
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)

left = self._left
right = self._right
mask = self.isna()
Expand Down
12 changes: 11 additions & 1 deletion pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,7 +600,17 @@ def __array__(
the array interface, return my values
We return an object array here to preserve our scalar values
"""
return self.to_numpy(dtype=dtype)
if copy is False:
if not self._hasna:
# special case, here we can simply return the underlying data
return np.array(self._data, dtype=dtype, copy=copy)
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)

if copy is None:
copy = False # The NumPy copy=False meaning is different here.
return self.to_numpy(dtype=dtype, copy=copy)

_HANDLED_TYPES: tuple[type, ...]

Expand Down
3 changes: 3 additions & 0 deletions pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,9 @@ def dtype(self) -> NumpyEADtype:
def __array__(
self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
if copy is not None:
# Note: branch avoids `copy=None` for NumPy 1.x support
return np.array(self._ndarray, dtype=dtype, copy=copy)
return np.asarray(self._ndarray, dtype=dtype)

def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
Expand Down
15 changes: 13 additions & 2 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,8 +407,19 @@ def __array__(
self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
if dtype == "i8":
return self.asi8
elif dtype == bool:
# For NumPy 1.x compatibility we cannot use copy=None. And
# `copy=False` has the meaning of `copy=None` here:
if not copy:
return np.asarray(self.asi8, dtype=dtype)
else:
return np.array(self.asi8, dtype=dtype)

if copy is False:
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)

if dtype == bool:
return ~self._isnan

# This will raise TypeError for non-object dtypes
Expand Down
15 changes: 12 additions & 3 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,11 +554,20 @@ def from_spmatrix(cls, data: spmatrix) -> Self:
def __array__(
self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
fill_value = self.fill_value

if self.sp_index.ngaps == 0:
# Compat for na dtype and int values.
return self.sp_values
if copy is True:
return np.array(self.sp_values)
else:
return self.sp_values

if copy is False:
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)

fill_value = self.fill_value

if dtype is None:
# Can NumPy represent this type?
# If not, `np.result_type` will raise. We catch that
Expand Down
10 changes: 8 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2150,9 +2150,15 @@ def __array__(
self, dtype: npt.DTypeLike | None = None, copy: bool_t | None = None
) -> np.ndarray:
values = self._values
arr = np.asarray(values, dtype=dtype)
if copy is None:
# Note: branch avoids `copy=None` for NumPy 1.x support
arr = np.asarray(values, dtype=dtype)
else:
arr = np.array(values, dtype=dtype, copy=copy)

if (
astype_is_view(values.dtype, arr.dtype)
copy is not True
and astype_is_view(values.dtype, arr.dtype)
and using_copy_on_write()
and self._mgr.is_single_block
):
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -917,7 +917,11 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray:
"""
The array interface, return my values.
"""
return np.asarray(self._data, dtype=dtype)
if copy is None:
# Note, that the if branch exists for NumPy 1.x support
return np.asarray(self._data, dtype=dtype)

return np.array(self._data, dtype=dtype, copy=copy)

def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs):
if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs):
Expand Down
9 changes: 9 additions & 0 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1311,6 +1311,15 @@ def copy( # type: ignore[override]

def __array__(self, dtype=None, copy=None) -> np.ndarray:
"""the array interface, return my values"""
if copy is False:
# self.values is always a newly construct array, so raise.
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)
if copy is True:
# explicit np.array call to ensure a copy is made and unique objects
# are returned, because self.values is cached
return np.array(self.values, dtype=dtype)
return self.values

def view(self, cls=None) -> Self:
Expand Down
12 changes: 6 additions & 6 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,12 +305,12 @@ def ndarray_to_mgr(

elif isinstance(values, (np.ndarray, ExtensionArray)):
# drop subclass info
_copy = (
copy_on_sanitize
if (dtype is None or astype_is_view(values.dtype, dtype))
else False
)
values = np.array(values, copy=_copy)
if copy_on_sanitize and (dtype is None or astype_is_view(values.dtype, dtype)):
# only force a copy now if copy=True was requested
# and a subsequent `astype` will not already result in a copy
values = np.array(values, copy=True, order="F")
else:
values = np.asarray(values)
values = _ensure_2d(values)

else:
Expand Down
15 changes: 12 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -991,7 +991,7 @@ def __array__(
the dtype is inferred from the data.

copy : bool or None, optional
Unused.
See :func:`numpy.asarray`.

Returns
-------
Expand Down Expand Up @@ -1028,8 +1028,17 @@ def __array__(
dtype='datetime64[ns]')
"""
values = self._values
arr = np.asarray(values, dtype=dtype)
if using_copy_on_write() and astype_is_view(values.dtype, arr.dtype):
if copy is None:
# Note: branch avoids `copy=None` for NumPy 1.x support
arr = np.asarray(values, dtype=dtype)
else:
arr = np.array(values, dtype=dtype, copy=copy)

if copy is True:
return arr
if using_copy_on_write() and (
copy is False or astype_is_view(values.dtype, arr.dtype)
):
arr = arr.view()
arr.flags.writeable = False
return arr
Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

from pandas._libs.sparse import IntIndex
from pandas.compat.numpy import np_version_gt2

import pandas as pd
from pandas import (
Expand Down Expand Up @@ -478,3 +479,33 @@ def test_zero_sparse_column():

expected = pd.DataFrame({"A": SparseArray([0, 0]), "B": [1, 3]}, index=[0, 2])
tm.assert_frame_equal(result, expected)


def test_array_interface(arr_data, arr):
# https://github.com/pandas-dev/pandas/pull/60046
result = np.asarray(arr)
tm.assert_numpy_array_equal(result, arr_data)

# it always gives a copy by default
result_copy1 = np.asarray(arr)
result_copy2 = np.asarray(arr)
assert not np.may_share_memory(result_copy1, result_copy2)

# or with explicit copy=True
result_copy1 = np.array(arr, copy=True)
result_copy2 = np.array(arr, copy=True)
assert not np.may_share_memory(result_copy1, result_copy2)

if not np_version_gt2:
# copy=False semantics are only supported in NumPy>=2.
return

# for sparse arrays, copy=False is never allowed
with pytest.raises(ValueError, match="Unable to avoid copy while creating"):
np.array(arr, copy=False)

# except when there are actually no sparse filled values
arr2 = SparseArray(np.array([1, 2, 3]))
result_nocopy1 = np.array(arr2, copy=False)
result_nocopy2 = np.array(arr2, copy=False)
assert np.may_share_memory(result_nocopy1, result_nocopy2)
8 changes: 8 additions & 0 deletions pandas/tests/arrays/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -1148,9 +1148,17 @@ def test_array_interface(self, arr1d):
result = np.asarray(arr, dtype=object)
tm.assert_numpy_array_equal(result, expected)

# to int64 gives the underlying representation
result = np.asarray(arr, dtype="int64")
tm.assert_numpy_array_equal(result, arr.asi8)

result2 = np.asarray(arr, dtype="int64")
assert np.may_share_memory(result, result2)

result_copy1 = np.array(arr, dtype="int64", copy=True)
result_copy2 = np.array(arr, dtype="int64", copy=True)
assert not np.may_share_memory(result_copy1, result_copy2)

# to other dtypes
msg = r"float\(\) argument must be a string or a( real)? number, not 'Period'"
with pytest.raises(TypeError, match=msg):
Expand Down
Loading
Loading