pandas-dev · jorisvandenbossche · Nov 5, 2024 · Nov 4, 2024
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
@@ -32,6 +32,9 @@ enhancement1
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
 
+- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called
+  when using ``np.array()`` or ``np.asarray()`` on pandas objects) have been
+  updated to work correctly with NumPy >= 2 (:issue:`57739`)
 - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
 -
 

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -662,7 +662,16 @@ def __array__(
         self, dtype: NpDtype | None = None, copy: bool | None = None
     ) -> np.ndarray:
         """Correctly construct numpy arrays when passed to `np.asarray()`."""
-        return self.to_numpy(dtype=dtype)
+        if copy is False:
+            # TODO: By using `zero_copy_only` it may be possible to implement this
+            raise ValueError(
+                "Unable to avoid copy while creating an array as requested."
+            )
+        elif copy is None:
+            # `to_numpy(copy=False)` has the meaning of NumPy `copy=None`.
+            copy = False
+
+        return self.to_numpy(dtype=dtype, copy=copy)
 
     def __invert__(self) -> Self:
         # This is a bit wise op for integer types

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -577,11 +577,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
             raise ValueError("Cannot convert float NaN to integer")
 
         elif len(self.codes) == 0 or len(self.categories) == 0:
-            result = np.array(
-                self,
-                dtype=dtype,
-                copy=copy,
-            )
+            # For NumPy 1.x compatibility we cannot use copy=None.  And
+            # `copy=False` has the meaning of `copy=None` here:
+            if not copy:
+                result = np.asarray(self, dtype=dtype)
+            else:
+                result = np.array(self, dtype=dtype)
 
         else:
             # GH8628 (PERF): astype category codes instead of astyping array
@@ -1642,6 +1643,17 @@ def __array__(
         """
         The numpy array interface.
 
+        Users should not call this directly. Rather, it is invoked by
+        :func:`numpy.array` and :func:`numpy.asarray`.
+
+        Parameters
+        ----------
+        dtype : np.dtype or None
+            Specifies the dtype for the array.
+
+        copy : bool or None, optional
+            See :func:`numpy.asarray`.
+
         Returns
         -------
         numpy.array
@@ -1659,13 +1671,18 @@ def __array__(
         >>> np.asarray(cat)
         array(['a', 'b'], dtype=object)
         """
+        if copy is False:
+            raise ValueError(
+                "Unable to avoid copy while creating an array as requested."
+            )
+
         ret = take_nd(self.categories._values, self._codes)
-        if dtype and np.dtype(dtype) != self.categories.dtype:
-            return np.asarray(ret, dtype)
         # When we're a Categorical[ExtensionArray], like Interval,
         # we need to ensure __array__ gets all the way to an
         # ndarray.
-        return np.asarray(ret)
+
+        # `take_nd` should already make a copy, so don't force again.
+        return np.asarray(ret, dtype=dtype)
 
     def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
         # for binary ops, use our custom dunder methods

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -358,7 +358,14 @@ def __array__(
     ) -> np.ndarray:
         # used for Timedelta/DatetimeArray, overwritten by PeriodArray
         if is_object_dtype(dtype):
+            if copy is False:
+                raise ValueError(
+                    "Unable to avoid copy while creating an array as requested."
+                )
             return np.array(list(self), dtype=object)
+
+        if copy is True:
+            return np.array(self._ndarray, dtype=dtype)
         return self._ndarray
 
     @overload

diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
@@ -1574,6 +1574,11 @@ def __array__(
         Return the IntervalArray's data as a numpy array of Interval
         objects (with dtype='object')
         """
+        if copy is False:
+            raise ValueError(
+                "Unable to avoid copy while creating an array as requested."
+            )
+
         left = self._left
         right = self._right
         mask = self.isna()

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -600,7 +600,17 @@ def __array__(
         the array interface, return my values
         We return an object array here to preserve our scalar values
         """
-        return self.to_numpy(dtype=dtype)
+        if copy is False:
+            if not self._hasna:
+                # special case, here we can simply return the underlying data
+                return np.array(self._data, dtype=dtype, copy=copy)
+            raise ValueError(
+                "Unable to avoid copy while creating an array as requested."
+            )
+
+        if copy is None:
+            copy = False  # The NumPy copy=False meaning is different here.
+        return self.to_numpy(dtype=dtype, copy=copy)
 
     _HANDLED_TYPES: tuple[type, ...]
 

diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
@@ -150,6 +150,9 @@ def dtype(self) -> NumpyEADtype:
     def __array__(
         self, dtype: NpDtype | None = None, copy: bool | None = None
     ) -> np.ndarray:
+        if copy is not None:
+            # Note: branch avoids `copy=None` for NumPy 1.x support
+            return np.array(self._ndarray, dtype=dtype, copy=copy)
         return np.asarray(self._ndarray, dtype=dtype)
 
     def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):

diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
@@ -407,8 +407,19 @@ def __array__(
         self, dtype: NpDtype | None = None, copy: bool | None = None
     ) -> np.ndarray:
         if dtype == "i8":
-            return self.asi8
-        elif dtype == bool:
+            # For NumPy 1.x compatibility we cannot use copy=None.  And
+            # `copy=False` has the meaning of `copy=None` here:
+            if not copy:
+                return np.asarray(self.asi8, dtype=dtype)
+            else:
+                return np.array(self.asi8, dtype=dtype)
+
+        if copy is False:
+            raise ValueError(
+                "Unable to avoid copy while creating an array as requested."
+            )
+
+        if dtype == bool:
             return ~self._isnan
 
         # This will raise TypeError for non-object dtypes

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -554,11 +554,20 @@ def from_spmatrix(cls, data: spmatrix) -> Self:
     def __array__(
         self, dtype: NpDtype | None = None, copy: bool | None = None
     ) -> np.ndarray:
-        fill_value = self.fill_value
-
         if self.sp_index.ngaps == 0:
             # Compat for na dtype and int values.
-            return self.sp_values
+            if copy is True:
+                return np.array(self.sp_values)
+            else:
+                return self.sp_values
+
+        if copy is False:
+            raise ValueError(
+                "Unable to avoid copy while creating an array as requested."
+            )
+
+        fill_value = self.fill_value
+
         if dtype is None:
             # Can NumPy represent this type?
             # If not, `np.result_type` will raise. We catch that

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -2150,9 +2150,15 @@ def __array__(
         self, dtype: npt.DTypeLike | None = None, copy: bool_t | None = None
     ) -> np.ndarray:
         values = self._values
-        arr = np.asarray(values, dtype=dtype)
+        if copy is None:
+            # Note: branch avoids `copy=None` for NumPy 1.x support
+            arr = np.asarray(values, dtype=dtype)
+        else:
+            arr = np.array(values, dtype=dtype, copy=copy)
+
         if (
-            astype_is_view(values.dtype, arr.dtype)
+            copy is not True
+            and astype_is_view(values.dtype, arr.dtype)
             and using_copy_on_write()
             and self._mgr.is_single_block
         ):

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -917,7 +917,11 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray:
         """
         The array interface, return my values.
         """
-        return np.asarray(self._data, dtype=dtype)
+        if copy is None:
+            # Note: this branch avoids `copy=None` for NumPy 1.x support
+            return np.asarray(self._data, dtype=dtype)
+
+        return np.array(self._data, dtype=dtype, copy=copy)
 
     def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs):
         if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs):

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -1311,6 +1311,15 @@ def copy(  # type: ignore[override]
 
     def __array__(self, dtype=None, copy=None) -> np.ndarray:
         """the array interface, return my values"""
+        if copy is False:
+            # self.values is always a newly constructed array, so raise.
+            raise ValueError(
+                "Unable to avoid copy while creating an array as requested."
+            )
+        if copy is True:
+            # explicit np.array call to ensure a copy is made and unique objects
+            # are returned, because self.values is cached
+            return np.array(self.values, dtype=dtype)
         return self.values
 
     def view(self, cls=None) -> Self:

diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -305,12 +305,12 @@ def ndarray_to_mgr(
 
     elif isinstance(values, (np.ndarray, ExtensionArray)):
         # drop subclass info
-        _copy = (
-            copy_on_sanitize
-            if (dtype is None or astype_is_view(values.dtype, dtype))
-            else False
-        )
-        values = np.array(values, copy=_copy)
+        if copy_on_sanitize and (dtype is None or astype_is_view(values.dtype, dtype)):
+            # only force a copy now if copy=True was requested
+            # and a subsequent `astype` will not already result in a copy
+            values = np.array(values, copy=True, order="F")
+        else:
+            values = np.asarray(values)
         values = _ensure_2d(values)
 
     else:

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -991,7 +991,7 @@ def __array__(
             the dtype is inferred from the data.
 
         copy : bool or None, optional
-            Unused.
+            See :func:`numpy.asarray`.
 
         Returns
         -------
@@ -1028,8 +1028,17 @@ def __array__(
               dtype='datetime64[ns]')
         """
         values = self._values
-        arr = np.asarray(values, dtype=dtype)
-        if using_copy_on_write() and astype_is_view(values.dtype, arr.dtype):
+        if copy is None:
+            # Note: branch avoids `copy=None` for NumPy 1.x support
+            arr = np.asarray(values, dtype=dtype)
+        else:
+            arr = np.array(values, dtype=dtype, copy=copy)
+
+        if copy is True:
+            return arr
+        if using_copy_on_write() and (
+            copy is False or astype_is_view(values.dtype, arr.dtype)
+        ):
             arr = arr.view()
             arr.flags.writeable = False
         return arr

diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
@@ -4,6 +4,7 @@
 import pytest
 
 from pandas._libs.sparse import IntIndex
+from pandas.compat.numpy import np_version_gt2
 
 import pandas as pd
 from pandas import (
@@ -478,3 +479,33 @@ def test_zero_sparse_column():
 
     expected = pd.DataFrame({"A": SparseArray([0, 0]), "B": [1, 3]}, index=[0, 2])
     tm.assert_frame_equal(result, expected)
+
+
+def test_array_interface(arr_data, arr):
+    # https://github.com/pandas-dev/pandas/pull/60046
+    result = np.asarray(arr)
+    tm.assert_numpy_array_equal(result, arr_data)
+
+    # it always gives a copy by default
+    result_copy1 = np.asarray(arr)
+    result_copy2 = np.asarray(arr)
+    assert not np.may_share_memory(result_copy1, result_copy2)
+
+    # or with explicit copy=True
+    result_copy1 = np.array(arr, copy=True)
+    result_copy2 = np.array(arr, copy=True)
+    assert not np.may_share_memory(result_copy1, result_copy2)
+
+    if not np_version_gt2:
+        # copy=False semantics are only supported in NumPy>=2.
+        return
+
+    # for sparse arrays, copy=False is never allowed
+    with pytest.raises(ValueError, match="Unable to avoid copy while creating"):
+        np.array(arr, copy=False)
+
+    # except when there are actually no sparse filled values
+    arr2 = SparseArray(np.array([1, 2, 3]))
+    result_nocopy1 = np.array(arr2, copy=False)
+    result_nocopy2 = np.array(arr2, copy=False)
+    assert np.may_share_memory(result_nocopy1, result_nocopy2)
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
@@ -1148,9 +1148,17 @@ def test_array_interface(self, arr1d):
         result = np.asarray(arr, dtype=object)
         tm.assert_numpy_array_equal(result, expected)
 
+        # to int64 gives the underlying representation
         result = np.asarray(arr, dtype="int64")
         tm.assert_numpy_array_equal(result, arr.asi8)
 
+        result2 = np.asarray(arr, dtype="int64")
+        assert np.may_share_memory(result, result2)
+
+        result_copy1 = np.array(arr, dtype="int64", copy=True)
+        result_copy2 = np.array(arr, dtype="int64", copy=True)
+        assert not np.may_share_memory(result_copy1, result_copy2)
+
         # to other dtypes
         msg = r"float\(\) argument must be a string or a( real)? number, not 'Period'"
         with pytest.raises(TypeError, match=msg):