diff --git a/doc/source/basics.rst b/doc/source/basics.rst index a188a5716f7bc..0f6b5cd0b5e43 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -86,6 +86,29 @@ be the same as :attr:`~Series.array`. When the Series or Index is backed by a :class:`~pandas.api.extension.ExtensionArray`, :meth:`~Series.to_numpy` may involve copying data and coercing values. +:meth:`~Series.to_numpy` gives some control over the ``dtype`` of the +resulting :class:`ndarray`. For example, consider datetimes with timezones. +NumPy doesn't have a dtype to represent timezone-aware datetimes, so there +are two possibly useful representations: + +1. An object-dtype :class:`ndarray` with :class:`Timestamp` objects, each + with the correct ``tz`` +2. A ``datetime64[ns]`` -dtype :class:`ndarray`, where the values have + been converted to UTC and the timezone discarded + +Timezones may be preserved with ``dtype=object`` + +.. ipython:: python + + ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) + ser.to_numpy(dtype=object) + +Or thrown away with ``dtype='datetime64[ns]'`` + +.. ipython:: python + + ser.to_numpy(dtype="datetime64[ns]") + Getting the "raw data" inside a :class:`DataFrame` is possibly a bit more complex. When your ``DataFrame`` only has a single data type for all the columns, :attr:`DataFrame.to_numpy` will return the underlying data: diff --git a/pandas/core/base.py b/pandas/core/base.py index 928e90977f95b..46f61c353056e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -841,18 +841,22 @@ def array(self): """ return self._values - def to_numpy(self): + def to_numpy(self, dtype=None, copy=False): """ A NumPy ndarray representing the values in this Series or Index. .. versionadded:: 0.24.0 - The returned array will be the same up to equality (values equal - in `self` will be equal in the returned array; likewise for values - that are not equal). When `self` contains an ExtensionArray, the - dtype may be different. 
For example, for a category-dtype Series, - ``to_numpy()`` will return a NumPy array and the categorical dtype - will be lost. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :func:`numpy.asarray` + copy : bool, default False + Whether to ensure that the returned value is not a view on + another array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that + a copy is made, even if not strictly necessary. Returns ------- @@ -866,10 +870,18 @@ def to_numpy(self): Notes ----- + The returned array will be the same up to equality (values equal + in `self` will be equal in the returned array; likewise for values + that are not equal). When `self` contains an ExtensionArray, the + dtype may be different. For example, for a category-dtype Series, + ``to_numpy()`` will return a NumPy array and the categorical dtype + will be lost. + + For NumPy dtypes, this will be a reference to the actual data stored - in this Series or Index. Modifying the result in place will modify - the data stored in the Series or Index (not that we recommend doing - that). + in this Series or Index (assuming ``copy=False``). Modifying the result + in place will modify the data stored in the Series or Index (not that + we recommend doing that). For extension types, ``to_numpy()`` *may* require copying data and coercing the result to a NumPy type (possibly object), which may be @@ -894,12 +906,37 @@ def to_numpy(self): >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.to_numpy() array(['a', 'b', 'a'], dtype=object) + + Specify the `dtype` to control how timezone-aware data is represented. + Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp` + objects, each with the correct ``tz``. 
+ + >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) + >>> ser.to_numpy(dtype=object) + array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'), + Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')], + dtype=object) + + Or ``dtype='datetime64[ns]'`` to return an ndarray of native + datetime64 values. The values are converted to UTC and the timezone + info is dropped. + + >>> ser.to_numpy(dtype="datetime64[ns]") + ... # doctest: +ELLIPSIS + array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], + dtype='datetime64[ns]') """ if (is_extension_array_dtype(self.dtype) or is_datetime64tz_dtype(self.dtype)): # TODO(DatetimeArray): remove the second clause. - return np.asarray(self._values) - return self._values + # TODO(GH-24345): Avoid potential double copy + result = np.asarray(self._values, dtype=dtype) + else: + result = np.asarray(self._values, dtype=dtype) + + if copy: + result = result.copy() + return result @property def _ndarray_values(self): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6b74fd7e06de9..21085f4e1bf8f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1072,17 +1072,27 @@ def from_dict(cls, data, orient='columns', dtype=None, columns=None): return cls(data, index=index, columns=columns, dtype=dtype) - def to_numpy(self): + def to_numpy(self, dtype=None, copy=False): """ Convert the DataFrame to a NumPy array. .. versionadded:: 0.24.0 - The dtype of the returned array will be the common NumPy - dtype of all types in the DataFrame. For example, - if the dtypes are ``float16`` and ``float32``, the results - dtype will be ``float32``. This may require copying data and - coercing values, which may be expensive. + By default, the dtype of the returned array will be the common NumPy + dtype of all types in the DataFrame. For example, if the dtypes are + ``float16`` and ``float32``, the results dtype will be ``float32``. + This may require copying data and coercing values, which may be + expensive. 
+ + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :func:`numpy.asarray` + copy : bool, default False + Whether to ensure that the returned value is not a view on + another array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that + a copy is made, even if not strictly necessary. Returns ------- @@ -1114,7 +1124,8 @@ def to_numpy(self): array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) """ - return self.values + result = np.array(self.values, dtype=dtype, copy=copy) + return result def to_dict(self, orient='dict', into=dict): """ diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 074745429af0d..e434647abeb73 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -325,6 +325,19 @@ def test_to_numpy(self): result = df.to_numpy() tm.assert_numpy_array_equal(result, expected) + def test_to_numpy_dtype(self): + df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]}) + expected = np.array([[1, 3], [2, 4]], dtype="int64") + result = df.to_numpy(dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + def test_to_numpy_copy(self): + arr = np.random.randn(4, 3) + df = pd.DataFrame(arr) + assert df.values.base is arr + assert df.to_numpy(copy=False).base is arr + assert df.to_numpy(copy=True).base is None + def test_transpose(self, float_frame): frame = float_frame dft = frame.T diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 0c483873a335e..b2adf42092a91 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -17,6 +17,12 @@ def test_tolist(idx): assert result == exp +def test_to_numpy(idx): + result = idx.to_numpy() + exp = idx.values + tm.assert_numpy_array_equal(result, exp) + + def test_to_frame(): tuples = [(1, 'one'), (1, 
'two'), (2, 'one'), (2, 'two')] diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index ced7d0e75fd7d..6eada0e89b506 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1301,3 +1301,43 @@ def test_to_numpy(array, expected, box): result = thing.to_numpy() tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("as_series", [True, False]) +@pytest.mark.parametrize("arr", [ + np.array([1, 2, 3], dtype="int64"), + np.array(['a', 'b', 'c'], dtype=object), +]) +def test_to_numpy_copy(arr, as_series): + obj = pd.Index(arr, copy=False) + if as_series: + obj = pd.Series(obj.values, copy=False) + + # no copy by default + result = obj.to_numpy() + assert np.shares_memory(arr, result) is True + + result = obj.to_numpy(copy=False) + assert np.shares_memory(arr, result) is True + + # copy=True + result = obj.to_numpy(copy=True) + assert np.shares_memory(arr, result) is False + + +@pytest.mark.parametrize("as_series", [True, False]) +def test_to_numpy_dtype(as_series): + tz = "US/Eastern" + obj = pd.DatetimeIndex(['2000', '2001'], tz=tz) + if as_series: + obj = pd.Series(obj) + result = obj.to_numpy(dtype=object) + expected = np.array([pd.Timestamp('2000', tz=tz), + pd.Timestamp('2001', tz=tz)], + dtype=object) + tm.assert_numpy_array_equal(result, expected) + + result = obj.to_numpy(dtype='M8[ns]') + expected = np.array(['2000-01-01T05', '2001-01-01T05'], + dtype='M8[ns]') + tm.assert_numpy_array_equal(result, expected)