From a2dd343b4e517e9a54a61ad4dab17fd73a5d1829 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Sun, 16 Dec 2018 21:21:29 -0600
Subject: [PATCH 1/3] API: Standard signature for to_numpy

This is part 1 of https://github.com/pandas-dev/pandas/issues/23995

We standardize the signature of `to_numpy` on Series, Index, and DataFrame to

`to_numpy(dtype : Union[str, np.dtype], copy : bool) -> ndarray`
---
 doc/source/whatsnew/v0.24.0.rst               | 19 ++++++
 pandas/core/base.py                           | 60 +++++++++++++++----
 pandas/core/frame.py                          | 25 +++++---
 pandas/tests/frame/test_api.py                | 13 ++++
 pandas/tests/indexes/multi/test_conversion.py |  6 ++
 pandas/tests/test_base.py                     | 40 +++++++++++++
 6 files changed, 144 insertions(+), 19 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
index 689f5cc7951af..5cf3d07310913 100644
--- a/doc/source/whatsnew/v0.24.0.rst
+++ b/doc/source/whatsnew/v0.24.0.rst
@@ -73,6 +73,25 @@ as ``.values``).
    ser.array
    ser.to_numpy()
 
+:meth:`~Series.to_numpy` gives some control over the ``dtype`` of the resulting :class:`ndarray`,
+which :attr:`~Series.values` couldn't provide. For example, consider datetimes with timezones.
+NumPy doesn't have a dtype to represent datetimes with timezones, so there are two possibly
+useful representations:
+
+1. An object-dtype :class:`ndarray` with :class:`Timestamp` objects, each with the correct ``tz``
+2. A ``datetime64[ns]`` -dtype :class:`ndarray`, where the values have been converted to UTC and the timezone discarded
+
+Timezones may be preserved with ``dtype=object``
+
+.. ipython:: python
+
+   ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
+   ser.to_numpy(dtype=object)
+
+Or thrown away with ``dtype='datetime64[ns]'``
+
+   ser.to_numpy(dtype="datetime64[ns]")
+
 We haven't removed or deprecated :attr:`Series.values` or :attr:`DataFrame.values`, but we
 recommend and using ``.array`` or ``.to_numpy()`` instead.
 
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 928e90977f95b..fff5ac7538e64 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -841,18 +841,22 @@ def array(self):
         """
         return self._values
 
-    def to_numpy(self):
+    def to_numpy(self, dtype=None, copy=False):
         """
         A NumPy ndarray representing the values in this Series or Index.
 
         .. versionadded:: 0.24.0
 
-        The returned array will be the same up to equality (values equal
-        in `self` will be equal in the returned array; likewise for values
-        that are not equal). When `self` contains an ExtensionArray, the
-        dtype may be different. For example, for a category-dtype Series,
-        ``to_numpy()`` will return a NumPy array and the categorical dtype
-        will be lost.
+
+        Parameters
+        ----------
+        dtype : str or numpy.dtype, optional
+            The dtype to pass to :func:`numpy.asarray`
+        copy : bool, default False
+            Whether to ensure that the returned value is not a view on
+            another array. Note that ``copy=False`` does not *ensure* that
+            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
+            a copy is made, even if not strictly necessary.
 
         Returns
         -------
@@ -866,10 +870,18 @@ def to_numpy(self):
 
         Notes
         -----
+        The returned array will be the same up to equality (values equal
+        in `self` will be equal in the returned array; likewise for values
+        that are not equal). When `self` contains an ExtensionArray, the
+        dtype may be different. For example, for a category-dtype Series,
+        ``to_numpy()`` will return a NumPy array and the categorical dtype
+        will be lost.
+
+
         For NumPy dtypes, this will be a reference to the actual data stored
-        in this Series or Index. Modifying the result in place will modify
-        the data stored in the Series or Index (not that we recommend doing
-        that).
+        in this Series or Index (assuming ``copy=False``). Modifying the result
+        in place will modify the data stored in the Series or Index (not that
+        we recommend doing that).
 
         For extension types, ``to_numpy()`` *may* require copying data and
         coercing the result to a NumPy type (possibly object), which may be
@@ -894,12 +906,36 @@ def to_numpy(self):
         >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
         >>> ser.to_numpy()
         array(['a', 'b', 'a'], dtype=object)
+
+        Specify the `dtype` to control how timezone-aware datetimes are
+        represented. Use ``dtype=object`` to return an ndarray of pandas
+        :class:`Timestamp` objects, each with the correct ``tz``.
+
+        >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
+        >>> ser.to_numpy(dtype=object)
+        array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'),
+               Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')],
+              dtype=object)
+
+        Or ``dtype='datetime64[ns]'`` to return an ndarray of native
+        datetime64 values. The values are converted to UTC and the timezone
+        info is dropped.
+
+        >>> ser.to_numpy(dtype="datetime64[ns]")
+        ... # doctest: +ELLIPSIS
+        array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
+              dtype='datetime64[ns]')
         """
         if (is_extension_array_dtype(self.dtype) or
                 is_datetime64tz_dtype(self.dtype)):
             # TODO(DatetimeArray): remove the second clause.
-            return np.asarray(self._values)
-        return self._values
+            result = np.asarray(self._values, dtype=dtype)
+        else:
+            result = self._values
+
+        if copy:
+            result = result.copy()
+        return result
 
     @property
     def _ndarray_values(self):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 6b74fd7e06de9..21085f4e1bf8f 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1072,17 +1072,27 @@ def from_dict(cls, data, orient='columns', dtype=None, columns=None):
 
         return cls(data, index=index, columns=columns, dtype=dtype)
 
-    def to_numpy(self):
+    def to_numpy(self, dtype=None, copy=False):
         """
         Convert the DataFrame to a NumPy array.
 
         .. versionadded:: 0.24.0
 
-        The dtype of the returned array will be the common NumPy
-        dtype of all types in the DataFrame. For example,
-        if the dtypes are ``float16`` and ``float32``, the results
-        dtype will be ``float32``. This may require copying data and
-        coercing values, which may be expensive.
+        By default, the dtype of the returned array will be the common NumPy
+        dtype of all types in the DataFrame. For example, if the dtypes are
+        ``float16`` and ``float32``, the results dtype will be ``float32``.
+        This may require copying data and coercing values, which may be
+        expensive.
+
+        Parameters
+        ----------
+        dtype : str or numpy.dtype, optional
+            The dtype to pass to :func:`numpy.array`
+        copy : bool, default False
+            Whether to ensure that the returned value is not a view on
+            another array. Note that ``copy=False`` does not *ensure* that
+            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
+            a copy is made, even if not strictly necessary.
 
         Returns
         -------
@@ -1114,7 +1124,8 @@ def to_numpy(self):
         array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
                [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
         """
-        return self.values
+        result = np.array(self.values, dtype=dtype, copy=copy)
+        return result
 
     def to_dict(self, orient='dict', into=dict):
         """
diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
index 074745429af0d..e434647abeb73 100644
--- a/pandas/tests/frame/test_api.py
+++ b/pandas/tests/frame/test_api.py
@@ -325,6 +325,19 @@ def test_to_numpy(self):
         result = df.to_numpy()
         tm.assert_numpy_array_equal(result, expected)
 
+    def test_to_numpy_dtype(self):
+        df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]})
+        expected = np.array([[1, 3], [2, 4]], dtype="int64")
+        result = df.to_numpy(dtype="int64")
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_to_numpy_copy(self):
+        arr = np.random.randn(4, 3)
+        df = pd.DataFrame(arr)
+        assert df.values.base is arr
+        assert df.to_numpy(copy=False).base is arr
+        assert df.to_numpy(copy=True).base is None
+
     def test_transpose(self, float_frame):
         frame = float_frame
         dft = frame.T
diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py
index 0c483873a335e..b2adf42092a91 100644
--- a/pandas/tests/indexes/multi/test_conversion.py
+++ b/pandas/tests/indexes/multi/test_conversion.py
@@ -17,6 +17,12 @@ def test_tolist(idx):
     assert result == exp
 
 
+def test_to_numpy(idx):
+    result = idx.to_numpy()
+    exp = idx.values
+    tm.assert_numpy_array_equal(result, exp)
+
+
 def test_to_frame():
     tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]
 
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
index ced7d0e75fd7d..6eada0e89b506 100644
--- a/pandas/tests/test_base.py
+++ b/pandas/tests/test_base.py
@@ -1301,3 +1301,43 @@ def test_to_numpy(array, expected, box):
 
     result = thing.to_numpy()
     tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("as_series", [True, False])
+@pytest.mark.parametrize("arr", [
+    np.array([1, 2, 3], dtype="int64"),
+    np.array(['a', 'b', 'c'], dtype=object),
+])
+def test_to_numpy_copy(arr, as_series):
+    obj = pd.Index(arr, copy=False)
+    if as_series:
+        obj = pd.Series(obj.values, copy=False)
+
+    # no copy by default
+    result = obj.to_numpy()
+    assert np.shares_memory(arr, result) is True
+
+    result = obj.to_numpy(copy=False)
+    assert np.shares_memory(arr, result) is True
+
+    # copy=True
+    result = obj.to_numpy(copy=True)
+    assert np.shares_memory(arr, result) is False
+
+
+@pytest.mark.parametrize("as_series", [True, False])
+def test_to_numpy_dtype(as_series):
+    tz = "US/Eastern"
+    obj = pd.DatetimeIndex(['2000', '2001'], tz=tz)
+    if as_series:
+        obj = pd.Series(obj)
+    result = obj.to_numpy(dtype=object)
+    expected = np.array([pd.Timestamp('2000', tz=tz),
+                         pd.Timestamp('2001', tz=tz)],
+                        dtype=object)
+    tm.assert_numpy_array_equal(result, expected)
+
+    result = obj.to_numpy()
+    expected = np.array(['2000-01-01T05', '2001-01-01T05'],
+                        dtype='M8[ns]')
+    tm.assert_numpy_array_equal(result, expected)

From 4059773625eda353b7687a98270c27d0f8d1c752 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 18 Dec 2018 15:18:44 -0600
Subject: [PATCH 2/3] move to basics

---
 doc/source/basics.rst           | 21 +++++++++++++++++++++
 doc/source/whatsnew/v0.24.0.rst | 19 -------------------
 2 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index a188a5716f7bc..0f6b5cd0b5e43 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -86,6 +86,27 @@ be the same as :attr:`~Series.array`. When the Series or Index is backed by
 a :class:`~pandas.api.extension.ExtensionArray`, :meth:`~Series.to_numpy`
 may involve copying data and coercing values.
 
+:meth:`~Series.to_numpy` gives some control over the ``dtype`` of the
+resulting :class:`ndarray`. For example, consider datetimes with timezones.
+NumPy doesn't have a dtype to represent timezone-aware datetimes, so there
+are two possibly useful representations:
+
+1. An object-dtype :class:`ndarray` with :class:`Timestamp` objects, each
+   with the correct ``tz``
+2. A ``datetime64[ns]``-dtype :class:`ndarray`, where the values have
+   been converted to UTC and the timezone discarded
+
+Timezones may be preserved with ``dtype=object``
+
+.. ipython:: python
+
+   ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
+   ser.to_numpy(dtype=object)
+
+Or thrown away with ``dtype='datetime64[ns]'``::
+
+   ser.to_numpy(dtype="datetime64[ns]")
+
 Getting the "raw data" inside a :class:`DataFrame` is possibly a bit more
 complex. When your ``DataFrame`` only has a single data type for all the
 columns, :attr:`DataFrame.to_numpy` will return the underlying data:
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
index 5cf3d07310913..689f5cc7951af 100644
--- a/doc/source/whatsnew/v0.24.0.rst
+++ b/doc/source/whatsnew/v0.24.0.rst
@@ -73,25 +73,6 @@ as ``.values``).
    ser.array
    ser.to_numpy()
 
-:meth:`~Series.to_numpy` gives some control over the ``dtype`` of the resulting :class:`ndarray`,
-which :attr:`~Series.values` couldn't provide. For example, consider datetimes with timezones.
-NumPy doesn't have a dtype to represent datetimes with timezones, so there are two possibly
-useful representations:
-
-1. An object-dtype :class:`ndarray` with :class:`Timestamp` objects, each with the correct ``tz``
-2. A ``datetime64[ns]`` -dtype :class:`ndarray`, where the values have been converted to UTC and the timezone discarded
-
-Timezones may be preserved with ``dtype=object``
-
-.. ipython:: python
-
-   ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
-   ser.to_numpy(dtype=object)
-
-Or thrown away with ``dtype='datetime64[ns]'``
-
-   ser.to_numpy(dtype="datetime64[ns]")
-
 We haven't removed or deprecated :attr:`Series.values` or :attr:`DataFrame.values`, but we
 recommend and using ``.array`` or ``.to_numpy()`` instead.
 

From 0b796a9f9b78dce5c15cc6135a000f7d1d938985 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 18 Dec 2018 16:40:11 -0600
Subject: [PATCH 3/3] Add TODO for double copy

---
 pandas/core/base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/core/base.py b/pandas/core/base.py
index fff5ac7538e64..46f61c353056e 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -929,6 +929,7 @@ def to_numpy(self, dtype=None, copy=False):
         if (is_extension_array_dtype(self.dtype) or
                 is_datetime64tz_dtype(self.dtype)):
             # TODO(DatetimeArray): remove the second clause.
+            # TODO(GH-24345): Avoid potential double copy
             result = np.asarray(self._values, dtype=dtype)
         else:
             result = self._values