Skip to content

Commit 065a320

Browse files
authored
Backport PR #52008 on branch 2.0.x (CoW: Avoid copying Index in Series constructor) (#52048)
CoW: Avoid copying Index in Series constructor (#52008)
1 parent a498448 commit 065a320

File tree

14 files changed

+108
-30
lines changed

14 files changed

+108
-30
lines changed

doc/source/whatsnew/v2.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1212,6 +1212,7 @@ Conversion
12121212
- Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`)
12131213
- Bug in :meth:`TimedeltaArray.astype` raising ``TypeError`` when converting to a pyarrow duration type (:issue:`49795`)
12141214
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` raising for extension array dtypes (:issue:`29618`, :issue:`50261`, :issue:`31913`)
1215+
- Bug in :class:`Series` not copying data when created from an :class:`Index` and the passed ``dtype`` is equal to the ``dtype`` of the :class:`Index` (:issue:`52008`)
12151216

12161217
Strings
12171218
^^^^^^^

pandas/_libs/internals.pyi

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ class BlockManager:
9696

9797
class BlockValuesRefs:
9898
referenced_blocks: list[weakref.ref]
99-
def __init__(self, blk: SharedBlock) -> None: ...
99+
def __init__(self, blk: SharedBlock | None = ...) -> None: ...
100100
def add_reference(self, blk: SharedBlock) -> None: ...
101+
def add_index_reference(self, index: object) -> None: ...
101102
def has_reference(self) -> bool: ...

pandas/_libs/internals.pyx

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -877,8 +877,11 @@ cdef class BlockValuesRefs:
877877
cdef:
878878
public list referenced_blocks
879879

880-
def __cinit__(self, blk: SharedBlock) -> None:
881-
self.referenced_blocks = [weakref.ref(blk)]
880+
def __cinit__(self, blk: SharedBlock | None = None) -> None:
881+
if blk is not None:
882+
self.referenced_blocks = [weakref.ref(blk)]
883+
else:
884+
self.referenced_blocks = []
882885

883886
def add_reference(self, blk: SharedBlock) -> None:
884887
"""Adds a new reference to our reference collection.

pandas/core/indexes/base.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
index as libindex,
3232
lib,
3333
)
34+
from pandas._libs.internals import BlockValuesRefs
3435
import pandas._libs.join as libjoin
3536
from pandas._libs.lib import (
3637
is_datetime_array,
@@ -653,9 +654,11 @@ def _simple_new(
653654
result._name = name
654655
result._cache = {}
655656
result._reset_identity()
656-
result._references = refs
657657
if refs is not None:
658-
refs.add_index_reference(result)
658+
result._references = refs
659+
else:
660+
result._references = BlockValuesRefs()
661+
result._references.add_index_reference(result)
659662

660663
return result
661664

pandas/core/indexes/interval.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,8 @@ def inferred_type(self) -> str:
390390
"""Return a string of the type inferred from the values"""
391391
return "interval"
392392

393-
@Appender(Index.memory_usage.__doc__)
393+
# Cannot determine type of "memory_usage"
394+
@Appender(Index.memory_usage.__doc__) # type: ignore[has-type]
394395
def memory_usage(self, deep: bool = False) -> int:
395396
# we don't use an explicit engine
396397
# so return the bytes here

pandas/core/indexes/multi.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1233,7 +1233,8 @@ def f(level) -> bool:
12331233

12341234
return any(f(level) for level in self._inferred_type_levels)
12351235

1236-
@doc(Index.memory_usage)
1236+
# Cannot determine type of "memory_usage"
1237+
@doc(Index.memory_usage) # type: ignore[has-type]
12371238
def memory_usage(self, deep: bool = False) -> int:
12381239
# we are overwriting our base class to avoid
12391240
# computing .values here which could materialize

pandas/core/internals/managers.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@
2222
internals as libinternals,
2323
lib,
2424
)
25-
from pandas._libs.internals import BlockPlacement
25+
from pandas._libs.internals import (
26+
BlockPlacement,
27+
BlockValuesRefs,
28+
)
2629
from pandas._typing import (
2730
ArrayLike,
2831
AxisInt,
@@ -1868,11 +1871,13 @@ def from_blocks(
18681871
return cls(blocks[0], axes[0], verify_integrity=False)
18691872

18701873
@classmethod
1871-
def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager:
1874+
def from_array(
1875+
cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None
1876+
) -> SingleBlockManager:
18721877
"""
18731878
Constructor for if we have an array that is not yet a Block.
18741879
"""
1875-
block = new_block(array, placement=slice(0, len(index)), ndim=1)
1880+
block = new_block(array, placement=slice(0, len(index)), ndim=1, refs=refs)
18761881
return cls(block, index)
18771882

18781883
def to_2d_mgr(self, columns: Index) -> BlockManager:

pandas/core/series.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -426,10 +426,15 @@ def __init__(
426426
raise NotImplementedError(
427427
"initializing a Series from a MultiIndex is not supported"
428428
)
429+
430+
refs = None
429431
if isinstance(data, Index):
430432
if dtype is not None:
431-
# astype copies
432-
data = data.astype(dtype)
433+
data = data.astype(dtype, copy=False)
434+
435+
if using_copy_on_write():
436+
refs = data._references
437+
data = data._values
433438
else:
434439
# GH#24096 we need to ensure the index remains immutable
435440
data = data._values.copy()
@@ -493,7 +498,7 @@ def __init__(
493498

494499
manager = get_option("mode.data_manager")
495500
if manager == "block":
496-
data = SingleBlockManager.from_array(data, index)
501+
data = SingleBlockManager.from_array(data, index, refs=refs)
497502
elif manager == "array":
498503
data = SingleArrayManager.from_array(data, index)
499504

pandas/tests/copy_view/test_astype.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ def test_astype_different_timezones(using_copy_on_write):
173173
result = df.astype("datetime64[ns, Europe/Berlin]")
174174
if using_copy_on_write:
175175
assert not result._mgr._has_no_reference(0)
176-
assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a").asi8)
176+
assert np.shares_memory(get_array(df, "a"), get_array(result, "a"))
177177

178178

179179
def test_astype_different_timezones_different_reso(using_copy_on_write):
@@ -183,9 +183,7 @@ def test_astype_different_timezones_different_reso(using_copy_on_write):
183183
result = df.astype("datetime64[ms, Europe/Berlin]")
184184
if using_copy_on_write:
185185
assert result._mgr._has_no_reference(0)
186-
assert not np.shares_memory(
187-
get_array(df, "a").asi8, get_array(result, "a").asi8
188-
)
186+
assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))
189187

190188

191189
@pytest.mark.skipif(pa_version_under7p0, reason="pyarrow not installed")
@@ -202,7 +200,7 @@ def test_astype_arrow_timestamp(using_copy_on_write):
202200
result = df.astype("timestamp[ns][pyarrow]")
203201
if using_copy_on_write:
204202
assert not result._mgr._has_no_reference(0)
205-
assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a")._data)
203+
assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._data)
206204

207205

208206
def test_convert_dtypes_infer_objects(using_copy_on_write):

pandas/tests/copy_view/test_constructors.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,14 @@
33

44
from pandas import (
55
DataFrame,
6+
DatetimeIndex,
7+
Index,
8+
Period,
9+
PeriodIndex,
610
Series,
11+
Timedelta,
12+
TimedeltaIndex,
13+
Timestamp,
714
)
815
import pandas._testing as tm
916
from pandas.tests.copy_view.util import get_array
@@ -82,6 +89,35 @@ def test_series_from_series_with_reindex(using_copy_on_write):
8289
assert not result._mgr.blocks[0].refs.has_reference()
8390

8491

92+
@pytest.mark.parametrize(
93+
"idx",
94+
[
95+
Index([1, 2]),
96+
DatetimeIndex([Timestamp("2019-12-31"), Timestamp("2020-12-31")]),
97+
PeriodIndex([Period("2019-12-31"), Period("2020-12-31")]),
98+
TimedeltaIndex([Timedelta("1 days"), Timedelta("2 days")]),
99+
],
100+
)
101+
def test_series_from_index(using_copy_on_write, idx):
102+
ser = Series(idx)
103+
expected = idx.copy(deep=True)
104+
if using_copy_on_write:
105+
assert np.shares_memory(get_array(ser), get_array(idx))
106+
assert not ser._mgr._has_no_reference(0)
107+
else:
108+
assert not np.shares_memory(get_array(ser), get_array(idx))
109+
ser.iloc[0] = ser.iloc[1]
110+
tm.assert_index_equal(idx, expected)
111+
112+
113+
def test_series_from_index_different_dtypes(using_copy_on_write):
114+
idx = Index([1, 2, 3], dtype="int64")
115+
ser = Series(idx, dtype="int32")
116+
assert not np.shares_memory(get_array(ser), get_array(idx))
117+
if using_copy_on_write:
118+
assert ser._mgr._has_no_reference(0)
119+
120+
85121
@pytest.mark.parametrize("func", [lambda x: x, lambda x: x._mgr])
86122
@pytest.mark.parametrize("columns", [None, ["a"]])
87123
def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, func):

0 commit comments

Comments
 (0)