From fdedea44e1317cc98fdc2d4f84c2f33f7f0e4d02 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Feb 2023 09:43:25 +0100 Subject: [PATCH 1/7] ENH: Optimize putmask implementation for CoW --- pandas/conftest.py | 1 + pandas/core/generic.py | 4 +++ pandas/core/internals/blocks.py | 38 +++++++++++++++++--- pandas/core/internals/managers.py | 9 +---- pandas/tests/copy_view/test_methods.py | 50 ++++++++++++++++++++++++-- 5 files changed, 87 insertions(+), 15 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 2c410bb98b506..2a5c0395dfb4e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1944,6 +1944,7 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ + pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e4f7bb43b23dc..4de7355c8fb83 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9582,6 +9582,8 @@ def _where( cond = common.apply_if_callable(cond, self) if isinstance(cond, NDFrame): cond, _ = cond.align(self, join="right", broadcast_axis=1, copy=False) + # CoW: Make sure reference goes out of scope + _ = None else: if not hasattr(cond, "shape"): cond = np.asanyarray(cond) @@ -9625,6 +9627,8 @@ def _where( fill_value=None, copy=False, ) + # CoW: Make sure reference goes out of scope + _ = None # if we are NOT aligned, raise as we cannot where index if axis is None and not other._indexed_same(self): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 115ae5dc6bb9d..9dfd24c2df082 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -915,7 +915,7 @@ def _unstack( # --------------------------------------------------------------------- - def setitem(self, indexer, value) -> Block: + def setitem(self, indexer, value, using_cow: bool = False) -> Block: """ Attempt self.values[indexer] = value, possibly creating a new array. @@ -960,10 +960,19 @@ def setitem(self, indexer, value) -> Block: # checking lib.is_scalar here fails on # test_iloc_setitem_custom_object casted = setitem_datetimelike_compat(values, len(vi), casted) + + if using_cow and self.refs.has_reference(): + values = values.copy() + self = type(self)( + values.T if values.ndim == 2 else values, + self._mgr_locs, + ndim=self.ndim, + ) + values[indexer] = casted return self - def putmask(self, mask, new) -> list[Block]: + def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: """ putmask the data to the block; it is possible that we may create a new dtype of block @@ -991,11 +1000,21 @@ def putmask(self, mask, new) -> list[Block]: new = extract_array(new, extract_numpy=True) if noop: + if using_cow: + return [self.copy(deep=False)] return [self] try: casted = np_can_hold_element(values.dtype, new) + + if using_cow and self.refs.has_reference(): + # Do this here to avoid copying twice + values = values.copy() + self = type(self)(values, self._mgr_locs, ndim=self.ndim) + putmask_without_repeat(values.T, mask, casted) + if using_cow: + return [self.copy(deep=False)] return [self] except LossySetitemError: @@ -1007,7 +1026,7 @@ def putmask(self, mask, new) -> list[Block]: return self.coerce_to_target_dtype(new).putmask(mask, new) else: indexer = mask.nonzero()[0] - nb = self.setitem(indexer, new[indexer]) + nb = self.setitem(indexer, new[indexer], using_cow=using_cow) return [nb] else: @@ -1022,7 +1041,7 @@ def putmask(self, mask, new) -> list[Block]: n = new[:, i : i + 1] submask = orig_mask[:, i : i + 1] - rbs = nb.putmask(submask, n) + rbs = nb.putmask(submask, n, using_cow=using_cow) res_blocks.extend(rbs) return res_blocks @@ -1523,7 +1542,7 @@ def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]: nb = self.make_block_same_class(res_values) return [nb] - def putmask(self, mask, new) -> list[Block]: + def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: """ See Block.putmask.__doc__ """ @@ -1541,8 +1560,17 @@ def putmask(self, mask, new) -> list[Block]: mask = self._maybe_squeeze_arg(mask) if not mask.any(): + if using_cow: + return [self.copy(deep=False)] return [self] + if using_cow and self.refs.has_reference(): + # Do this here to avoid copying twice + values = values.copy() + self = type(self)( + values.T if values.ndim == 2 else values, self._mgr_locs, ndim=self.ndim + ) + try: # Caller is responsible for ensuring matching lengths values._putmask(mask, new) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 5d45b33871900..eee892335ed64 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -362,14 +362,6 @@ def setitem(self: T, indexer, value) -> T: return self.apply("setitem", indexer=indexer, value=value) def putmask(self, mask, new, align: bool = True): - if using_copy_on_write() and any( - not self._has_no_reference_block(i) for i in range(len(self.blocks)) - ): - # some reference -> copy full dataframe - # TODO(CoW) this could be optimized to only copy the blocks that would - # get modified - self = self.copy() - if align: align_keys = ["new", "mask"] else: @@ -381,6 +373,7 @@ def putmask(self, mask, new, align: bool = True): align_keys=align_keys, mask=mask, new=new, + using_cow=using_copy_on_write(), ) def diff(self: T, n: int, axis: AxisInt) -> T: diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 6b54345723118..c5edb096263e7 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1237,8 +1237,9 @@ def test_replace(using_copy_on_write, replace_kwargs): tm.assert_frame_equal(df, df_orig) -def test_putmask(using_copy_on_write): - df = DataFrame({"a": [1, 2], "b": 1, "c": 2}) +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +def test_putmask(using_copy_on_write, dtype): + df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype) view = df[:] df_orig = df.copy() df[df == df] = 5 @@ -1252,6 +1253,51 @@ def test_putmask(using_copy_on_write): assert view.iloc[0, 0] == 5 +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +def test_putmask_no_reference(using_copy_on_write, dtype): + df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype) + arr_a = get_array(df, "a") + df[df == df] = 5 + + if using_copy_on_write: + assert np.shares_memory(arr_a, get_array(df, "a")) + + +@pytest.mark.parametrize("dtype", ["float64", "Float64"]) +def test_putmask_aligns_rhs_no_reference(using_copy_on_write, dtype): + df = DataFrame({"a": [1.5, 2], "b": 1.5}, dtype=dtype) + arr_a = get_array(df, "a") + df[df == df] = DataFrame({"a": [5.5, 5]}) + + if using_copy_on_write: + assert np.shares_memory(arr_a, get_array(df, "a")) + + +@pytest.mark.parametrize("val, exp", [(5.5, True), (5, False)]) +def test_putmask_dont_copy_some_blocks(using_copy_on_write, val, exp): + df = DataFrame({"a": [1, 2], "b": 1, "c": 1.5}) + view = df[:] + df_orig = df.copy() + indexer = DataFrame( + [[True, False, False], [True, False, False]], columns=list("abc") + ) + df[indexer] = val + + if using_copy_on_write: + assert not np.shares_memory(get_array(view, "a"), get_array(df, "a")) + # TODO(CoW): Could split blocks to avoid copying the whole block + assert np.shares_memory(get_array(view, "b"), get_array(df, "b")) is exp + assert np.shares_memory(get_array(view, "c"), get_array(df, "c")) + assert df._mgr._has_no_reference(1) is not exp + assert not df._mgr._has_no_reference(2) + tm.assert_frame_equal(view, df_orig) + else: + # Without CoW the original will be modified + assert np.shares_memory(get_array(view, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(view, "c"), get_array(df, "c")) + assert view.iloc[0, 0] == 5 + + def test_asfreq_noop(using_copy_on_write): df = DataFrame( {"a": [0.0, None, 2.0, 3.0]}, From 94cd34a4d1dfeeece8af4422a5265851752973ca Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Feb 2023 18:14:02 +0100 Subject: [PATCH 2/7] Fix --- pandas/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 2a5c0395dfb4e..2c410bb98b506 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1944,7 +1944,6 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" From 1411b6ce9bcb782fe113ea7c28e4309e8ff3a010 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Feb 2023 18:15:39 +0100 Subject: [PATCH 3/7] remove comment --- pandas/core/internals/blocks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1042ee3fc798e..61f7393745ade 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1579,7 +1579,6 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: return [self] if using_cow and self.refs.has_reference(): - # Do this here to avoid copying twice values = values.copy() self = type(self)( values.T if values.ndim == 2 else values, self._mgr_locs, ndim=self.ndim From c4bd3a7eebeb82fcfbb8ca4bae4b583f5680062c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Feb 2023 19:03:21 +0100 Subject: [PATCH 4/7] Refactor --- pandas/core/generic.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ffbfb9f182e75..3213421c866e2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9581,9 +9581,8 @@ def _where( # align the cond to same shape as myself cond = common.apply_if_callable(cond, self) if isinstance(cond, NDFrame): - cond, _ = cond.align(self, join="right", broadcast_axis=1, copy=False) - # CoW: Make sure reference goes out of scope - _ = None + # CoW: Make sure reference is not kept alive + cond = cond.align(self, join="right", broadcast_axis=1, copy=False)[0] else: if not hasattr(cond, "shape"): cond = np.asanyarray(cond) @@ -9619,16 +9618,15 @@ def _where( # align with me if other.ndim <= self.ndim: - _, other = self.align( + # CoW: Make sure reference is not kept alive + other = self.align( other, join="left", axis=axis, level=level, fill_value=None, copy=False, - ) - # CoW: Make sure reference goes out of scope - _ = None + )[1] # if we are NOT aligned, raise as we cannot where index if axis is None and not other._indexed_same(self): From beb70b000b14b7629b34f4a6fb578d926795262f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 9 Feb 2023 20:10:11 +0100 Subject: [PATCH 5/7] Fix test --- pandas/conftest.py | 1 + pandas/tests/copy_view/test_methods.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 2c410bb98b506..a79f70375ba68 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1944,6 +1944,7 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ + # pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index c5edb096263e7..bd64d52ec157c 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1291,8 +1291,8 @@ def test_putmask_dont_copy_some_blocks(using_copy_on_write, val, exp): assert df._mgr._has_no_reference(1) is not exp assert not df._mgr._has_no_reference(2) tm.assert_frame_equal(view, df_orig) - else: - # Without CoW the original will be modified + elif val == 5: + # Without CoW the original will be modified, the other case upcasts, e.g. copy assert np.shares_memory(get_array(view, "a"), get_array(df, "a")) assert np.shares_memory(get_array(view, "c"), get_array(df, "c")) assert view.iloc[0, 0] == 5 From c5c3a2e913acc39a1f95ff9d3dc2ed0b5794f852 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 10 Feb 2023 12:05:24 +0100 Subject: [PATCH 6/7] Address review --- pandas/conftest.py | 1 - pandas/core/internals/blocks.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index a79f70375ba68..2c410bb98b506 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1944,7 +1944,6 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - # pd.options.mode.copy_on_write = True return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 752ce34bf53b2..634bfc7a52ff7 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -956,6 +956,8 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block: The subset of self.values to set value : object The value being set + using_cow: bool, default False + Signaling if CoW is used. Returns ------- @@ -994,10 +996,8 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block: if using_cow and self.refs.has_reference(): values = values.copy() - self = type(self)( - values.T if values.ndim == 2 else values, - self._mgr_locs, - ndim=self.ndim, + self = self.make_block_same_class( + values.T if values.ndim == 2 else values ) values[indexer] = casted @@ -1041,7 +1041,7 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: if using_cow and self.refs.has_reference(): # Do this here to avoid copying twice values = values.copy() - self = type(self)(values, self._mgr_locs, ndim=self.ndim) + self = self.make_block_same_class(values) putmask_without_repeat(values.T, mask, casted) if using_cow: @@ -1467,7 +1467,7 @@ class EABackedBlock(Block): values: ExtensionArray - def setitem(self, indexer, value): + def setitem(self, indexer, value, using_cow: bool = False): """ Attempt self.values[indexer] = value, possibly creating a new array. @@ -1480,6 +1480,8 @@ def setitem(self, indexer, value): The subset of self.values to set value : object The value being set + using_cow: bool, default False + Signaling if CoW is used. Returns ------- @@ -1610,9 +1612,7 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: if using_cow and self.refs.has_reference(): values = values.copy() - self = type(self)( - values.T if values.ndim == 2 else values, self._mgr_locs, ndim=self.ndim - ) + self = self.make_block_same_class(values.T if values.ndim == 2 else values) try: # Caller is responsible for ensuring matching lengths From 01bf0b2b51987d228df9dae6fa16d5ce64ff1a2d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 10 Feb 2023 16:28:48 +0100 Subject: [PATCH 7/7] Fix mypy --- pandas/core/internals/blocks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 634bfc7a52ff7..574e633bc4ba3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1612,7 +1612,9 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: if using_cow and self.refs.has_reference(): values = values.copy() - self = self.make_block_same_class(values.T if values.ndim == 2 else values) + self = self.make_block_same_class( # type: ignore[assignment] + values.T if values.ndim == 2 else values + ) try: # Caller is responsible for ensuring matching lengths