Skip to content
3 changes: 3 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1605,6 +1605,9 @@ def delete(self, loc) -> list[Block]:
values = self.values.delete(loc)
mgr_locs = self._mgr_locs.delete(loc)
return [type(self)(values, placement=mgr_locs, ndim=self.ndim)]
elif self.values.ndim == 1:
# We get here through to_stata
return []
return super().delete(loc)

@cache_readonly
Expand Down
32 changes: 21 additions & 11 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1250,7 +1250,10 @@ def value_getitem(placement):
self._known_consolidated = False

def _iset_split_block(
self, blkno_l: int, blk_locs: np.ndarray, value: ArrayLike | None = None
self,
blkno_l: int,
blk_locs: np.ndarray | list[int],
value: ArrayLike | None = None,
) -> None:
"""Removes columns from a block by splitting the block.

Expand All @@ -1271,12 +1274,8 @@ def _iset_split_block(

nbs_tup = tuple(blk.delete(blk_locs))
if value is not None:
# Argument 1 to "BlockPlacement" has incompatible type "BlockPlacement";
# expected "Union[int, slice, ndarray[Any, Any]]"
first_nb = new_block_2d(
value,
BlockPlacement(blk.mgr_locs[blk_locs]), # type: ignore[arg-type]
)
locs = blk.mgr_locs.as_array[blk_locs]
first_nb = new_block_2d(value, BlockPlacement(locs))
else:
first_nb = nbs_tup[0]
nbs_tup = tuple(nbs_tup[1:])
Expand All @@ -1287,6 +1286,10 @@ def _iset_split_block(
)
self.blocks = blocks_tup

if not nbs_tup and value is not None:
# No need to update anything if split did not happen
return

self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))

for i, nb in enumerate(nbs_tup):
Expand Down Expand Up @@ -1330,11 +1333,18 @@ def column_setitem(
intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
"""
if using_copy_on_write() and not self._has_no_reference(loc):
# otherwise perform Copy-on-Write and clear the reference
blkno = self.blknos[loc]
blocks = list(self.blocks)
blocks[blkno] = blocks[blkno].copy()
self.blocks = tuple(blocks)
# Split blocks to only copy the column we want to modify
blk_loc = self.blklocs[loc]
# Copy our values
values = self.blocks[blkno].values
if values.ndim == 1:
values = values.copy()
else:
# Use [blk_loc] as indexer to keep ndim=2, this already results in a
# copy
values = values[[blk_loc]]
self._iset_split_block(blkno, [blk_loc], values)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need to call this for the values.ndim == 1 case that already copied the values, or does this just also does the "wrapping in block and updating self.blocks" for that case?

And _iset_split_block assumes that values is always already copied, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes and Yes

Not calling it would duplicate the logic


# this manager is only created temporarily to mutate the values in place
# so don't track references, otherwise the `setitem` would perform CoW again
Expand Down
36 changes: 36 additions & 0 deletions pandas/tests/copy_view/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -883,3 +883,39 @@ def test_dataframe_add_column_from_series():
df.loc[2, "new"] = 100
expected_s = Series([0, 11, 12])
tm.assert_series_equal(s, expected_s)


@pytest.mark.parametrize("val", [100, "a"])
@pytest.mark.parametrize(
"indexer_func, indexer",
[
(tm.loc, (0, "a")),
(tm.iloc, (0, 0)),
(tm.loc, ([0], "a")),
(tm.iloc, ([0], 0)),
(tm.loc, (slice(None), "a")),
(tm.iloc, (slice(None), 0)),
],
)
def test_set_value_copy_only_necessary_column(
using_copy_on_write, indexer_func, indexer, val
):
# When setting inplace, only copy column that is modified instead of the whole
# block (by splitting the block)
# TODO multi-block only for now
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
df_orig = df.copy()
view = df[:]

indexer_func(df)[indexer] = val

if using_copy_on_write:
assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
tm.assert_frame_equal(view, df_orig)
else:
assert np.shares_memory(get_array(df, "c"), get_array(view, "c"))
if val == "a":
assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
else:
assert np.shares_memory(get_array(df, "a"), get_array(view, "a"))