diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5f93e08d51baa..7db790a0fddd7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -287,6 +287,7 @@ Other enhancements - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the object inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`) - :class:`Series.dt` and :class:`DatatimeIndex` now have an `isocalendar` method that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`, :issue:`34392`). +- The :class:`DataFrame` constructor now uses ``copy`` for dict-inputs to control whether copies of the arrays are made, rather than ignoring it (:issue:`32960`) - The :meth:`DataFrame.to_feather` method now supports additional keyword arguments (e.g. to set the compression) that are added in pyarrow 0.17 (:issue:`33422`). 
diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 6c58698989e96..f9a111c14a666 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -390,7 +390,7 @@ def sanitize_array( data, index: Optional["Index"], dtype: Optional[DtypeObj] = None, - copy: bool = False, + copy: Optional[bool] = False, raise_cast_failure: bool = False, ) -> ArrayLike: """ @@ -412,6 +412,9 @@ def sanitize_array( # GH#846 if isinstance(data, np.ndarray): + if copy is None: + # copy by default for DataFrame({"A": ndarray}) + copy = True if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype): # possibility of nan -> garbage @@ -428,15 +431,20 @@ def sanitize_array( elif isinstance(data, ABCExtensionArray): # it is already ensured above this is not a PandasArray + # no copy by default for DataFrame({"A": extension_array}) + if copy is None: + copy = False subarr = data if dtype is not None: subarr = subarr.astype(dtype, copy=copy) elif copy: + # no copy by default from DataFrame.__init__ subarr = subarr.copy() return subarr elif isinstance(data, (list, tuple)) and len(data) > 0: + copy = bool(copy) # None -> False if dtype is not None: subarr = _try_cast(data, dtype, copy, raise_cast_failure) else: @@ -446,16 +454,19 @@ def sanitize_array( elif isinstance(data, range): # GH#16804 + copy = bool(copy) # None -> False arr = np.arange(data.start, data.stop, data.step, dtype="int64") subarr = _try_cast(arr, dtype, copy, raise_cast_failure) elif isinstance(data, abc.Set): raise TypeError("Set type is unordered") elif lib.is_scalar(data) and index is not None and dtype is not None: + copy = bool(copy) # None -> False data = maybe_cast_to_datetime(data, dtype) if not lib.is_scalar(data): data = data[0] subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype) else: + copy = bool(copy) # None -> False subarr = _try_cast(data, dtype, copy, raise_cast_failure) # scalar like, GH diff --git a/pandas/core/frame.py 
b/pandas/core/frame.py index cfe5621fec14e..0502a459dd921 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -364,8 +364,22 @@ class DataFrame(NDFrame): RangeIndex (0, 1, 2, ..., n) if no column labels are provided. dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. - copy : bool, default False - Copy data from inputs. Only affects DataFrame / 2d ndarray input. + copy : bool, optional + Copy data from inputs. This only applies for specific types of `data` + and the default behavior depends on `data`. + + * `data` is a DataFrame or 2D NumPy array: *no* copy by default. + Specifying ``copy=True`` will copy the data. + * `data` is a dict: + By default arrays in `data` with NumPy dtypes are + copied, while extension types are not. Specifying ``copy=True`` + will copy all of the values, and ``copy=False`` will attempt to + not copy the data. Note that if `data` has more than one value with + the same NumPy dtype then the data will be copied, regardless of + the value of `copy`. + + For all other types of `data`, zero-copy construction cannot be ensured + and `copy` has no effect. 
See Also -------- @@ -441,7 +455,7 @@ def __init__( index: Optional[Axes] = None, columns: Optional[Axes] = None, dtype: Optional[Dtype] = None, - copy: bool = False, + copy: Optional[bool] = None, ): if data is None: data = {} @@ -452,6 +466,7 @@ def __init__( data = data._mgr if isinstance(data, BlockManager): + copy = bool(copy) # None -> False if index is None and columns is None and dtype is None and copy is False: # GH#33357 fastpath NDFrame.__init__(self, data) @@ -462,10 +477,12 @@ def __init__( ) elif isinstance(data, dict): - mgr = init_dict(data, index, columns, dtype=dtype) + mgr = init_dict(data, index, columns, dtype=dtype, copy=copy) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords + copy = bool(copy) # None -> False + # masked recarray if isinstance(data, mrecords.MaskedRecords): mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy) @@ -482,6 +499,7 @@ def __init__( mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) elif isinstance(data, (np.ndarray, Series, Index)): + copy = bool(copy) # None -> False if data.dtype.names: data_columns = list(data.dtype.names) data = {k: data[k] for k in data_columns} @@ -495,6 +513,7 @@ def __init__( # For data is list-like, or Iterable (will consume into list) elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)): + copy = bool(copy) # None -> False if not isinstance(data, (abc.Sequence, ExtensionArray)): data = list(data) if len(data) > 0: @@ -538,6 +557,7 @@ def __init__( mgr = arrays_to_mgr(values, columns, index, columns, dtype=None) else: # Attempt to coerce to a numpy array + copy = bool(copy) # None -> False try: arr = np.array(data, dtype=dtype, copy=copy) except (ValueError, TypeError) as err: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 4b9db810dead0..cb1a045735381 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -64,6 +64,7 @@ def 
arrays_to_mgr( columns, dtype: Optional[DtypeObj] = None, verify_integrity: bool = True, + copy: Optional[bool] = False, ): """ Segregate Series based on type and coerce into matrices. @@ -80,7 +81,7 @@ def arrays_to_mgr( index = ensure_index(index) # don't force copy because getting jammed in an ndarray anyway - arrays = _homogenize(arrays, index, dtype) + arrays = _homogenize(arrays, index, dtype, copy=copy) columns = ensure_index(columns) else: @@ -234,7 +235,13 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): return create_block_manager_from_blocks(block_values, [columns, index]) -def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): +def init_dict( + data: Dict, + index, + columns, + dtype: Optional[DtypeObj] = None, + copy: Optional[bool] = False, +): """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. @@ -280,7 +287,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): arrays = [ arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays ] - return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) + return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, copy=copy) # --------------------------------------------------------------------- @@ -326,14 +333,16 @@ def convert(v): return values -def _homogenize(data, index, dtype: Optional[DtypeObj]): +def _homogenize(data, index, dtype: Optional[DtypeObj], copy: Optional[bool] = False): oindex = None homogenized = [] for val in data: if isinstance(val, ABCSeries): if dtype is not None: - val = val.astype(dtype) + val = val.astype(dtype, copy=copy) + elif copy: + val = val.copy() if val.index is not index: # Forces alignment. 
No need to copy data since we # are putting it into an ndarray later @@ -349,7 +358,7 @@ def _homogenize(data, index, dtype: Optional[DtypeObj]): val = dict(val) val = lib.fast_multiget(val, oindex._values, default=np.nan) val = sanitize_array( - val, index, dtype=dtype, copy=False, raise_cast_failure=False + val, index, dtype=dtype, copy=copy, raise_cast_failure=False ) homogenized.append(val) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c82670106d3b6..1b19daeda405b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1822,10 +1822,13 @@ def _shape_compat(x): first = arrays[0] shape = (len(arrays),) + _shape_compat(first) - - stacked = np.empty(shape, dtype=dtype) - for i, arr in enumerate(arrays): - stacked[i] = _asarray_compat(arr) + if len(arrays) == 1: + # allow for 0-copy construction from dict + stacked = _asarray_compat(first).reshape(shape) + else: + stacked = np.empty(shape, dtype=dtype) + for i, arr in enumerate(arrays): + stacked[i] = _asarray_compat(arr) return stacked, placement diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index bfff58d05007f..0faa59f9dcf45 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1941,11 +1941,15 @@ def test_constructor_ndarray_copy(self, float_frame): def test_constructor_series_copy(self, float_frame): series = float_frame._series - df = DataFrame({"A": series["A"]}) + df = DataFrame({"A": series["A"]}, copy=True) df["A"][:] = 5 assert not (series["A"] == 5).all() + df = DataFrame({"A": series["A"]}) # no copy by default + df["A"][:] = 5 + assert (series["A"] == 5).all() + def test_constructor_with_nas(self): # GH 5016 # na's in indices @@ -2746,3 +2750,23 @@ def test_construction_from_set_raises(self): msg = "Set type is unordered" with pytest.raises(TypeError, match=msg): pd.DataFrame({"a": {1, 2, 3}}) + + @pytest.mark.parametrize("copy", 
[None, False, True]) + def test_dict_nocopy(self, copy): + a = np.array([1, 2]) + b = pd.array([1, 2]) + df = pd.DataFrame({"a": a, "b": b}, copy=copy) + df.iloc[0, 0] = 0 + df.iloc[0, 1] = 0 + + if copy is None: + # copy for ndarray, no copy for EA + assert a[0] == 1 + assert b[0] == 0 + + elif copy: + assert a[0] == 1 + assert b[0] == 1 + else: + assert a[0] == 0 + assert b[0] == 0