Skip to content

Copy on write using weakrefs (part 2) #12036

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
33 changes: 18 additions & 15 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ class DataFrame(NDFrame):
np.arange(n) if no column labels are provided
dtype : dtype, default None
Data type to force, otherwise infer
copy : boolean, default False
copy : boolean, default True
Copy data from inputs. Only affects DataFrame / 2d ndarray input

Examples
Expand Down Expand Up @@ -211,13 +211,17 @@ def _constructor_expanddim(self):

def __init__(self, data=None, index=None, columns=None, dtype=None,
copy=False):

parent = None

if data is None:
data = {}
if dtype is not None:
dtype = self._validate_dtype(dtype)

if isinstance(data, DataFrame):
data = data._data
parent = data
data = data._get_view()._data

if isinstance(data, BlockManager):
mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
Expand Down Expand Up @@ -306,6 +310,9 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,

NDFrame.__init__(self, mgr, fastpath=True)

if parent is not None:
parent._register_new_child(self)

def _init_dict(self, data, index, columns, dtype=None):
"""
Segregate Series based on type and coerce into matrices.
Expand Down Expand Up @@ -1963,8 +1970,10 @@ def __getitem__(self, key):
# shortcut if we are an actual column
is_mi_columns = isinstance(self.columns, MultiIndex)
try:
if key in self.columns and not is_mi_columns:
return self._getitem_column(key)
if key in self.columns:
result = self._getitem_column(key)
result._is_column_view = True
return result
except:
pass

Expand Down Expand Up @@ -2338,7 +2347,6 @@ def __setitem__(self, key, value):
self._set_item(key, value)

def _setitem_slice(self, key, value):
self._check_setitem_copy()
self.ix._setitem_with_indexer(key, value)

def _setitem_array(self, key, value):
Expand All @@ -2349,7 +2357,6 @@ def _setitem_array(self, key, value):
(len(key), len(self.index)))
key = check_bool_indexer(self.index, key)
indexer = key.nonzero()[0]
self._check_setitem_copy()
self.ix._setitem_with_indexer(indexer, value)
else:
if isinstance(value, DataFrame):
Expand All @@ -2359,7 +2366,6 @@ def _setitem_array(self, key, value):
self[k1] = value[k2]
else:
indexer = self.ix._convert_to_indexer(key, axis=1)
self._check_setitem_copy()
self.ix._setitem_with_indexer((slice(None), indexer), value)

def _setitem_frame(self, key, value):
Expand All @@ -2369,7 +2375,6 @@ def _setitem_frame(self, key, value):
raise TypeError('Must pass DataFrame with boolean values only')

self._check_inplace_setting(value)
self._check_setitem_copy()
self.where(-key, value, inplace=True)

def _ensure_valid_index(self, value):
Expand Down Expand Up @@ -2405,11 +2410,6 @@ def _set_item(self, key, value):
value = self._sanitize_column(key, value)
NDFrame._set_item(self, key, value)

# check if we are modifying a copy
# try to set first as we want an invalid
# value exeption to occur first
if len(self):
self._check_setitem_copy()

def insert(self, loc, column, value, allow_duplicates=False):
"""
Expand Down Expand Up @@ -4377,12 +4377,12 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
@Appender(_merge_doc, indents=2)
def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
left_index=False, right_index=False, sort=False,
suffixes=('_x', '_y'), copy=True, indicator=False):
suffixes=('_x', '_y'), indicator=False):
from pandas.tools.merge import merge
return merge(self, right, how=how, on=on,
left_on=left_on, right_on=right_on,
left_index=left_index, right_index=right_index, sort=sort,
suffixes=suffixes, copy=copy, indicator=indicator)
suffixes=suffixes, indicator=indicator)

def round(self, decimals=0, out=None):
"""
Expand Down Expand Up @@ -5227,6 +5227,9 @@ def combineMult(self, other):
FutureWarning, stacklevel=2)
return self.mul(other, fill_value=1.)

def _get_view(self):
return self.loc[:,:]


DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,
axes_are_reversed=True, aliases={'rows': 0})
Expand Down
155 changes: 49 additions & 106 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,18 @@ class NDFrame(PandasObject):
_internal_names = ['_data', '_cacher', '_item_cache', '_cache',
'is_copy', '_subtyp', '_index',
'_default_kind', '_default_fill_value', '_metadata',
'__array_struct__', '__array_interface__']
'__array_struct__', '__array_interface__', '_children',
'_is_column_view', '_original_parent']
_internal_names_set = set(_internal_names)
_accessors = frozenset([])
_metadata = []
is_copy = None

_is_column_view = None
_original_parent = None
_children = None

def __init__(self, data, axes=None, copy=False, dtype=None,
fastpath=False):
fastpath=False, ):

if not fastpath:
if dtype is not None:
Expand All @@ -105,6 +109,10 @@ def __init__(self, data, axes=None, copy=False, dtype=None,
object.__setattr__(self, 'is_copy', None)
object.__setattr__(self, '_data', data)
object.__setattr__(self, '_item_cache', {})
object.__setattr__(self, '_children', weakref.WeakValueDictionary())
object.__setattr__(self, '_is_column_view', False)
object.__setattr__(self, '_original_parent', weakref.WeakValueDictionary())


def _validate_dtype(self, dtype):
""" validate the passed dtype """
Expand Down Expand Up @@ -469,7 +477,8 @@ def transpose(self, *args, **kwargs):
raise TypeError('transpose() got an unexpected keyword '
'argument "{0}"'.format(list(kwargs.keys())[0]))

return self._constructor(new_values, **new_axes).__finalize__(self)
result = self._constructor(new_values, **new_axes).__finalize__(self)
return result.copy()

def swapaxes(self, axis1, axis2, copy=True):
"""
Expand Down Expand Up @@ -1077,13 +1086,16 @@ def get(self, key, default=None):
-------
value : type of items contained in object
"""

try:
return self[key]
except (KeyError, ValueError, IndexError):
return default

def __getitem__(self, item):
return self._get_item_cache(item)
result = self._get_item_cache(item)

return result

def _get_item_cache(self, item):
""" return the cached item, item represents a label indexer """
Expand Down Expand Up @@ -1177,9 +1189,6 @@ def _maybe_update_cacher(self, clear=False, verify_is_copy=True):
except:
pass

if verify_is_copy:
self._check_setitem_copy(stacklevel=5, t='referant')

if clear:
self._clear_item_cache()

Expand All @@ -1204,9 +1213,20 @@ def _slice(self, slobj, axis=0, kind=None):
# but only in a single-dtyped view slicable case
is_copy = axis!=0 or result._is_view
result._set_is_copy(self, copy=is_copy)

self._register_new_child(result)

return result

def _set_item(self, key, value):

if hasattr(self, 'columns'):
if key in self.columns:
# If children are views, reset to copies before setting.
self._execute_copy_on_write()
else:
self._execute_copy_on_write()

self._data.set(key, value)
self._clear_item_cache()

Expand All @@ -1219,104 +1239,22 @@ def _set_is_copy(self, ref=None, copy=True):
else:
self.is_copy = None

def _check_is_chained_assignment_possible(self):
"""
check if we are a view, have a cacher, and are of mixed type
if so, then force a setitem_copy check

should be called just near setting a value

will return a boolean if it we are a view and are cached, but a single-dtype
meaning that the cacher should be updated following setting
"""
if self._is_view and self._is_cached:
ref = self._get_cacher()
if ref is not None and ref._is_mixed_type:
self._check_setitem_copy(stacklevel=4, t='referant', force=True)
return True
elif self.is_copy:
self._check_setitem_copy(stacklevel=4, t='referant')
return False

def _check_setitem_copy(self, stacklevel=4, t='setting', force=False):
"""

Parameters
----------
stacklevel : integer, default 4
the level to show of the stack when the error is output
t : string, the type of setting error
force : boolean, default False
if True, then force showing an error

validate if we are doing a settitem on a chained copy.

If you call this function, be sure to set the stacklevel such that the
user will see the error *at the level of setting*

It is technically possible to figure out that we are setting on
a copy even WITH a multi-dtyped pandas object. In other words, some blocks
may be views while other are not. Currently _is_view will ALWAYS return False
for multi-blocks to avoid having to handle this case.

df = DataFrame(np.arange(0,9), columns=['count'])
df['group'] = 'b'

# this technically need not raise SettingWithCopy if both are view (which is not
# generally guaranteed but is usually True
# however, this is in general not a good practice and we recommend using .loc
df.iloc[0:5]['group'] = 'a'

"""

if force or self.is_copy:

value = config.get_option('mode.chained_assignment')
if value is None:
return

# see if the copy is not actually refererd; if so, then disolve
# the copy weakref
try:
gc.collect(2)
if not gc.get_referents(self.is_copy()):
self.is_copy = None
return
except:
pass

# we might be a false positive
try:
if self.is_copy().shape == self.shape:
self.is_copy = None
return
except:
pass

# a custom message
if isinstance(self.is_copy, string_types):
t = self.is_copy

elif t == 'referant':
t = ("\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame\n\n"
"See the caveats in the documentation: "
"http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy")

else:
t = ("\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame.\n"
"Try using .loc[row_indexer,col_indexer] = value instead\n\n"
"See the caveats in the documentation: "
"http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy")

if value == 'raise':
raise SettingWithCopyError(t)
elif value == 'warn':
warnings.warn(t, SettingWithCopyWarning, stacklevel=stacklevel)

def _execute_copy_on_write(self):

# Don't set on views.
if (self._is_view and not self._is_column_view) or len(self._children) is not 0:
self._data = self._data.copy()
self._children = weakref.WeakValueDictionary()


def _register_new_child(self, view_to_append):
self._children[id(view_to_append)] = view_to_append

if len(self._original_parent) is 0:
view_to_append._original_parent['parent'] = self
else:
self._original_parent['parent']._register_new_child(view_to_append)

def __delitem__(self, key):
"""
Delete item
Expand Down Expand Up @@ -2383,6 +2321,7 @@ def __finalize__(self, other, method=None, **kwargs):
return self

def __getattr__(self, name):

"""After regular attribute access, try looking up the name
This allows simpler access to columns for interactive use.
"""
Expand All @@ -2405,6 +2344,10 @@ def __setattr__(self, name, value):
# e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify
# the same attribute.

if hasattr(self, 'columns'):
if name in self.columns:
self._execute_copy_on_write()

try:
object.__getattribute__(self, name)
return object.__setattr__(self, name, value)
Expand Down
Loading