From 50eab43cd5e5c7d1089a626c8c106d791da7df59 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Wed, 19 Dec 2018 11:28:51 +0100 Subject: [PATCH 1/7] Fix multiindex selection --- doc/whats-new.rst | 3 ++- xarray/core/indexing.py | 3 +++ xarray/tests/test_dataarray.py | 14 ++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 3ef4375c499..6084c77b220 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -57,7 +57,8 @@ Bug fixes By `Martin Raspaud `_. - Fix parsing of ``_Unsigned`` attribute set by OPENDAP servers. (:issue:`2583`). By `Deepak Cherian `_ - +- Fix MultiIndex selection to update label and level (:issue:`2619`). + By `Keisuke Fujii `_. .. _whats-new.0.11.0: diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index d51da471c8d..66b0b743498 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1265,6 +1265,9 @@ def __getitem__(self, indexer): result = self.array[key] if isinstance(result, pd.Index): + # GH2619. For MultiIndex, we need to call remove_unused. + if isinstance(result, pd.MultiIndex): + result = result.remove_unused_levels() result = PandasIndexAdapter(result, dtype=self.dtype) else: # result is a scalar diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index ecb60239b72..0672861da2c 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1027,6 +1027,20 @@ def test_sel(lab_indexer, pos_indexer, replaced_idx=False, assert_identical(mdata.sel(x={'one': 'a', 'two': 1}), mdata.sel(one='a', two=1)) + def test_selection_multiindex(self): + # GH2619. For MultiIndex, we need to call remove_unused. 
+ ds = xr.DataArray(np.arange(40).reshape(8, 5), dims=['x', 'y'], + coords={'x': np.arange(8), 'y': np.arange(5)}) + ds = ds.stack(xy=['x', 'y']) + ds_isel = ds.isel(xy=ds['x'] < 4) + with pytest.raises(KeyError): + ds_isel.sel(x=5) + + actual = ds_isel.unstack() + expected = ds.reset_index('xy').isel(xy=ds['x'] < 4) + expected = expected.set_index(xy=['x', 'y']).unstack() + assert_identical(expected, actual) + def test_virtual_default_coords(self): array = DataArray(np.zeros((5,)), dims='x') expected = DataArray(range(5), dims='x', name='x') From 762f4965954e34a9c61dcec8842cc1c5c40c93ea Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Wed, 19 Dec 2018 22:21:52 +0100 Subject: [PATCH 2/7] Support pandas0.19 --- xarray/core/indexing.py | 9 +++-- xarray/core/pdcompat.py | 77 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 xarray/core/pdcompat.py diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 66b0b743498..4e5e934cc2d 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -4,11 +4,12 @@ import operator from collections import Hashable, defaultdict from datetime import timedelta +from distutils.version import LooseVersion import numpy as np import pandas as pd -from . import duck_array_ops, nputils, utils +from . import duck_array_ops, nputils, pdcompat, utils from .pycompat import ( dask_array_type, integer_types, iteritems, range, suppress) from .utils import is_dict_like @@ -1267,7 +1268,11 @@ def __getitem__(self, indexer): if isinstance(result, pd.Index): # GH2619. For MultiIndex, we need to call remove_unused. 
if isinstance(result, pd.MultiIndex): - result = result.remove_unused_levels() + if LooseVersion(pd.__version__) >= "0.20": + result = result.remove_unused_levels() + else: # for pandas 0.19 + result = pdcompat.remove_unused_levels(result) + result = PandasIndexAdapter(result, dtype=self.dtype) else: # result is a scalar diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py new file mode 100644 index 00000000000..15e7b9ccdd3 --- /dev/null +++ b/xarray/core/pdcompat.py @@ -0,0 +1,77 @@ +import pandas as pd + + +# for pandas 0.19 +def remove_unused_levels(self): + """ + create a new MultiIndex from the current that removing + unused levels, meaning that they are not expressed in the labels + The resulting MultiIndex will have the same outward + appearance, meaning the same .values and ordering. It will also + be .equals() to the original. + .. versionadded:: 0.20.0 + Returns + ------- + MultiIndex + Examples + -------- + >>> i = pd.MultiIndex.from_product([range(2), list('ab')]) + MultiIndex(levels=[[0, 1], ['a', 'b']], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) + >>> i[2:] + MultiIndex(levels=[[0, 1], ['a', 'b']], + labels=[[1, 1], [0, 1]]) + The 0 from the first level is not represented + and can be removed + >>> i[2:].remove_unused_levels() + MultiIndex(levels=[[1], ['a', 'b']], + labels=[[0, 0], [0, 1]]) + """ + + new_levels = [] + new_labels = [] + + changed = False + for lev, lab in zip(self.levels, self.labels): + + # Since few levels are typically unused, bincount() is more + # efficient than unique() - however it only accepts positive values + # (and drops order): + uniques = np.where(np.bincount(lab + 1) > 0)[0] - 1 + has_na = int(len(uniques) and (uniques[0] == -1)) + + if len(uniques) != len(lev) + has_na: + # We have unused levels + changed = True + + # Recalculate uniques, now preserving order. 
+ # Can easily be cythonized by exploiting the already existing + # "uniques" and stop parsing "lab" when all items are found: + uniques = algos.unique(lab) + if has_na: + na_idx = np.where(uniques == -1)[0] + # Just ensure that -1 is in first position: + uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]] + + # labels get mapped from uniques to 0:len(uniques) + # -1 (if present) is mapped to last position + label_mapping = np.zeros(len(lev) + has_na) + # ... and reassigned value -1: + label_mapping[uniques] = np.arange(len(uniques)) - has_na + + lab = label_mapping[lab] + + # new levels are simple + lev = lev.take(uniques[has_na:]) + + new_levels.append(lev) + new_labels.append(lab) + + result = self._shallow_copy() + + if changed: + result._reset_identity() + result._set_levels(new_levels, validate=False) + result._set_labels(new_labels, validate=False) + + return result From 6bb8166b88b4675e2c7c12ae1b993a9f6147d13b Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Wed, 19 Dec 2018 22:36:51 +0100 Subject: [PATCH 3/7] a bugfix --- xarray/core/pdcompat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index 15e7b9ccdd3..987cca3fc59 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd From a806c64b3885bed5fca077d992d6c7e0e9fce227 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Thu, 20 Dec 2018 08:31:16 +0100 Subject: [PATCH 4/7] Do remove_unused_levels only once in unstack. --- xarray/core/dataset.py | 8 +++++++- xarray/core/indexing.py | 14 +++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index b253d956a80..397a0f72fef 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -14,7 +14,7 @@ from . import ( alignment, computation, duck_array_ops, formatting, groupby, indexing, ops, - resample, rolling, utils) + pdcompat, resample, rolling, utils) from .. 
import conventions from ..coding.cftimeindex import _parse_array_of_cftime_strings from .alignment import align @@ -2425,6 +2425,12 @@ def stack(self, dimensions=None, **dimensions_kwargs): def _unstack_once(self, dim): index = self.get_index(dim) + # GH2619. For MultiIndex, we need to call remove_unused. + if LooseVersion(pd.__version__) >= "0.20": + index = index.remove_unused_levels() + else: # for pandas 0.19 + index = pdcompat.remove_unused_levels(index) + full_idx = pd.MultiIndex.from_product(index.levels, names=index.names) # take a shortcut in case the MultiIndex was not modified. diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 4e5e934cc2d..c7329bc6201 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -9,7 +9,7 @@ import numpy as np import pandas as pd -from . import duck_array_ops, nputils, pdcompat, utils +from . import duck_array_ops, nputils, utils from .pycompat import ( dask_array_type, integer_types, iteritems, range, suppress) from .utils import is_dict_like @@ -160,6 +160,10 @@ def convert_label_indexer(index, label, index_name='', method=None, indexer, new_index = index.get_loc_level( tuple(label.values()), level=tuple(label.keys())) + # GH2619. Raise a KeyError if nothing is chosen + if indexer.dtype.kind == 'b' and indexer.sum() == 0: + raise KeyError('{} not found'.format(label)) + elif isinstance(label, tuple) and isinstance(index, pd.MultiIndex): if _is_nested_tuple(label): indexer = index.get_locs(label) @@ -169,7 +173,6 @@ def convert_label_indexer(index, label, index_name='', method=None, indexer, new_index = index.get_loc_level( label, level=list(range(len(label))) ) - else: label = (label if getattr(label, 'ndim', 1) > 1 # vectorized-indexing else _asarray_tuplesafe(label)) @@ -1266,13 +1269,6 @@ def __getitem__(self, indexer): result = self.array[key] if isinstance(result, pd.Index): - # GH2619. For MultiIndex, we need to call remove_unused. 
- if isinstance(result, pd.MultiIndex): - if LooseVersion(pd.__version__) >= "0.20": - result = result.remove_unused_levels() - else: # for pandas 0.19 - result = pdcompat.remove_unused_levels(result) - result = PandasIndexAdapter(result, dtype=self.dtype) else: # result is a scalar From 205f94840066dcee5fd7863d6305c09d7a54cbff Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Thu, 20 Dec 2018 08:33:03 +0100 Subject: [PATCH 5/7] import algos --- xarray/core/pdcompat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index 987cca3fc59..3456a616629 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -1,3 +1,4 @@ import numpy as np import pandas as pd +import pandas.core.algorithms as algos # for pandas 0.19 From b15cab3bd0fbd50e5d471c203f5bb11a694f42f0 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Thu, 20 Dec 2018 08:53:17 +0100 Subject: [PATCH 6/7] Remove unused import --- xarray/core/indexing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index c7329bc6201..7e0418d25b4 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -4,7 +4,6 @@ import operator from collections import Hashable, defaultdict from datetime import timedelta -from distutils.version import LooseVersion import numpy as np import pandas as pd From edb4a24e27ba57a76c8c329393518977e6c21a2d Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Mon, 24 Dec 2018 12:23:59 +0100 Subject: [PATCH 7/7] Adopt local import --- xarray/core/pdcompat.py | 42 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index 3456a616629..c1e153f4d92 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -1,6 +1,45 @@ +# The remove_unused_levels defined here was copied based on the source code +# defined in pandas.core.indexes.multi.py + +# For reference, here is a copy of 
the pandas copyright notice: + +# (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team +# All rights reserved. + +# Copyright (c) 2008-2011 AQR Capital Management, LLC +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: + +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. + +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. + +# * Neither the name of the copyright holder nor the names of any +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + import numpy as np import pandas as pd -import pandas.core.algorithms as algos # for pandas 0.19 @@ -29,6 +68,7 @@ def remove_unused_levels(self): MultiIndex(levels=[[1], ['a', 'b']], labels=[[0, 0], [0, 1]]) """ + import pandas.core.algorithms as algos new_levels = [] new_labels = []