From 0fab6a9536a6369e54f65eec2d410873d14cc333 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 23 May 2018 19:23:00 -0700 Subject: [PATCH 1/6] fix hashing string-casting error --- pandas/_libs/hashing.pyx | 7 ++----- pandas/tests/series/test_repr.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index c6f182ac5003f..4489847518a1d 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -8,8 +8,7 @@ import numpy as np from numpy cimport ndarray, uint8_t, uint32_t, uint64_t from util cimport _checknull -from cpython cimport (PyString_Check, - PyBytes_Check, +from cpython cimport (PyBytes_Check, PyUnicode_Check) from libc.stdlib cimport malloc, free @@ -62,9 +61,7 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): cdef list datas = [] for i in range(n): val = arr[i] - if PyString_Check(val): - data = val.encode(encoding) - elif PyBytes_Check(val): + if PyBytes_Check(val): data = val elif PyUnicode_Check(val): data = val.encode(encoding) diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 97236f028b1c4..22152034e22bc 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -5,6 +5,7 @@ import sys +import pytest import numpy as np import pandas as pd @@ -202,6 +203,34 @@ def test_latex_repr(self): class TestCategoricalRepr(object): + @pytest.mark.skipif(compat.PY3, reason="Decoding failure only in PY2") + def test_categorical_repr_unicode(self): + # GH#21002 if len(index) > 60, sys.getdefaultencoding()=='ascii', + # and we are working in PY2, then rendering a Categorical could raise + # UnicodeDecodeError by trying to decode when it shouldn't + from pandas.core.base import StringMixin + + class County(StringMixin): + name = u'San Sebastián' + state = u'PR' + def __unicode__(self): + return self.name + u', ' + self.state + + cat = pd.Categorical([County() for n in range(61)]) + idx = pd.Index(cat) + ser = idx.to_series() + + # set sys.defaultencoding to ascii, then change it back after the test + enc = sys.getdefaultencoding() + reload(sys) + sys.setdefaultencoding('ascii') + try: + repr(ser) + str(ser) + finally: + # restore encoding + sys.setdefaultencoding(enc) + def test_categorical_repr(self): a = Series(Categorical([1, 2, 3, 4])) exp = u("0 1\n1 2\n2 3\n3 4\n" + From 24a0f594aea1330953de08e69fec1a2defc5dd5f Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 23 May 2018 19:24:58 -0700 Subject: [PATCH 2/6] flake8 fixup --- pandas/tests/series/test_repr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 22152034e22bc..a915f7de3f39f 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -213,6 +213,7 @@ def test_categorical_repr_unicode(self): class County(StringMixin): name = u'San Sebastián' state = u'PR' + def __unicode__(self): return self.name + u', ' + self.state @@ -222,7 +223,7 @@ def __unicode__(self): # set sys.defaultencoding to ascii, then change it back after the test enc = sys.getdefaultencoding() - reload(sys) + reload(sys) # noqa:F821 sys.setdefaultencoding('ascii') try: repr(ser) From 279a6e114432ce1f2c3ac489f392b5f5b8fbb9ce Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Mon, 11 Jun 2018 18:23:06 -0700 Subject: [PATCH 3/6] add test in py3, whatsnew note in 0.23.1 --- doc/source/whatsnew/v0.23.1.txt | 1 + pandas/tests/series/test_repr.py | 25 ++++++++++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index db25bcf8113f5..0f4116fa961c5 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -100,6 +100,7 @@ Bug Fixes - Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue: `21078`) - Bug in :class:`Timedelta`: where passing a float with a unit would prematurely round the float precision (:issue: `14156`) - Bug in :func:`pandas.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) +- Bug in rendering :class:`Series` with ``Categorical`` dtype in rare conditions under Python 2.7 (:issue:`21002`) **Sparse** diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index a915f7de3f39f..baf6ab69caaa5 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -203,7 +203,6 @@ def test_latex_repr(self): class TestCategoricalRepr(object): - @pytest.mark.skipif(compat.PY3, reason="Decoding failure only in PY2") def test_categorical_repr_unicode(self): # GH#21002 if len(index) > 60, sys.getdefaultencoding()=='ascii', # and we are working in PY2, then rendering a Categorical could raise @@ -221,16 +220,24 @@ def __unicode__(self): idx = pd.Index(cat) ser = idx.to_series() - # set sys.defaultencoding to ascii, then change it back after the test - enc = sys.getdefaultencoding() - reload(sys) # noqa:F821 - sys.setdefaultencoding('ascii') - try: + if compat.PY3: + # no reloading of sys, just check that the default (utf8) works + # as expected repr(ser) str(ser) - finally: - # restore encoding - sys.setdefaultencoding(enc) + + else: + # set sys.defaultencoding to ascii, then change it back after + # the test + enc = sys.getdefaultencoding() + reload(sys) # noqa:F821 + sys.setdefaultencoding('ascii') + try: + repr(ser) + str(ser) + finally: + # restore encoding + sys.setdefaultencoding(enc) def test_categorical_repr(self): a = Series(Categorical([1, 2, 3, 4])) From 3f8e9b223cda26742f65dbcd4839f01ad2238452 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 12 Jun 2018 09:45:10 -0700 Subject: [PATCH 4/6] fixup remove unused import --- pandas/tests/series/test_repr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index baf6ab69caaa5..6577e9e4b4a9d 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -5,7 +5,6 @@ import sys -import pytest import numpy as np import pandas as pd From 7f1201310471146155ab381781f95ce25225d0e6 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Mon, 18 Jun 2018 19:26:58 -0700 Subject: [PATCH 5/6] make set_defaultencoding context --- pandas/tests/series/test_repr.py | 8 +------- pandas/util/testing.py | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 6577e9e4b4a9d..38a8bf896cd5b 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -228,15 +228,9 @@ def __unicode__(self): else: # set sys.defaultencoding to ascii, then change it back after # the test - enc = sys.getdefaultencoding() - reload(sys) # noqa:F821 - sys.setdefaultencoding('ascii') - try: + with tm.set_defaultencoding('ascii'): repr(ser) str(ser) - finally: - # restore encoding - sys.setdefaultencoding(enc) def test_categorical_repr(self): a = Series(Categorical([1, 2, 3, 4])) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index d26a2116fb3ce..b9e53dfc80020 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -553,6 +553,28 @@ def _valid_locales(locales, normalize): # Stdout / stderr decorators +@contextmanager +def set_defaultencoding(encoding): + """ + Set default encoding (as given by sys.getdefaultencoding()) to the given + encoding; restore on exit. + + Parameters + ---------- + encoding : str + """ + if not PY2: + raise ValueError("set_defaultencoding context is only available " + "in Python 2.") + orig = sys.getdefaultencoding() + reload(sys) # noqa:F821 + sys.setdefaultencoding(encoding) + try: + yield + finally: + sys.setdefaultencoding(orig) + + def capture_stdout(f): """ Decorator to capture stdout in a buffer so that it can be checked From 3b91c000a9e8ce13bf958a2a96f64be5bdde7adb Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Mon, 18 Jun 2018 19:28:25 -0700 Subject: [PATCH 6/6] Move note to 0.23.2 --- doc/source/whatsnew/v0.23.1.txt | 1 - doc/source/whatsnew/v0.23.2.txt | 1 + pandas/tests/series/test_repr.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index fce65c840914d..af4eeffd87d01 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -100,7 +100,6 @@ Bug Fixes - Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue:`21078`) - Bug in :class:`Timedelta` where passing a float with a unit would prematurely round the float precision (:issue:`14156`) - Bug in :func:`pandas.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) -- Bug in rendering :class:`Series` with ``Categorical`` dtype in rare conditions under Python 2.7 (:issue:`21002`) **Sparse** diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 67c7ce150132a..9f06e1c729b8f 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -79,6 +79,7 @@ Bug Fixes **Categorical** +- Bug in rendering :class:`Series` with ``Categorical`` dtype in rare conditions under Python 2.7 (:issue:`21002`) - **Timezones** diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 38a8bf896cd5b..730c2b7865f1f 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -11,6 +11,7 @@ from pandas import (Index, Series, DataFrame, date_range, option_context, Categorical, period_range, timedelta_range) from pandas.core.index import MultiIndex +from pandas.core.base import StringMixin from pandas.compat import lrange, range, u from pandas import compat @@ -206,7 +207,6 @@ def test_categorical_repr_unicode(self): # GH#21002 if len(index) > 60, sys.getdefaultencoding()=='ascii', # and we are working in PY2, then rendering a Categorical could raise # UnicodeDecodeError by trying to decode when it shouldn't - from pandas.core.base import StringMixin class County(StringMixin): name = u'San Sebastián'