diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index ec002fae3b4b9..2071ec2985748 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -346,6 +346,7 @@ Backwards incompatible API changes - ``DataFrame.round()`` leaves non-numeric columns unchanged in its return, rather than raises. (:issue:`11885`) - ``DataFrame.head(0)`` and ``DataFrame.tail(0)`` return empty frames, rather than ``self``. (:issue:`11937`) - ``Series.head(0)`` and ``Series.tail(0)`` return empty series, rather than ``self``. (:issue:`11937`) +- ``to_msgpack`` and ``read_msgpack`` encoding now defaults to ``'utf-8'``. (:issue:`12170`) NaT and Timedelta operations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7ae00fb501614..a746a93c3dc16 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -939,7 +939,7 @@ def to_hdf(self, path_or_buf, key, **kwargs): from pandas.io import pytables return pytables.to_hdf(path_or_buf, key, self, **kwargs) - def to_msgpack(self, path_or_buf=None, **kwargs): + def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs): """ msgpack (serialize) object to input file path @@ -957,7 +957,8 @@ def to_msgpack(self, path_or_buf=None, **kwargs): """ from pandas.io import packers - return packers.to_msgpack(path_or_buf, self, **kwargs) + return packers.to_msgpack(path_or_buf, self, encoding=encoding, + **kwargs) def to_sql(self, name, con, flavor='sqlite', schema=None, if_exists='fail', index=True, index_label=None, chunksize=None, dtype=None): diff --git a/pandas/io/packers.py b/pandas/io/packers.py index a16f3600736b8..33310893a4a66 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -75,6 +75,7 @@ def to_msgpack(path_or_buf, *args, **kwargs): path_or_buf : string File path, buffer-like, or None if None, return generated string args : an object or objects to serialize + encoding : encoding for unicode objects append : boolean 
whether to append to an existing msgpack (default is False) compress : type of compressor (zlib or blosc), default to None (no @@ -103,7 +104,7 @@ def writer(fh): writer(path_or_buf) -def read_msgpack(path_or_buf, iterator=False, **kwargs): +def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs): """ Load msgpack pandas object from the specified file path @@ -114,6 +115,7 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs): Parameters ---------- path_or_buf : string File path, BytesIO like or string + encoding : Encoding for decoding msgpack str type iterator : boolean, if True, return an iterator to the unpacker (default is False) @@ -127,7 +129,7 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs): return Iterator(path_or_buf) def read(fh): - l = list(unpack(fh, **kwargs)) + l = list(unpack(fh, encoding=encoding, **kwargs)) if len(l) == 1: return l[0] return l @@ -573,7 +575,7 @@ def create_block(b): def pack(o, default=encode, - encoding='latin1', unicode_errors='strict', use_single_float=False, + encoding='utf-8', unicode_errors='strict', use_single_float=False, autoreset=1, use_bin_type=1): """ Pack an object and return the packed bytes. 
@@ -587,7 +589,7 @@ def pack(o, default=encode, def unpack(packed, object_hook=decode, - list_hook=None, use_list=False, encoding='latin1', + list_hook=None, use_list=False, encoding='utf-8', unicode_errors='strict', object_pairs_hook=None, max_buffer_size=0, ext_hook=ExtType): """ @@ -607,7 +609,7 @@ def unpack(packed, object_hook=decode, class Packer(_Packer): def __init__(self, default=encode, - encoding='latin1', + encoding='utf-8', unicode_errors='strict', use_single_float=False, autoreset=1, @@ -624,7 +626,7 @@ class Unpacker(_Unpacker): def __init__(self, file_like=None, read_size=0, use_list=False, object_hook=decode, - object_pairs_hook=None, list_hook=None, encoding='latin1', + object_pairs_hook=None, list_hook=None, encoding='utf-8', unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType): super(Unpacker, self).__init__(file_like=file_like, read_size=read_size, diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 6905225600ae6..4bb34e276e81c 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -299,11 +299,8 @@ def test_multi_index(self): def test_unicode(self): i = tm.makeUnicodeIndex(100) - # this currently fails - self.assertRaises(UnicodeEncodeError, self.encode_decode, i) - - # i_rec = self.encode_decode(i) - # self.assertTrue(i.equals(i_rec)) + i_rec = self.encode_decode(i) + self.assertTrue(i.equals(i_rec)) class TestSeries(TestPackers): @@ -615,6 +612,14 @@ def test_utf(self): result = self.encode_decode(frame, encoding=encoding) assert_frame_equal(result, frame) + def test_default_encoding(self): + for frame in compat.itervalues(self.frame): + result = frame.to_msgpack() + expected = frame.to_msgpack(encoding='utf8') + self.assertEqual(result, expected) + result = self.encode_decode(frame) + assert_frame_equal(result, frame) + class TestMsgpack(): """ @@ -652,7 +657,11 @@ def check_min_structure(self, data): typ], '"{0}" not found in data["{1}"]'.format(kind, typ) def 
compare(self, vf, version): - data = read_msgpack(vf) + # GH12277 encoding default used to be latin-1, now utf-8 + if LooseVersion(version) < '0.18.0': + data = read_msgpack(vf, encoding='latin-1') + else: + data = read_msgpack(vf) self.check_min_structure(data) for typ, dv in data.items(): assert typ in self.all_data, ('unpacked data contains '