From fdd3ce9360b4d0f8238d341e2bad5f5bd0b688ca Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Thu, 31 May 2018 22:08:07 +0100
Subject: [PATCH 01/19] handle encoding type

---
 pandas/io/formats/csvs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 0be2a180fbfa2..aeb2c52aa6e0c 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -154,8 +154,8 @@ def save(self):
             # GH 17778 handles compression for byte strings.
             if not close and self.compression:
                 f.close()
-                with open(f.name, 'r') as f:
-                    data = f.read()
+                with open(f.name, 'rb') as f:
+                    data = f.read().decode(encoding)
                 f, handles = _get_handle(f.name, self.mode,
                                          encoding=encoding,
                                          compression=self.compression)

From 826aa2c8a44ef82464443a26efe25bfb976b5b5d Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Sat, 2 Jun 2018 17:26:50 +0100
Subject: [PATCH 02/19] enrich comment

---
 pandas/io/formats/csvs.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index aeb2c52aa6e0c..bb5613c6448a4 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -151,7 +151,8 @@ def save(self):
             self._save()
 
         finally:
-            # GH 17778 handles compression for byte strings.
+            # GH 17778 handles zip compression for byte strings separately to
+            # support Python 2
             if not close and self.compression:
                 f.close()
                 with open(f.name, 'rb') as f:

From fac53c071a54e64701aeac749fea49040267d486 Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Sat, 2 Jun 2018 21:25:15 +0100
Subject: [PATCH 03/19] add encoding fixture

---
 pandas/tests/series/test_io.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
index e369dfda6deac..6d91444cb50c8 100644
--- a/pandas/tests/series/test_io.py
+++ b/pandas/tests/series/test_io.py
@@ -10,7 +10,7 @@
 from pandas import Series, DataFrame
 
-from pandas.compat import StringIO, u
+from pandas.compat import StringIO, u, PY2
 from pandas.util.testing import (assert_series_equal, assert_almost_equal,
                                  assert_frame_equal, ensure_clean)
 import pandas.util.testing as tm
 
@@ -137,29 +137,35 @@ def test_to_csv_path_is_none(self):
         csv_str = s.to_csv(path=None)
         assert isinstance(csv_str, str)
 
-    def test_to_csv_compression(self, compression):
-
-        s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
-                   name='X')
+    @pytest.mark.parametrize('s, encoding', [
+        (Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
+                name='X'), 'utf-8'),
+        (Series(['abc', 'def', 'ghi'], name='X'), 'ascii'),
+        (Series(["123", u"你好", u"世界"], name=u"中文"), 'gb2312'),
+        (Series(["123", u"Γειά σου", u"Κόσμε"], name=u"Ελληνικά"), 'cp737')
+    ])
+    def test_to_csv_compression(self, s, encoding, compression):
 
         with ensure_clean() as filename:
 
-            s.to_csv(filename, compression=compression, header=True)
+            s.to_csv(filename, compression=compression, encoding=encoding,
+                     header=True)
 
             # test the round trip - to_csv -> read_csv
             rs = pd.read_csv(filename, compression=compression,
-                             index_col=0, squeeze=True)
+                             encoding=encoding, index_col=0, squeeze=True)
            assert_series_equal(s, rs)
 
             # explicitly ensure file was compressed
             with tm.decompress_file(filename, compression) as fh:
-                text = fh.read().decode('utf8')
+                text = fh.read().decode(encoding)
                 assert s.name in text
 
             with tm.decompress_file(filename, compression) as fh:
                 assert_series_equal(s, pd.read_csv(fh, index_col=0,
-                                                   squeeze=True))
+                                                   squeeze=True,
+                                                   encoding=encoding))
 
 
 class TestSeriesIO(TestData):

From a4de620c0da8c2479903f8304c8d3e598bc1ff55 Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Sat, 2 Jun 2018 21:31:31 +0100
Subject: [PATCH 04/19] remove redundant import

---
 pandas/tests/series/test_io.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
index 6d91444cb50c8..ea48abad97120 100644
--- a/pandas/tests/series/test_io.py
+++ b/pandas/tests/series/test_io.py
@@ -10,7 +10,7 @@
 from pandas import Series, DataFrame
 
-from pandas.compat import StringIO, u, PY2
+from pandas.compat import StringIO, u
 from pandas.util.testing import (assert_series_equal, assert_almost_equal,
                                  assert_frame_equal, ensure_clean)
 import pandas.util.testing as tm
 

From b833abd601b722318056684b29d7094093d04605 Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Sat, 2 Jun 2018 21:33:12 +0100
Subject: [PATCH 05/19] add encoding fixture

---
 pandas/tests/frame/test_to_csv.py | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
index e4829ebf48561..44cb614edb264 100644
--- a/pandas/tests/frame/test_to_csv.py
+++ b/pandas/tests/frame/test_to_csv.py
@@ -919,29 +919,37 @@ def test_to_csv_path_is_none(self):
         recons = pd.read_csv(StringIO(csv_str), index_col=0)
         assert_frame_equal(self.frame, recons)
 
-    def test_to_csv_compression(self, compression):
-
-        df = DataFrame([[0.123456, 0.234567, 0.567567],
-                        [12.32112, 123123.2, 321321.2]],
-                       index=['A', 'B'], columns=['X', 'Y', 'Z'])
+    @pytest.mark.parametrize('frame, encoding', [
+        (DataFrame([[0.123456, 0.234567, 0.567567],
+                    [12.32112, 123123.2, 321321.2]],
+                   index=['A', 'B'], columns=['X', 'Y', 'Z']), 'utf-8'),
+        (DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'),
+        (DataFrame(5 * [[123, u"你好", u"世界"]],
+                   columns=['X', 'Y', 'Z']), 'gb2312'),
+        (DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]],
+                   columns=['X', 'Y', 'Z']), 'cp737')
+    ])
+    def test_to_csv_compression(self, frame, encoding, compression):
 
         with ensure_clean() as filename:
 
-            df.to_csv(filename, compression=compression)
+            frame.to_csv(filename, compression=compression, encoding=encoding)
 
             # test the round trip - to_csv -> read_csv
             rs = read_csv(filename, compression=compression,
-                          index_col=0)
-            assert_frame_equal(df, rs)
+                          index_col=0, encoding=encoding)
+            assert_frame_equal(frame, rs)
 
             # explicitly make sure file is compressed
             with tm.decompress_file(filename, compression) as fh:
-                text = fh.read().decode('utf8')
-                for col in df.columns:
+                text = fh.read().decode(encoding)
+                for col in frame.columns:
                     assert col in text
 
             with tm.decompress_file(filename, compression) as fh:
-                assert_frame_equal(df, read_csv(fh, index_col=0))
+                assert_frame_equal(frame, read_csv(fh,
+                                                   index_col=0,
+                                                   encoding=encoding))
 
     def test_to_csv_date_format(self):
         with ensure_clean('__tmp_to_csv_date_format__') as path:

From b625f080d6d9f145d101baf28b4b632a66e020f5 Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Sun, 3 Jun 2018 10:48:06 +0100
Subject: [PATCH 06/19] handle PY2 differently

---
 pandas/io/formats/csvs.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index bb5613c6448a4..8dec48402395c 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -152,11 +152,15 @@ def save(self):
 
         finally:
             # GH 17778 handles zip compression for byte strings separately to
-            # support Python 2
+            # support Python 2, also allow compression file handle
             if not close and self.compression:
                 f.close()
-                with open(f.name, 'rb') as f:
-                    data = f.read().decode(encoding)
+                if compat.PY2:
+                    _fh = open(f.name, 'r')
+                else:
+                    _fh = open(f.name, 'r', encoding=encoding)
+                with _fh:
+                    data = _fh.read()
                 f, handles = _get_handle(f.name, self.mode,
                                          encoding=encoding,
                                          compression=self.compression)

From 9d5c25be0f8ad77c2d504e0c0b6b480435c33f1a Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Sun, 3 Jun 2018 11:11:16 +0100
Subject: [PATCH 07/19] restore original test case

---
 pandas/tests/frame/test_to_csv.py | 4 ++--
 pandas/tests/series/test_io.py    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
index 44cb614edb264..44fa5468f03b2 100644
--- a/pandas/tests/frame/test_to_csv.py
+++ b/pandas/tests/frame/test_to_csv.py
@@ -922,7 +922,7 @@ def test_to_csv_path_is_none(self):
     @pytest.mark.parametrize('frame, encoding', [
         (DataFrame([[0.123456, 0.234567, 0.567567],
                     [12.32112, 123123.2, 321321.2]],
-                   index=['A', 'B'], columns=['X', 'Y', 'Z']), 'utf-8'),
+                   index=['A', 'B'], columns=['X', 'Y', 'Z']), None),
         (DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'),
         (DataFrame(5 * [[123, u"你好", u"世界"]],
                    columns=['X', 'Y', 'Z']), 'gb2312'),
@@ -942,7 +942,7 @@ def test_to_csv_compression(self, frame, encoding, compression):
 
             # explicitly make sure file is compressed
             with tm.decompress_file(filename, compression) as fh:
-                text = fh.read().decode(encoding)
+                text = fh.read().decode(encoding or 'utf8')
                 for col in frame.columns:
                     assert col in text
 
diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
index ea48abad97120..7ecf26786742a 100644
--- a/pandas/tests/series/test_io.py
+++ b/pandas/tests/series/test_io.py
@@ -139,7 +139,7 @@ def test_to_csv_path_is_none(self):
 
     @pytest.mark.parametrize('s, encoding', [
         (Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
-                name='X'), 'utf-8'),
+                name='X'), None),
         (Series(['abc', 'def', 'ghi'], name='X'), 'ascii'),
         (Series(["123", u"你好", u"世界"], name=u"中文"), 'gb2312'),
         (Series(["123", u"Γειά σου", u"Κόσμε"], name=u"Ελληνικά"), 'cp737')
@@ -158,7 +158,7 @@ def test_to_csv_compression(self, s, encoding, compression):
 
             # explicitly ensure file was compressed
             with tm.decompress_file(filename, compression) as fh:
-                text = fh.read().decode(encoding)
+                text = fh.read().decode(encoding or 'utf8')
                 assert s.name in text
 
             with tm.decompress_file(filename, compression) as fh:

From 8ed6fa2e3c86dc58023db6f0de72ae9a03437329 Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Sun, 3 Jun 2018 14:26:04 +0100
Subject: [PATCH 08/19] update whatsnew

---
 doc/source/whatsnew/v0.23.1.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index f2bc81eea186b..223073271033a 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -92,6 +92,7 @@ I/O
 
 - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
 - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
+- Bug in :meth:`DataFrame.to_csv` using compression causes encoding error (:issue:`21241`)
 -
 
 Plotting

From 2d48d1047249cf6cdf62205bafab80781b5a5693 Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Sun, 3 Jun 2018 15:53:46 +0100
Subject: [PATCH 09/19] default bytes strings unless requires unicode string

---
 pandas/io/formats/csvs.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 8dec48402395c..ffed1c0c30477 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -155,12 +155,10 @@ def save(self):
             # support Python 2, also allow compression file handle
             if not close and self.compression:
                 f.close()
-                if compat.PY2:
-                    _fh = open(f.name, 'r')
-                else:
-                    _fh = open(f.name, 'r', encoding=encoding)
-                with _fh:
-                    data = _fh.read()
+                with open(f.name, 'rb') as f:
+                    data = f.read()
+                if not compat.PY2:
+                    data = data.decode(encoding)
                 f, handles = _get_handle(f.name, self.mode,
                                          encoding=encoding,
                                          compression=self.compression)

From 4a6f5ff0929bec9e774ec82b1ee1f82a8e2034bb Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Sun, 3 Jun 2018 23:04:57 +0100
Subject: [PATCH 10/19] assert filehandle open

---
 pandas/tests/test_common.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index 88e469731060d..7034e9ac2e0c8 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -252,12 +252,13 @@ def test_compression_size_fh(obj, method, compression_only):
     with tm.ensure_clean() as filename:
         with open(filename, 'w') as fh:
             getattr(obj, method)(fh, compression=compression_only)
-            # GH 17778
-            assert fh.closed
+            assert not fh.closed
+        assert fh.closed
         compressed = os.path.getsize(filename)
     with tm.ensure_clean() as filename:
         with open(filename, 'w') as fh:
             getattr(obj, method)(fh, compression=None)
             assert not fh.closed
+        assert fh.closed
         uncompressed = os.path.getsize(filename)
     assert uncompressed > compressed

From bf4225cca75933532748ffc131e3b30d435cbdb5 Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Sun, 3 Jun 2018 23:07:16 +0100
Subject: [PATCH 11/19] use buffer and avoid roundtrip and encoding error.

---
 pandas/io/formats/csvs.py | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index ffed1c0c30477..3a13f5e90e35d 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -127,14 +127,18 @@ def save(self):
         else:
             encoding = self.encoding
 
-        if hasattr(self.path_or_buf, 'write'):
-            f = self.path_or_buf
-            close = False
+        # PR 21300 uses string buffer to receive csv writing and dump into
+        # file-like output with compression as option.
+        if hasattr(self.path_or_buf, 'name'):
+            path_or_buf = self.path_or_buf.name
+            f = StringIO()
+        elif isinstance(self.path_or_buf, compat.string_types):
+            path_or_buf = self.path_or_buf
+            f = StringIO()
         else:
-            f, handles = _get_handle(self.path_or_buf, self.mode,
-                                     encoding=encoding,
-                                     compression=None)
-            close = True if self.compression is None else False
+            f = self.path_or_buf
+
+        close = f != self.path_or_buf
 
         try:
             writer_kwargs = dict(lineterminator=self.line_terminator,
@@ -151,21 +155,16 @@ def save(self):
             self._save()
 
         finally:
-            # GH 17778 handles zip compression for byte strings separately to
-            # support Python 2, also allow compression file handle
-            if not close and self.compression:
-                f.close()
-                with open(f.name, 'rb') as f:
-                    data = f.read()
-                if not compat.PY2:
-                    data = data.decode(encoding)
-                f, handles = _get_handle(f.name, self.mode,
+            # GH 17778 handles zip compression for byte strings separately.
+            buf = f.getvalue()
+            if close:
+                f, handles = _get_handle(path_or_buf, self.mode,
                                          encoding=encoding,
                                          compression=self.compression)
-                f.write(data)
-                close = True
-            if close:
+                f.write(buf)
                 f.close()
+                for _fh in handles:
+                    _fh.close()
 
     def _save_header(self):
 

From 486f3ff13993d44aa7a0fa4842a3c162c1add849 Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Sun, 3 Jun 2018 23:12:24 +0100
Subject: [PATCH 12/19] update whatsnew

---
 doc/source/whatsnew/v0.23.1.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index 223073271033a..5f3e810be834c 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -92,7 +92,7 @@ I/O
 
 - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
 - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
-- Bug in :meth:`DataFrame.to_csv` using compression causes encoding error (:issue:`21241`)
+- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` using compression causes encoding error (:issue:`21241`, :issue:`21118`)
 -
 
 Plotting

From d8435efb0e63b665f09971d2e9eaf4311690ae68 Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Sun, 3 Jun 2018 23:12:36 +0100
Subject: [PATCH 13/19] minor refactor of tests

---
 pandas/tests/frame/test_to_csv.py | 20 ++++++++++----------
 pandas/tests/series/test_io.py    |  8 ++++----
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
index 44fa5468f03b2..af44a1f73ad72 100644
--- a/pandas/tests/frame/test_to_csv.py
+++ b/pandas/tests/frame/test_to_csv.py
@@ -919,7 +919,7 @@ def test_to_csv_path_is_none(self):
         recons = pd.read_csv(StringIO(csv_str), index_col=0)
         assert_frame_equal(self.frame, recons)
 
-    @pytest.mark.parametrize('frame, encoding', [
+    @pytest.mark.parametrize('df,encoding', [
         (DataFrame([[0.123456, 0.234567, 0.567567],
                     [12.32112, 123123.2, 321321.2]],
                    index=['A', 'B'], columns=['X', 'Y', 'Z']), None),
@@ -929,27 +929,27 @@ def test_to_csv_path_is_none(self):
         (DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]],
                    columns=['X', 'Y', 'Z']), 'cp737')
     ])
-    def test_to_csv_compression(self, frame, encoding, compression):
+    def test_to_csv_compression(self, df, encoding, compression):
 
         with ensure_clean() as filename:
 
-            frame.to_csv(filename, compression=compression, encoding=encoding)
+            df.to_csv(filename, compression=compression, encoding=encoding)
 
             # test the round trip - to_csv -> read_csv
-            rs = read_csv(filename, compression=compression,
-                          index_col=0, encoding=encoding)
-            assert_frame_equal(frame, rs)
+            result = read_csv(filename, compression=compression,
+                              index_col=0, encoding=encoding)
+            assert_frame_equal(df, result)
 
             # explicitly make sure file is compressed
             with tm.decompress_file(filename, compression) as fh:
                 text = fh.read().decode(encoding or 'utf8')
-                for col in frame.columns:
+                for col in df.columns:
                     assert col in text
 
             with tm.decompress_file(filename, compression) as fh:
-                assert_frame_equal(frame, read_csv(fh,
-                                                   index_col=0,
-                                                   encoding=encoding))
+                assert_frame_equal(df, read_csv(fh,
+                                                index_col=0,
+                                                encoding=encoding))
 
     def test_to_csv_date_format(self):
         with ensure_clean('__tmp_to_csv_date_format__') as path:
diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
index 7ecf26786742a..56cd23cd0c914 100644
--- a/pandas/tests/series/test_io.py
+++ b/pandas/tests/series/test_io.py
@@ -137,7 +137,7 @@ def test_to_csv_path_is_none(self):
         csv_str = s.to_csv(path=None)
         assert isinstance(csv_str, str)
 
-    @pytest.mark.parametrize('s, encoding', [
+    @pytest.mark.parametrize('s,encoding', [
         (Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
                 name='X'), None),
         (Series(['abc', 'def', 'ghi'], name='X'), 'ascii'),
@@ -152,9 +152,9 @@ def test_to_csv_compression(self, s, encoding, compression):
                      header=True)
 
             # test the round trip - to_csv -> read_csv
-            rs = pd.read_csv(filename, compression=compression,
-                             encoding=encoding, index_col=0, squeeze=True)
-            assert_series_equal(s, rs)
+            result = pd.read_csv(filename, compression=compression,
+                                 encoding=encoding, index_col=0, squeeze=True)
+            assert_series_equal(s, result)
 
             # explicitly ensure file was compressed
             with tm.decompress_file(filename, compression) as fh:

From 16cc951a0a93a357e1300c117d7ec5a552f170e9 Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Sun, 3 Jun 2018 23:29:01 +0100
Subject: [PATCH 14/19] clearer categorisation of input type

---
 pandas/io/formats/csvs.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 3a13f5e90e35d..d57010b84c876 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -9,6 +9,7 @@
 import numpy as np
 
 from pandas.core.dtypes.missing import notna
+from pandas.core.dtypes.inference import is_file_like
 from pandas.core.index import Index, MultiIndex
 from pandas import compat
 from pandas.compat import (StringIO, range, zip)
@@ -129,16 +130,17 @@ def save(self):
 
         # PR 21300 uses string buffer to receive csv writing and dump into
         # file-like output with compression as option.
-        if hasattr(self.path_or_buf, 'name'):
-            path_or_buf = self.path_or_buf.name
-            f = StringIO()
-        elif isinstance(self.path_or_buf, compat.string_types):
+        if not is_file_like(self.path_or_buf):
             path_or_buf = self.path_or_buf
             f = StringIO()
+            close = True
+        elif hasattr(self.path_or_buf, 'name'):
+            path_or_buf = self.path_or_buf.name
+            f = StringIO()
+            close = True
         else:
             f = self.path_or_buf
-
-        close = f != self.path_or_buf
+            close = False
 
         try:
             writer_kwargs = dict(lineterminator=self.line_terminator,

From 6714e684821dcd78d52270ec8513484aff6ee712 Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Mon, 4 Jun 2018 00:08:09 +0100
Subject: [PATCH 15/19] more compact if statements

---
 pandas/io/formats/csvs.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index d57010b84c876..ca089b0597b5f 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -129,15 +129,13 @@ def save(self):
             encoding = self.encoding
 
         # PR 21300 uses string buffer to receive csv writing and dump into
-        # file-like output with compression as option.
+        # file-like output with compression as option. GH 21241, 21118
+        f = StringIO()
+        close = True
         if not is_file_like(self.path_or_buf):
             path_or_buf = self.path_or_buf
-            f = StringIO()
-            close = True
         elif hasattr(self.path_or_buf, 'name'):
             path_or_buf = self.path_or_buf.name
-            f = StringIO()
-            close = True
         else:
             f = self.path_or_buf
             close = False

From f73f9ff0ccfe35876912e632e9d74c271c1d17ae Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Mon, 4 Jun 2018 00:08:20 +0100
Subject: [PATCH 16/19] documentation

---
 doc/source/whatsnew/v0.23.1.txt   | 2 +-
 pandas/tests/frame/test_to_csv.py | 1 +
 pandas/tests/series/test_io.py    | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index 5f3e810be834c..db4f4acc7ee16 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -92,7 +92,7 @@ I/O
 
 - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
 - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
-- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` using compression causes encoding error (:issue:`21241`, :issue:`21118`)
+- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`)
 -
 
 Plotting
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
index af44a1f73ad72..1419da324663f 100644
--- a/pandas/tests/frame/test_to_csv.py
+++ b/pandas/tests/frame/test_to_csv.py
@@ -923,6 +923,7 @@ def test_to_csv_path_is_none(self):
         (DataFrame([[0.123456, 0.234567, 0.567567],
                     [12.32112, 123123.2, 321321.2]],
                    index=['A', 'B'], columns=['X', 'Y', 'Z']), None),
+        # GH 21241, 21118
         (DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'),
         (DataFrame(5 * [[123, u"你好", u"世界"]],
                    columns=['X', 'Y', 'Z']), 'gb2312'),
diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
index 56cd23cd0c914..4e7fed740c54c 100644
--- a/pandas/tests/series/test_io.py
+++ b/pandas/tests/series/test_io.py
@@ -140,6 +140,7 @@ def test_to_csv_path_is_none(self):
     @pytest.mark.parametrize('s,encoding', [
         (Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
                 name='X'), None),
+        # GH 21241, 21118
         (Series(['abc', 'def', 'ghi'], name='X'), 'ascii'),
         (Series(["123", u"你好", u"世界"], name=u"中文"), 'gb2312'),
         (Series(["123", u"Γειά σου", u"Κόσμε"], name=u"Ελληνικά"), 'cp737')

From f89153354801b6d1c25c4fd6051069db3de248b3 Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Mon, 4 Jun 2018 18:36:12 +0100
Subject: [PATCH 17/19] add comments on path_or_buf types

---
 pandas/io/formats/csvs.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index ca089b0597b5f..3e03033ebb709 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -133,10 +133,13 @@ def save(self):
         f = StringIO()
         close = True
         if not is_file_like(self.path_or_buf):
+            # path_or_buf is path
             path_or_buf = self.path_or_buf
         elif hasattr(self.path_or_buf, 'name'):
+            # path_or_buf is file handle
             path_or_buf = self.path_or_buf.name
         else:
+            # path_or_buf is file-like IO objects.
             f = self.path_or_buf
             close = False
 

From f3c3ea76af02b6450cb269774e71fe1af3c68c4e Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Mon, 4 Jun 2018 19:05:09 +0100
Subject: [PATCH 18/19] remove close

---
 pandas/io/formats/csvs.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 3e03033ebb709..7f660e2644fa4 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -131,7 +131,6 @@ def save(self):
         # PR 21300 uses string buffer to receive csv writing and dump into
         # file-like output with compression as option. GH 21241, 21118
         f = StringIO()
-        close = True
         if not is_file_like(self.path_or_buf):
             # path_or_buf is path
             path_or_buf = self.path_or_buf
@@ -141,7 +140,7 @@ def save(self):
         else:
             # path_or_buf is file-like IO objects.
             f = self.path_or_buf
-            close = False
+            path_or_buf = None
 
         try:
             writer_kwargs = dict(lineterminator=self.line_terminator,
@@ -160,7 +159,7 @@ def save(self):
         finally:
             # GH 17778 handles zip compression for byte strings separately.
             buf = f.getvalue()
-            if close:
+            if path_or_buf:
                 f, handles = _get_handle(path_or_buf, self.mode,
                                          encoding=encoding,
                                          compression=self.compression)

From 4e914ff8e05006c2ad6df6684c779d07f3ce67de Mon Sep 17 00:00:00 2001
From: Ming Li
Date: Mon, 4 Jun 2018 23:45:55 +0100
Subject: [PATCH 19/19] test file-handle to_csv with compression and encoding

---
 pandas/tests/frame/test_to_csv.py | 7 +++++++
 pandas/tests/series/test_io.py    | 9 +++++++++
 2 files changed, 16 insertions(+)

diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
index 1419da324663f..60dc336a85388 100644
--- a/pandas/tests/frame/test_to_csv.py
+++ b/pandas/tests/frame/test_to_csv.py
@@ -939,7 +939,14 @@ def test_to_csv_compression(self, df, encoding, compression):
             # test the round trip - to_csv -> read_csv
             result = read_csv(filename, compression=compression,
                               index_col=0, encoding=encoding)
+
+            with open(filename, 'w') as fh:
+                df.to_csv(fh, compression=compression, encoding=encoding)
+
+            result_fh = read_csv(filename, compression=compression,
+                                 index_col=0, encoding=encoding)
             assert_frame_equal(df, result)
+            assert_frame_equal(df, result_fh)
 
             # explicitly make sure file is compressed
             with tm.decompress_file(filename, compression) as fh:
diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
index 4e7fed740c54c..f98962685ad9a 100644
--- a/pandas/tests/series/test_io.py
+++ b/pandas/tests/series/test_io.py
@@ -155,7 +155,16 @@ def test_to_csv_compression(self, s, encoding, compression):
             # test the round trip - to_csv -> read_csv
             result = pd.read_csv(filename, compression=compression,
                                  encoding=encoding, index_col=0, squeeze=True)
+
+            with open(filename, 'w') as fh:
+                s.to_csv(fh, compression=compression, encoding=encoding,
+                         header=True)
+
+            result_fh = pd.read_csv(filename, compression=compression,
+                                    encoding=encoding, index_col=0,
+                                    squeeze=True)
             assert_series_equal(s, result)
+            assert_series_equal(s, result_fh)
 
             # explicitly ensure file was compressed
             with tm.decompress_file(filename, compression) as fh:
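
A minimal usage sketch of the round trip these patches exercise, not part of
the patch series itself; the file name and the 'gzip' compression choice are
illustrative assumptions, while the encoding and data mirror the test fixtures
above:

    import pandas as pd
    from pandas.util.testing import assert_frame_equal

    df = pd.DataFrame(5 * [[123, u"你好", u"世界"]], columns=['X', 'Y', 'Z'])

    # write a compressed csv with a non-utf-8 encoding; before the fix for
    # GH 21241 / GH 21118 combining compression= and encoding= raised an
    # encoding error
    df.to_csv('out.csv.gz', compression='gzip', encoding='gb2312')

    # read it back with the same options and compare against the original
    result = pd.read_csv('out.csv.gz', compression='gzip',
                         index_col=0, encoding='gb2312')
    assert_frame_equal(df, result)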