From 8e5311243bd66b891777c3abe3f53112d3677a69 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 10 Jun 2016 06:14:32 +0100 Subject: [PATCH] BUG: Fix quoting behaviour in to_csv for csv.QUOTE_NONNUMERIC Float values were being quoted despite the quoting spec. Bug traced to the float formatting that was unconditionally casting all floats to string. Unconditional casting traced back to commit 2d51b33 (gh-12194) via bisection. This commit undoes some of those changes to rectify the behaviour. Closes gh-12922. [ci skip] --- doc/source/whatsnew/v0.18.2.txt | 2 + pandas/core/internals.py | 14 ++++ pandas/formats/format.py | 5 ++ pandas/tests/frame/test_to_csv.py | 109 ++++++++++++++++++++++-------- 4 files changed, 101 insertions(+), 29 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index f5dbfd80de7cc..b3ce9911d3f4d 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -388,6 +388,8 @@ Bug Fixes - Bug in various index types, which did not propagate the name of passed index (:issue:`12309`) - Bug in ``DatetimeIndex``, which did not honour the ``copy=True`` (:issue:`13205`) + +- Bug in ``DataFrame.to_csv()`` in which float values were being quoted even though quotations were specified for non-numeric values only (:issue:`12922`, :issue:`13259`) - Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 97df81ad6be48..c931adc9a31df 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1529,6 +1529,20 @@ def to_native_types(self, slicer=None, na_rep='', float_format=None, if slicer is not None: values = values[:, slicer] + # see gh-13418: no special formatting is desired at the + # output (important for appropriate 'quoting' behaviour), + # so do not pass it through the FloatArrayFormatter + if float_format is None and decimal == '.': + mask = isnull(values) + + if not quoting: + values = values.astype(str) + else: + values = np.array(values, dtype='object') + + values[mask] = na_rep + return values + from pandas.formats.format import FloatArrayFormatter formatter = FloatArrayFormatter(values, na_rep=na_rep, float_format=float_format, diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 923ac25f0ebed..a8e184ce94c89 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -1,4 +1,9 @@ # -*- coding: utf-8 -*- +""" +Internal module for formatting output data in csv, html, +and latex files. This module also applies to display formatting. +""" + from __future__ import print_function from distutils.version import LooseVersion # pylint: disable=W0141 diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index bacf604c491b1..c23702ef46ad2 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -824,35 +824,6 @@ def test_to_csv_float_format(self): index=['A', 'B'], columns=['X', 'Y', 'Z']) assert_frame_equal(rs, xp) - def test_to_csv_quoting(self): - df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']}) - - buf = StringIO() - df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC) - - result = buf.getvalue() - expected = ('"A","B"\n' - '1,"foo"\n' - '2,"bar"\n' - '3,"baz"\n') - - self.assertEqual(result, expected) - - # quoting windows line terminators, presents with encoding? - # #3503 - text = 'a,b,c\n1,"test \r\n",3\n' - df = pd.read_csv(StringIO(text)) - buf = StringIO() - df.to_csv(buf, encoding='utf-8', index=False) - self.assertEqual(buf.getvalue(), text) - - # testing if quoting parameter is passed through with multi-indexes - # related to issue #7791 - df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) - df = df.set_index(['a', 'b']) - expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n' - self.assertEqual(df.to_csv(quoting=csv.QUOTE_ALL), expected) - def test_to_csv_unicodewriter_quoting(self): df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']}) @@ -1131,3 +1102,83 @@ def test_to_csv_with_dst_transitions(self): df.to_pickle(path) result = pd.read_pickle(path) assert_frame_equal(result, df) + + def test_to_csv_quoting(self): + df = DataFrame({ + 'c_string': ['a', 'b,c'], + 'c_int': [42, np.nan], + 'c_float': [1.0, 3.2], + 'c_bool': [True, False], + }) + + expected = """\ +,c_bool,c_float,c_int,c_string +0,True,1.0,42.0,a +1,False,3.2,,"b,c" +""" + result = df.to_csv() + self.assertEqual(result, expected) + + result = df.to_csv(quoting=None) + self.assertEqual(result, expected) + + result = df.to_csv(quoting=csv.QUOTE_MINIMAL) + self.assertEqual(result, expected) + + expected = """\ +"","c_bool","c_float","c_int","c_string" +"0","True","1.0","42.0","a" +"1","False","3.2","","b,c" +""" + result = df.to_csv(quoting=csv.QUOTE_ALL) + self.assertEqual(result, expected) + + # see gh-12922, gh-13259: make sure changes to + # the formatters do not break this behaviour + expected = """\ +"","c_bool","c_float","c_int","c_string" +0,True,1.0,42.0,"a" +1,False,3.2,"","b,c" +""" + result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC) + self.assertEqual(result, expected) + + msg = "need to escape, but no escapechar set" + tm.assertRaisesRegexp(csv.Error, msg, df.to_csv, + quoting=csv.QUOTE_NONE) + tm.assertRaisesRegexp(csv.Error, msg, df.to_csv, + quoting=csv.QUOTE_NONE, + escapechar=None) + + expected = """\ +,c_bool,c_float,c_int,c_string +0,True,1.0,42.0,a +1,False,3.2,,b!,c +""" + result = df.to_csv(quoting=csv.QUOTE_NONE, + escapechar='!') + self.assertEqual(result, expected) + + expected = """\ +,c_bool,c_ffloat,c_int,c_string +0,True,1.0,42.0,a +1,False,3.2,,bf,c +""" + result = df.to_csv(quoting=csv.QUOTE_NONE, + escapechar='f') + self.assertEqual(result, expected) + + # see gh-3503: quoting Windows line terminators + # presents with encoding? + text = 'a,b,c\n1,"test \r\n",3\n' + df = pd.read_csv(StringIO(text)) + buf = StringIO() + df.to_csv(buf, encoding='utf-8', index=False) + self.assertEqual(buf.getvalue(), text) + + # xref gh-7791: make sure the quoting parameter is passed through + # with multi-indexes + df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) + df = df.set_index(['a', 'b']) + expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n' + self.assertEqual(df.to_csv(quoting=csv.QUOTE_ALL), expected)