diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index fb63dc16249b2..00ba6cc242d8b 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1099,6 +1099,7 @@ I/O - Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values are NoneType (:issue:`20030`) - Bug in ``usecols`` parameter in :func:`pandas.io.read_csv` and :func:`pandas.io.read_table` where error is not raised correctly when passing a string. (:issue:`20529`) - Bug in :func:`HDFStore.keys` when reading a file with a softlink causes exception (:issue:`20523`) +- Bug in :func:`read_excel` and :func:`read_csv` where missing values turned to ``'nan'`` with ``dtype=str`` and ``na_filter=True``. Now, these missing values are converted to the string missing indicator, ``np.nan``. (:issue:`20377`) Plotting ^^^^^^^^ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 30521760327b4..540fc55a2f818 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -461,11 +461,17 @@ cpdef ndarray[object] astype_unicode(ndarray arr): cdef: Py_ssize_t i, n = arr.size ndarray[object] result = np.empty(n, dtype=object) + object arr_i for i in range(n): # we can use the unsafe version because we know `result` is mutable # since it was created from `np.empty` - util.set_value_at_unsafe(result, i, unicode(arr[i])) + arr_i = arr[i] + util.set_value_at_unsafe( + result, + i, + unicode(arr_i) if not checknull(arr_i) else np.nan + ) return result @@ -474,11 +480,17 @@ cpdef ndarray[object] astype_str(ndarray arr): cdef: Py_ssize_t i, n = arr.size ndarray[object] result = np.empty(n, dtype=object) + object arr_i for i in range(n): # we can use the unsafe version because we know `result` is mutable # since it was created from `np.empty` - util.set_value_at_unsafe(result, i, str(arr[i])) + arr_i = arr[i] + util.set_value_at_unsafe( + result, + i, + str(arr_i) if not checknull(arr_i) else np.nan + ) return 
result diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a24e2cdd99f6f..da00a09f5bd16 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1218,6 +1218,7 @@ cdef class TextReader: # treat as a regular string parsing return self._string_convert(i, start, end, na_filter, na_hashset) + elif dtype.kind == 'U': width = dtype.itemsize if width > 0: @@ -1227,6 +1228,7 @@ cdef class TextReader: # unicode variable width return self._string_convert(i, start, end, na_filter, na_hashset) + elif is_categorical_dtype(dtype): # TODO: I suspect that _categorical_convert could be # optimized when dtype is an instance of CategoricalDtype diff --git a/pandas/core/series.py b/pandas/core/series.py index 1d6f770d92795..9131aa446396a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4153,4 +4153,8 @@ def _try_cast(arr, take_fast_path): data = np.array(data, dtype=dtype, copy=False) subarr = np.array(data, dtype=object, copy=copy) + # GH 20377 + # Turn all 'nan' to np.nan + subarr[subarr == 'nan'] = np.nan + return subarr diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 152159965036d..fb0dd4a0e343a 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -529,7 +529,7 @@ def test_astype_str(self): # consistency in astype(str) for tt in set([str, compat.text_type]): result = DataFrame([np.NaN]).astype(tt) - expected = DataFrame(['nan']) + expected = DataFrame([np.NaN], dtype=object) assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(tt) diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index d2c3f82e95c4d..6cbc8cd752d50 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -369,3 +369,27 @@ def test_no_na_filter_on_index(self): expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index([np.nan, 5.0], name="b")) tm.assert_frame_equal(out, 
expected) + + def test_na_values_with_dtype_str_and_na_filter_true(self): + # see gh-20377 + data = "a,b,c\n1,,3\n4,5,6" + + out = self.read_csv(StringIO(data), na_filter=True, dtype=str) + + # missing data turn to np.nan, which stays as it is after dtype=str + expected = DataFrame({"a": ["1", "4"], + "b": [np.nan, "5"], + "c": ["3", "6"]}) + tm.assert_frame_equal(out, expected) + + def test_na_values_with_dtype_str_and_na_filter_false(self): + # see gh-20377 + data = "a,b,c\n1,,3\n4,5,6" + + out = self.read_csv(StringIO(data), na_filter=False, dtype=str) + + # missing data turn to empty string + expected = DataFrame({"a": ["1", "4"], + "b": ["", "5"], + "c": ["3", "6"]}) + tm.assert_frame_equal(out, expected) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 5ef6dc07a5c22..190edf1950cc7 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -360,6 +360,33 @@ def test_reader_dtype(self, ext): with pytest.raises(ValueError): actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'}) + def test_reader_dtype_str(self, ext): + # GH 20377 + basename = 'testdtype' + actual = self.get_exceldf(basename, ext) + + expected = DataFrame({ + 'a': [1, 2, 3, 4], + 'b': [2.5, 3.5, 4.5, 5.5], + 'c': [1, 2, 3, 4], + 'd': [1.0, 2.0, np.nan, 4.0]}).reindex( + columns=['a', 'b', 'c', 'd']) + + tm.assert_frame_equal(actual, expected) + + actual = self.get_exceldf(basename, ext, + dtype={'a': 'float64', + 'b': 'float32', + 'c': str, + 'd': str}) + + expected['a'] = expected['a'].astype('float64') + expected['b'] = expected['b'].astype('float32') + expected['c'] = ['001', '002', '003', '004'] + expected['d'] = ['1', '2', np.nan, '4'] + + tm.assert_frame_equal(actual, expected) + def test_reading_all_sheets(self, ext): # Test reading all sheetnames by setting sheetname to None, # Ensure a dict is returned. 
diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index dd1b623f0f7ff..19894673e07c8 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -149,6 +149,7 @@ def test_astype_str_map(self, dtype, series): # see gh-4405 result = series.astype(dtype) expected = series.map(compat.text_type) + expected = expected.replace('nan', np.nan) # see gh-20377 tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", [str, compat.text_type])