From dd53df84571a9c29f9b2f8e33c3fc549eb81cc2e Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Tue, 20 Mar 2018 02:31:27 +0000 Subject: [PATCH 01/33] TST: Test for astype_nansafe. Modified test for astype --- pandas/tests/dtypes/test_cast.py | 13 ++++++++++++- pandas/tests/series/test_dtypes.py | 3 +++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index 96a9e3227b40b..226e2a77878b2 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -23,7 +23,8 @@ maybe_convert_scalar, find_common_type, construct_1d_object_array_from_listlike, - construct_1d_arraylike_from_scalar) + construct_1d_arraylike_from_scalar, + astype_nansafe) from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, @@ -439,3 +440,13 @@ def test_cast_1d_arraylike_from_scalar_categorical(self): tm.assert_categorical_equal(result, expected, check_category_order=True, check_dtype=True) + + +class TestAstypeNansafe(object): + + def test_astype_nansafe_nan_to_str(self): + arr = np.array([np.nan, 'a', 'b'], dtype=object) + arr2 = astype_nansafe(arr, dtype=str) + assert arr[0] is np.nan + assert arr2[0] is np.nan + assert np.array_equal(arr[1:], arr2[1:]) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 56ff092dd0a27..123a4a3877924 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -147,6 +147,9 @@ def test_astype_str_map(self, dtype, series): # see gh-4405 result = series.astype(dtype) expected = series.map(compat.text_type) + if result[3] is np.nan: + assert result.pop(3) is np.nan # gh-20377 np.nan stays as it is even if we cast to str/basestring + assert expected.pop(3) == 'nan' tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", [str, compat.text_type]) From 6f771fbe8cb85ad7dcc449bd43374719f4bc8b1f Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Tue, 
20 Mar 2018 02:32:53 +0000 Subject: [PATCH 02/33] BUG: np.nan should stay as it is when we cast to str/basestring --- pandas/_libs/lib.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 30521760327b4..3ff2177bf44db 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -465,7 +465,8 @@ cpdef ndarray[object] astype_unicode(ndarray arr): for i in range(n): # we can use the unsafe version because we know `result` is mutable # since it was created from `np.empty` - util.set_value_at_unsafe(result, i, unicode(arr[i])) + arr_i = arr[i] + util.set_value_at_unsafe(result, i, unicode(arr[i]) if arr_i is not np.nan else arr_i) return result @@ -478,7 +479,8 @@ cpdef ndarray[object] astype_str(ndarray arr): for i in range(n): # we can use the unsafe version because we know `result` is mutable # since it was created from `np.empty` - util.set_value_at_unsafe(result, i, str(arr[i])) + arr_i = arr[i] + util.set_value_at_unsafe(result, i, str(arr[i]) if arr_i is not np.nan else arr_i) return result From 37f00ad1e433be94a14017ab9dba25ee14d9c2d5 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Tue, 20 Mar 2018 20:43:57 +0000 Subject: [PATCH 03/33] BUG: revert change in lib.pyx. 
modify excel functionality directly --- pandas/_libs/lib.pyx | 6 ++---- pandas/io/excel.py | 4 ++++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3ff2177bf44db..30521760327b4 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -465,8 +465,7 @@ cpdef ndarray[object] astype_unicode(ndarray arr): for i in range(n): # we can use the unsafe version because we know `result` is mutable # since it was created from `np.empty` - arr_i = arr[i] - util.set_value_at_unsafe(result, i, unicode(arr[i]) if arr_i is not np.nan else arr_i) + util.set_value_at_unsafe(result, i, unicode(arr[i])) return result @@ -479,8 +478,7 @@ cpdef ndarray[object] astype_str(ndarray arr): for i in range(n): # we can use the unsafe version because we know `result` is mutable # since it was created from `np.empty` - arr_i = arr[i] - util.set_value_at_unsafe(result, i, str(arr[i]) if arr_i is not np.nan else arr_i) + util.set_value_at_unsafe(result, i, str(arr[i])) return result diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 0f9df845117db..4ac0de7f8d357 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -679,6 +679,10 @@ def _parse_cell(cell_contents, cell_typ): **kwds) output[asheetname] = parser.read(nrows=nrows) + dtypes = output[asheetname].dtypes + output[asheetname].replace('nan', np.nan, inplace=True) + output[asheetname] = output[asheetname].astype(dtypes, copy=False) + if names is not None: output[asheetname].columns = names if not squeeze or isinstance(output[asheetname], DataFrame): From f194b70ea8ea1a89737a66319e428d64d678824f Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Tue, 20 Mar 2018 20:45:32 +0000 Subject: [PATCH 04/33] TST: revert changes in dtypes/test_cast. 
test excel functionality --- pandas/tests/dtypes/test_cast.py | 10 ---------- pandas/tests/io/test_excel.py | 17 ++++++++++++++++- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index 226e2a77878b2..28ed355d529fe 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -440,13 +440,3 @@ def test_cast_1d_arraylike_from_scalar_categorical(self): tm.assert_categorical_equal(result, expected, check_category_order=True, check_dtype=True) - - -class TestAstypeNansafe(object): - - def test_astype_nansafe_nan_to_str(self): - arr = np.array([np.nan, 'a', 'b'], dtype=object) - arr2 = astype_nansafe(arr, dtype=str) - assert arr[0] is np.nan - assert arr2[0] is np.nan - assert np.array_equal(arr[1:], arr2[1:]) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 6b39717213c0d..bf5feff06a270 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -207,12 +207,27 @@ def test_excel_passes_na(self, ext): columns=['Test']) tm.assert_frame_equal(parsed, expected) + # gh-20377 dtype=str (all 'nan' turn to np.nan) + + parsed = read_excel(excel, 'Sheet1', dtype=str, keep_default_na=False, + na_values=['apple']) + expected = DataFrame([['NA'], ['1'], ['NA'], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + parsed = read_excel(excel, 'Sheet1', dtype=str, keep_default_na=True, + na_values=['apple']) + expected = DataFrame([[np.nan], ['1'], [np.nan], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + # 13967 excel = self.get_excelfile('test5', ext) parsed = read_excel(excel, 'Sheet1', keep_default_na=False, na_values=['apple']) - expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']], + # gh-20377 'nan' was given in the spreadsheet, but turned to np.nan as well + expected = DataFrame([['1.#QNAN'], [1], [np.nan], [np.nan], 
['rabbit']], columns=['Test']) tm.assert_frame_equal(parsed, expected) From eb8f4c54e564ff9b9efbe422f5ba0ee219858cc5 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Tue, 20 Mar 2018 20:45:58 +0000 Subject: [PATCH 05/33] DOC: added description --- doc/source/whatsnew/v0.23.0.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index acbfa2eb3ccac..867a39a6cc16a 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -981,6 +981,8 @@ I/O - :class:`Timedelta` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`, :issue:`9155`, :issue:`19900`) - Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. Now returns an empty dict (:issue:`19417`) - Bug in :func:`read_pickle` when unpickling objects with :class:`TimedeltaIndex` or :class:`Float64Index` created with pandas prior to version 0.20 (:issue:`19939`) +- Bug in :`read_excel` where it transforms np.nan to 'nan' if dtype=str is chosen. Now keeps np.nan as they are. 
(:issue:`20377`) + Plotting ^^^^^^^^ From ac6a4095383c36c8709495bd2a8b0a9a76814652 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Tue, 20 Mar 2018 20:51:19 +0000 Subject: [PATCH 06/33] TST: correction and pep8 --- pandas/tests/io/test_excel.py | 6 ++++-- pandas/tests/series/test_dtypes.py | 3 --- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index bf5feff06a270..e1c04c84a05c2 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -226,8 +226,10 @@ def test_excel_passes_na(self, ext): parsed = read_excel(excel, 'Sheet1', keep_default_na=False, na_values=['apple']) - # gh-20377 'nan' was given in the spreadsheet, but turned to np.nan as well - expected = DataFrame([['1.#QNAN'], [1], [np.nan], [np.nan], ['rabbit']], + # gh-20377 'nan' was given in the spreadsheet, but turned + # to np.nan as well + expected = DataFrame([['1.#QNAN'], [1], [np.nan], [np.nan], + ['rabbit']], columns=['Test']) tm.assert_frame_equal(parsed, expected) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 123a4a3877924..56ff092dd0a27 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -147,9 +147,6 @@ def test_astype_str_map(self, dtype, series): # see gh-4405 result = series.astype(dtype) expected = series.map(compat.text_type) - if result[3] is np.nan: - assert result.pop(3) is np.nan # gh-20377 np.nan stays as it is even if we cast to str/basestring - assert expected.pop(3) == 'nan' tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", [str, compat.text_type]) From 6994bb03c17a0846042e184e4f0c00851a77473a Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Tue, 20 Mar 2018 20:51:57 +0000 Subject: [PATCH 07/33] BUG: pep8 --- pandas/io/excel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 
4ac0de7f8d357..0265e631f5e82 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -681,7 +681,8 @@ def _parse_cell(cell_contents, cell_typ): output[asheetname] = parser.read(nrows=nrows) dtypes = output[asheetname].dtypes output[asheetname].replace('nan', np.nan, inplace=True) - output[asheetname] = output[asheetname].astype(dtypes, copy=False) + output[asheetname] = output[asheetname].astype(dtypes, + copy=False) if names is not None: output[asheetname].columns = names From 40a563fe6b6a8a318c939b926c44d17e634dac0e Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Tue, 20 Mar 2018 20:54:38 +0000 Subject: [PATCH 08/33] TST: remove unused import --- pandas/tests/dtypes/test_cast.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index 28ed355d529fe..96a9e3227b40b 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -23,8 +23,7 @@ maybe_convert_scalar, find_common_type, construct_1d_object_array_from_listlike, - construct_1d_arraylike_from_scalar, - astype_nansafe) + construct_1d_arraylike_from_scalar) from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, From 9858259fed30af9aae35e641b3c9ccbabd48cdf3 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Tue, 20 Mar 2018 21:01:46 +0000 Subject: [PATCH 09/33] DOC: resolved conflict --- doc/source/whatsnew/v0.23.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 867a39a6cc16a..4da5674a2cf2e 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -981,6 +981,7 @@ I/O - :class:`Timedelta` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`, :issue:`9155`, :issue:`19900`) - Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. 
Now returns an empty dict (:issue:`19417`) - Bug in :func:`read_pickle` when unpickling objects with :class:`TimedeltaIndex` or :class:`Float64Index` created with pandas prior to version 0.20 (:issue:`19939`) +- Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values are NoneType (:issue:`20030`) - Bug in :`read_excel` where it transforms np.nan to 'nan' if dtype=str is chosen. Now keeps np.nan as they are. (:issue:`20377`) From 5f71a9915e4d071c801509a796af19c9f677f692 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Tue, 20 Mar 2018 21:03:43 +0000 Subject: [PATCH 10/33] Update v0.23.0.txt --- doc/source/whatsnew/v0.23.0.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 4da5674a2cf2e..732549d9aa452 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -984,7 +984,6 @@ I/O - Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values are NoneType (:issue:`20030`) - Bug in :`read_excel` where it transforms np.nan to 'nan' if dtype=str is chosen. Now keeps np.nan as they are. (:issue:`20377`) - Plotting ^^^^^^^^ From 0a93b60a21d0f32b4f5814eeca784762be75acfa Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Tue, 20 Mar 2018 21:06:43 +0000 Subject: [PATCH 11/33] conflict again --- doc/source/whatsnew/v0.23.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 732549d9aa452..4da5674a2cf2e 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -984,6 +984,7 @@ I/O - Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values are NoneType (:issue:`20030`) - Bug in :`read_excel` where it transforms np.nan to 'nan' if dtype=str is chosen. Now keeps np.nan as they are. 
(:issue:`20377`) + Plotting ^^^^^^^^ From f296f9ad8f01db5a7b2337c67fc060aa94022f44 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Tue, 20 Mar 2018 22:18:43 +0000 Subject: [PATCH 12/33] arghh --- doc/source/whatsnew/v0.23.0.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 4da5674a2cf2e..732549d9aa452 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -984,7 +984,6 @@ I/O - Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values are NoneType (:issue:`20030`) - Bug in :`read_excel` where it transforms np.nan to 'nan' if dtype=str is chosen. Now keeps np.nan as they are. (:issue:`20377`) - Plotting ^^^^^^^^ From 7c0af1f93250821c05c9aa3de95c221688cd9fb6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 19 Mar 2018 11:01:02 +0100 Subject: [PATCH 13/33] DOC: add disallowing of Series construction of len-1 list with index to whatsnew (#20392) --- doc/source/whatsnew/v0.23.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index cfe28edd175b6..3c626a7abc11d 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -714,6 +714,7 @@ Other API Changes - ``pd.to_datetime('today')`` now returns a datetime, consistent with ``pd.Timestamp('today')``; previously ``pd.to_datetime('today')`` returned a ``.normalized()`` datetime (:issue:`19935`) - :func:`Series.str.replace` now takes an optional `regex` keyword which, when set to ``False``, uses literal string replacement rather than regex replacement (:issue:`16808`) - :func:`DatetimeIndex.strftime` and :func:`PeriodIndex.strftime` now return an ``Index`` instead of a numpy array to be consistent with similar accessors (:issue:`20127`) +- Constructing a Series from a list of length 1 no longer broadcasts this list when a longer index is 
specified (:issue:`19714`, :issue:`20391`). .. _whatsnew_0230.deprecations: From f0fd0a7e05ff687d4830a4bb27905983aae7afbf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 19 Mar 2018 03:07:34 -0700 Subject: [PATCH 14/33] Bug: Allow np.timedelta64 objects to index TimedeltaIndex (#20408) --- doc/source/whatsnew/v0.23.0.txt | 3 ++- pandas/core/indexes/timedeltas.py | 3 ++- pandas/tests/indexing/test_timedelta.py | 12 ++++++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 3c626a7abc11d..acbfa2eb3ccac 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -888,7 +888,8 @@ Timedelta - Bug in :func:`Timedelta.total_seconds()` causing precision errors i.e. ``Timedelta('30S').total_seconds()==30.000000000000004`` (:issue:`19458`) - Bug in :func: `Timedelta.__rmod__` where operating with a ``numpy.timedelta64`` returned a ``timedelta64`` object instead of a ``Timedelta`` (:issue:`19820`) - Multiplication of :class:`TimedeltaIndex` by ``TimedeltaIndex`` will now raise ``TypeError`` instead of raising ``ValueError`` in cases of length mis-match (:issue`19333`) -- +- Bug in indexing a :class:`TimedeltaIndex` with a ``np.timedelta64`` object which was raising a ``TypeError`` (:issue:`20393`) + Timezones ^^^^^^^^^ diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index b5a08fc0168e4..9757d775201cc 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -829,7 +829,8 @@ def _maybe_cast_slice_bound(self, label, side, kind): else: return (lbound + to_offset(parsed.resolution) - Timedelta(1, 'ns')) - elif is_integer(label) or is_float(label): + elif ((is_integer(label) or is_float(label)) and + not is_timedelta64_dtype(label)): self._invalid_indexer('slice', label) return label diff --git a/pandas/tests/indexing/test_timedelta.py b/pandas/tests/indexing/test_timedelta.py index 
3ad3b771b2ab2..48ea49119356d 100644 --- a/pandas/tests/indexing/test_timedelta.py +++ b/pandas/tests/indexing/test_timedelta.py @@ -68,3 +68,15 @@ def test_listlike_setitem(self, value): series.iloc[0] = value expected = pd.Series([pd.NaT, 1, 2], dtype='timedelta64[ns]') tm.assert_series_equal(series, expected) + + @pytest.mark.parametrize('start,stop, expected_slice', [ + [np.timedelta64(0, 'ns'), None, slice(0, 11)], + [np.timedelta64(1, 'D'), np.timedelta64(6, 'D'), slice(1, 7)], + [None, np.timedelta64(4, 'D'), slice(0, 5)]]) + def test_numpy_timedelta_scalar_indexing(self, start, stop, + expected_slice): + # GH 20393 + s = pd.Series(range(11), pd.timedelta_range('0 days', '10 days')) + result = s.loc[slice(start, stop)] + expected = s.iloc[expected_slice] + tm.assert_series_equal(result, expected) From 61e05199a59af6cfcd62c7bd0b24001bce934a64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Israel=20Saeta=20P=C3=A9rez?= Date: Mon, 19 Mar 2018 11:58:57 +0100 Subject: [PATCH 15/33] DOC: Only use ~ in class links to hide prefixes. (#20402) This is ported from https://github.com/python-sprints/python-sprints.github.io/pull/78/. --- doc/source/contributing_docstring.rst | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/source/contributing_docstring.rst b/doc/source/contributing_docstring.rst index cd56b76fa891b..c210bb7050fb8 100644 --- a/doc/source/contributing_docstring.rst +++ b/doc/source/contributing_docstring.rst @@ -107,10 +107,18 @@ backticks. It is considered inline code: - The name of a parameter - Python code, a module, function, built-in, type, literal... (e.g. ``os``, ``list``, ``numpy.abs``, ``datetime.date``, ``True``) -- A pandas class (in the form ``:class:`~pandas.Series```) +- A pandas class (in the form ``:class:`pandas.Series```) - A pandas method (in the form ``:meth:`pandas.Series.sum```) - A pandas function (in the form ``:func:`pandas.to_datetime```) +.. 
note:: + To display only the last component of the linked class, method or + function, prefix it with ``~``. For example, ``:class:`~pandas.Series``` + will link to ``pandas.Series`` but only display the last part, ``Series`` + as the link text. See `Sphinx cross-referencing syntax + `_ + for details. + **Good:** .. code-block:: python From 9fdac27f2ffd85b489c8854e5bacdb4cdf0b139b Mon Sep 17 00:00:00 2001 From: Julio Martinez Date: Mon, 19 Mar 2018 21:57:19 +0100 Subject: [PATCH 16/33] DOC: update the pandas.DataFrame.plot.hist docstring (#20155) --- pandas/plotting/_core.py | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index da7c58428fb54..fa2766bb63d55 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -3049,21 +3049,47 @@ def box(self, by=None, **kwds): def hist(self, by=None, bins=10, **kwds): """ - Histogram + Draw one histogram of the DataFrame's columns. + + A histogram is a representation of the distribution of data. + This function groups the values of all given Series in the DataFrame + into bins, and draws all bins in only one :ref:`matplotlib.axes.Axes`. + This is useful when the DataFrame's Series are in a similar scale. Parameters ---------- - by : string or sequence + by : str or sequence, optional Column in the DataFrame to group by. - bins: integer, default 10 - Number of histogram bins to be used - `**kwds` : optional + bins : int, default 10 + Number of histogram bins to be used. + **kwds Additional keyword arguments are documented in :meth:`pandas.DataFrame.plot`. Returns ------- - axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them + axes : matplotlib.AxesSubplot histogram. + + See Also + -------- + DataFrame.hist : Draw histograms per DataFrame's Series. + Series.hist : Draw a histogram with Series' data. 
+ + Examples + -------- + When we draw a dice 6000 times, we expect to get each value around 1000 + times. But when we draw two dices and sum the result, the distribution + is going to be quite different. A histogram illustrates those + distributions. + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame( + ... np.random.randint(1, 7, 6000), + ... columns = ['one']) + >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) + >>> ax = df.plot.hist(bins=12, alpha=0.5) """ return self(kind='hist', by=by, bins=bins, **kwds) From ddb904f2ca1c45ca03c04b0a875e95abf2d4f02b Mon Sep 17 00:00:00 2001 From: scriptomation Date: Mon, 19 Mar 2018 14:13:22 -0700 Subject: [PATCH 17/33] DOC" update the Pandas core window rolling count docstring" (#20264) --- pandas/core/window.py | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/pandas/core/window.py b/pandas/core/window.py index b6217ae344ca5..5cd4fffb5d7dd 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -892,8 +892,43 @@ def calc(x): class _Rolling_and_Expanding(_Rolling): - _shared_docs['count'] = """%(name)s count of number of non-NaN - observations inside provided window.""" + _shared_docs['count'] = dedent(r""" + The %(name)s count of any non-NaN observations inside the window. + + Returns + ------- + Series or DataFrame + Returned object type is determined by the caller of the %(name)s + calculation. 
+ + See Also + -------- + pandas.Series.%(name)s : Calling object with Series data + pandas.DataFrame.%(name)s : Calling object with DataFrames + pandas.DataFrame.count : Count of the full DataFrame + + Examples + -------- + >>> s = pd.Series([2, 3, np.nan, 10]) + >>> s.rolling(2).count() + 0 1.0 + 1 2.0 + 2 1.0 + 3 1.0 + dtype: float64 + >>> s.rolling(3).count() + 0 1.0 + 1 2.0 + 2 2.0 + 3 2.0 + dtype: float64 + >>> s.rolling(4).count() + 0 1.0 + 1 2.0 + 2 2.0 + 3 3.0 + dtype: float64 + """) def count(self): @@ -1451,7 +1486,6 @@ def aggregate(self, arg, *args, **kwargs): agg = aggregate @Substitution(name='rolling') - @Appender(_doc_template) @Appender(_shared_docs['count']) def count(self): @@ -1715,7 +1749,6 @@ def aggregate(self, arg, *args, **kwargs): agg = aggregate @Substitution(name='expanding') - @Appender(_doc_template) @Appender(_shared_docs['count']) def count(self, **kwargs): return super(Expanding, self).count(**kwargs) From 694849da2654d832d5717adabf3fe4e1d5489d43 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Sat, 24 Mar 2018 22:22:43 +0000 Subject: [PATCH 18/33] BUG: astype_unicode astype_str turn a np.nan to empty string (#20377) --- pandas/_libs/lib.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 30521760327b4..f3241b3596633 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -465,7 +465,8 @@ cpdef ndarray[object] astype_unicode(ndarray arr): for i in range(n): # we can use the unsafe version because we know `result` is mutable # since it was created from `np.empty` - util.set_value_at_unsafe(result, i, unicode(arr[i])) + arr_i = arr[i] + util.set_value_at_unsafe(result, i, unicode(arr_i) if arr_i is not np.nan else '') return result @@ -478,7 +479,8 @@ cpdef ndarray[object] astype_str(ndarray arr): for i in range(n): # we can use the unsafe version because we know `result` is mutable # since it was created from `np.empty` - 
util.set_value_at_unsafe(result, i, str(arr[i])) + arr_i = arr[i] + util.set_value_at_unsafe(result, i, str(arr_i) if arr_i is not np.nan else '') return result From 5ba95a1804157d60d4256853aa5c680d914cdd28 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Sat, 24 Mar 2018 22:24:50 +0000 Subject: [PATCH 19/33] TST: added unitest for read_excel and modified series/test_dtypes for np.nan to empty string (#20377) --- pandas/tests/io/test_excel.py | 29 +++++++++++++++++++++++++++++ pandas/tests/series/test_dtypes.py | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 6b39717213c0d..c77e1557652f9 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -361,6 +361,35 @@ def test_reader_dtype(self, ext): with pytest.raises(ValueError): actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'}) + def test_reader_dtype_str(self, ext): + # GH 20377 + basename = 'testdtype' + actual = self.get_exceldf(basename, ext) + + expected = DataFrame({ + 'a': [1, 2, 3, 4], + 'b': [2.5, 3.5, 4.5, 5.5], + 'c': [1, 2, 3, 4], + 'd': [1.0, 2.0, np.nan, 4.0]}).reindex( + columns=['a', 'b', 'c', 'd']) + + tm.assert_frame_equal(actual, expected) + + actual = self.get_exceldf(basename, ext, + dtype={'a': 'float64', + 'b': 'float32', + 'c': str, + 'd': str}) + + expected['a'] = expected['a'].astype('float64') + expected['b'] = expected['b'].astype('float32') + expected['c'] = ['001', '002', '003', '004'] + expected['d'] = ['1', '2', '', '4'] + tm.assert_frame_equal(actual, expected) + + with pytest.raises(ValueError): + actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'}) + def test_reading_all_sheets(self, ext): # Test reading all sheetnames by setting sheetname to None, # Ensure a dict is returned. 
diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 56ff092dd0a27..ca22a85c02baa 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -142,7 +142,7 @@ def test_astype_datetime64tz(self): tm.rands(1000)]), Series([string.digits * 10, tm.rands(63), - tm.rands(64), nan, 1.0])]) + tm.rands(64), '', 1.0])]) def test_astype_str_map(self, dtype, series): # see gh-4405 result = series.astype(dtype) From d3ceec3cae3ca7a1e67634401c58195a12e62542 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Sun, 25 Mar 2018 00:00:16 +0000 Subject: [PATCH 20/33] TST: added unitest for read_csv (#20377) --- pandas/tests/series/test_io.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 62d1372525cc8..2db7cf4ad301c 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -162,6 +162,16 @@ def test_to_csv_compression(self, compression_no_zip): index_col=0, squeeze=True)) + def test_from_csv_dtype_str(self): + # GH20377 + s = Series([1, 2, np.nan, 4], index=['A', 'B', 'C', 'D'], + name='X') + with ensure_clean() as filename: + s.to_csv(filename, header=True) + rs = pd.read_csv(filename, dtype=str) + expected = Series(['1.0', '2.0', '', '4.0'], name=s.name) + assert_series_equal(rs.X, expected) + class TestSeriesIO(TestData): From ea1d73a7332b62a0b8855a4ddcc7eb962e0d060f Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Sun, 25 Mar 2018 00:01:12 +0000 Subject: [PATCH 21/33] BUG: patched TextReader to turn np.nan to empty string if dtype=str (#20377) --- pandas/_libs/parsers.pyx | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 52ca3d1226f79..1338cad1c3718 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1217,8 +1217,14 @@ cdef class TextReader: return result, 0 # treat as a 
regular string parsing - return self._string_convert(i, start, end, na_filter, - na_hashset) + res, na_count = self._string_convert(i, start, end, na_filter, + na_hashset) + + for i in range(len(res)): + if res[i] is np.nan: + res[i] = '' + return res, na_count + elif dtype.kind == 'U': width = dtype.itemsize if width > 0: @@ -1226,8 +1232,13 @@ cdef class TextReader: "supported for parsing" % dtype) # unicode variable width - return self._string_convert(i, start, end, na_filter, - na_hashset) + res, na_count = self._string_convert(i, start, end, na_filter, + na_hashset) + for i in range(len(res)): + if res[i] is np.nan: + res[i] = '' + return res, na_count + elif is_categorical_dtype(dtype): # TODO: I suspect that _categorical_convert could be # optimized when dtype is an instance of CategoricalDtype From c1376a5410a5285a4d8132bdd7fefc1a784a3ad0 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Sun, 25 Mar 2018 00:04:54 +0000 Subject: [PATCH 22/33] DOC: updated IO section (#20377) --- doc/source/whatsnew/v0.23.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index acbfa2eb3ccac..098a5f56ab29e 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -981,6 +981,7 @@ I/O - :class:`Timedelta` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`, :issue:`9155`, :issue:`19900`) - Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. Now returns an empty dict (:issue:`19417`) - Bug in :func:`read_pickle` when unpickling objects with :class:`TimedeltaIndex` or :class:`Float64Index` created with pandas prior to version 0.20 (:issue:`19939`) +- Bug in :func:`read_excel` and :class:`TextReader` now turn np.nan to empty string when dtype=str. 
They used to turn np.nan to 'nan' (:issue `20377`) Plotting ^^^^^^^^ From edb26d744d234149e2672129887b7e11f1610627 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Mon, 2 Apr 2018 01:32:37 +0100 Subject: [PATCH 23/33] BUG: np.nan stays as np.nan (#20377) --- pandas/_libs/lib.pyx | 4 ++-- pandas/_libs/parsers.pyx | 17 ++++------------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f3241b3596633..51adf361cf022 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -466,7 +466,7 @@ cpdef ndarray[object] astype_unicode(ndarray arr): # we can use the unsafe version because we know `result` is mutable # since it was created from `np.empty` arr_i = arr[i] - util.set_value_at_unsafe(result, i, unicode(arr_i) if arr_i is not np.nan else '') + util.set_value_at_unsafe(result, i, unicode(arr_i) if arr_i is not np.nan else np.nan) return result @@ -480,7 +480,7 @@ cpdef ndarray[object] astype_str(ndarray arr): # we can use the unsafe version because we know `result` is mutable # since it was created from `np.empty` arr_i = arr[i] - util.set_value_at_unsafe(result, i, str(arr_i) if arr_i is not np.nan else '') + util.set_value_at_unsafe(result, i, str(arr_i) if arr_i is not np.nan else np.nan) return result diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index dac957aeff9f4..da00a09f5bd16 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1216,13 +1216,8 @@ cdef class TextReader: return result, 0 # treat as a regular string parsing - res, na_count = self._string_convert(i, start, end, na_filter, - na_hashset) - - for i in range(len(res)): - if res[i] is np.nan: - res[i] = '' - return res, na_count + return self._string_convert(i, start, end, na_filter, + na_hashset) elif dtype.kind == 'U': width = dtype.itemsize @@ -1231,12 +1226,8 @@ cdef class TextReader: "supported for parsing" % dtype) # unicode variable width - res, na_count = self._string_convert(i, 
start, end, na_filter, - na_hashset) - for i in range(len(res)): - if res[i] is np.nan: - res[i] = '' - return res, na_count + return self._string_convert(i, start, end, na_filter, + na_hashset) elif is_categorical_dtype(dtype): # TODO: I suspect that _categorical_convert could be From c3ab9cb3c1ad811242178021321532788e84e9b4 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Mon, 2 Apr 2018 01:35:20 +0100 Subject: [PATCH 24/33] TXT: Moved test from series.test_io to io.parser.na_values. Corrected tests for np.nan (#20377) --- pandas/tests/io/parser/na_values.py | 20 ++++++++++++++++++++ pandas/tests/io/test_excel.py | 20 +++----------------- pandas/tests/series/test_dtypes.py | 3 ++- pandas/tests/series/test_io.py | 10 ---------- 4 files changed, 25 insertions(+), 28 deletions(-) diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index d2c3f82e95c4d..627f6584fd81f 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -369,3 +369,23 @@ def test_no_na_filter_on_index(self): expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index([np.nan, 5.0], name="b")) tm.assert_frame_equal(out, expected) + + def test_na_values_with_dtype_str_and_na_filter_true(self): + # see gh-20377 + data = "a,b,c\n1,,3\n4,5,6" + + out = self.read_csv(StringIO(data), na_filter=True, dtype=str) + + # missing data turn to np.nan, which stays as it is after dtype=str + expected = DataFrame({"a": ["1", "4"], "b": [np.nan, "5"], "c": ["3", "6"]}) + tm.assert_frame_equal(out, expected) + + def test_na_values_with_dtype_str_and_na_filter_false(self): + # see gh-20377 + data = "a,b,c\n1,,3\n4,5,6" + + out = self.read_csv(StringIO(data), na_filter=False, dtype=str) + + # missing data turn to empty string + expected = DataFrame({"a": ["1", "4"], "b": ["", "5"], "c": ["3", "6"]}) + tm.assert_frame_equal(out, expected) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 
4ebbd94f5a552..23b3719f469e5 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -366,29 +366,15 @@ def test_reader_dtype_str(self, ext): actual = self.get_exceldf(basename, ext) expected = DataFrame({ - 'a': [1, 2, 3, 4], - 'b': [2.5, 3.5, 4.5, 5.5], - 'c': [1, 2, 3, 4], - 'd': [1.0, 2.0, np.nan, 4.0]}).reindex( - columns=['a', 'b', 'c', 'd']) + 'a': [1.0, 2.0, np.nan, 4.0]}) tm.assert_frame_equal(actual, expected) - actual = self.get_exceldf(basename, ext, - dtype={'a': 'float64', - 'b': 'float32', - 'c': str, - 'd': str}) + actual = self.get_exceldf(basename, ext, dtype={'a': str}) - expected['a'] = expected['a'].astype('float64') - expected['b'] = expected['b'].astype('float32') - expected['c'] = ['001', '002', '003', '004'] - expected['d'] = ['1', '2', '', '4'] + expected['a'] = ['1', '2', np.nan, '4'] tm.assert_frame_equal(actual, expected) - with pytest.raises(ValueError): - actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'}) - def test_reading_all_sheets(self, ext): # Test reading all sheetnames by setting sheetname to None, # Ensure a dict is returned. 
diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index a9d4bbc5b8600..44464d5ce5dd0 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -144,11 +144,12 @@ def test_astype_datetime64tz(self): tm.rands(1000)]), Series([string.digits * 10, tm.rands(63), - tm.rands(64), '', 1.0])]) + tm.rands(64), nan, 1.0])]) def test_astype_str_map(self, dtype, series): # see gh-4405 result = series.astype(dtype) expected = series.map(compat.text_type) + expected.replace('nan', np.nan, inplace=True) # see gh-20377 tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", [str, compat.text_type]) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index ec97ccba493b7..0b0d4334c86a3 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -162,16 +162,6 @@ def test_to_csv_compression(self, compression): index_col=0, squeeze=True)) - def test_from_csv_dtype_str(self): - # GH20377 - s = Series([1, 2, np.nan, 4], index=['A', 'B', 'C', 'D'], - name='X') - with ensure_clean() as filename: - s.to_csv(filename, header=True) - rs = pd.read_csv(filename, dtype=str) - expected = Series(['1.0', '2.0', '', '4.0'], name=s.name) - assert_series_equal(rs.X, expected) - class TestSeriesIO(TestData): From 69f6c95ba7a368b6b91f2b4d77050821f1d746fa Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Mon, 2 Apr 2018 01:38:31 +0100 Subject: [PATCH 25/33] DOC: updated IO section (#20377) --- doc/source/whatsnew/v0.23.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 1ac09a0302a0c..3ab4fd7303041 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1098,7 +1098,7 @@ I/O - Bug in :func:`read_pickle` when unpickling objects with :class:`TimedeltaIndex` or :class:`Float64Index` created with pandas prior to version 0.20 (:issue:`19939`) - 
Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values are NoneType (:issue:`20030`) - Bug in ``usecols`` parameter in :func:`pandas.io.read_csv` and :func:`pandas.io.read_table` where error is not raised correctly when passing a string. (:issue:`20529`) -- Bug in :func:`read_excel` and :func:`read_csv` where ``np.nan`` turns to ``'nan'`` with ``dtype=str`` and ``na_filter=True``. Now, it turns to ``np.nan``. (:issue `20377`) +- Bug in :func:`read_excel` and :func:`read_csv` where missing values turned to ``'nan'`` with ``dtype=str`` and ``na_filter=True``. Now, they turn to ``np.nan``. (:issue `20377`) Plotting From 97a345a6640782096f6398fbeea5929017b711b4 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Mon, 2 Apr 2018 01:45:05 +0100 Subject: [PATCH 26/33] TST: pep8 (#20377) --- pandas/tests/io/parser/na_values.py | 8 ++++++-- pandas/tests/io/test_excel.py | 3 +-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index 627f6584fd81f..6cbc8cd752d50 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -377,7 +377,9 @@ def test_na_values_with_dtype_str_and_na_filter_true(self): out = self.read_csv(StringIO(data), na_filter=True, dtype=str) # missing data turn to np.nan, which stays as it is after dtype=str - expected = DataFrame({"a": ["1", "4"], "b": [np.nan, "5"], "c": ["3", "6"]}) + expected = DataFrame({"a": ["1", "4"], + "b": [np.nan, "5"], + "c": ["3", "6"]}) tm.assert_frame_equal(out, expected) def test_na_values_with_dtype_str_and_na_filter_false(self): @@ -387,5 +389,7 @@ def test_na_values_with_dtype_str_and_na_filter_false(self): out = self.read_csv(StringIO(data), na_filter=False, dtype=str) # missing data turn to empty string - expected = DataFrame({"a": ["1", "4"], "b": ["", "5"], "c": ["3", "6"]}) + expected = DataFrame({"a": ["1", "4"], + "b": ["", "5"], + 
"c": ["3", "6"]}) tm.assert_frame_equal(out, expected) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 23b3719f469e5..749a2bd441550 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -365,8 +365,7 @@ def test_reader_dtype_str(self, ext): basename = 'testdtype' actual = self.get_exceldf(basename, ext) - expected = DataFrame({ - 'a': [1.0, 2.0, np.nan, 4.0]}) + expected = DataFrame({'a': [1.0, 2.0, np.nan, 4.0]}) tm.assert_frame_equal(actual, expected) From 8b2fb0bf8355a60178998a4cecddd5ec5eb5dcf0 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Mon, 2 Apr 2018 01:35:20 +0100 Subject: [PATCH 27/33] TXT: Moved test from series.test_io to io.parser.na_values. Corrected tests for np.nan (#20377) TST: pep8 (#20377) TST: Correction in a test (#20377) --- pandas/tests/io/parser/na_values.py | 24 ++++++++++++++++++++++++ pandas/tests/io/test_excel.py | 5 +---- pandas/tests/series/test_dtypes.py | 3 ++- pandas/tests/series/test_io.py | 10 ---------- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index d2c3f82e95c4d..6cbc8cd752d50 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -369,3 +369,27 @@ def test_no_na_filter_on_index(self): expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index([np.nan, 5.0], name="b")) tm.assert_frame_equal(out, expected) + + def test_na_values_with_dtype_str_and_na_filter_true(self): + # see gh-20377 + data = "a,b,c\n1,,3\n4,5,6" + + out = self.read_csv(StringIO(data), na_filter=True, dtype=str) + + # missing data turn to np.nan, which stays as it is after dtype=str + expected = DataFrame({"a": ["1", "4"], + "b": [np.nan, "5"], + "c": ["3", "6"]}) + tm.assert_frame_equal(out, expected) + + def test_na_values_with_dtype_str_and_na_filter_false(self): + # see gh-20377 + data = "a,b,c\n1,,3\n4,5,6" + + out = self.read_csv(StringIO(data), 
na_filter=False, dtype=str) + + # missing data turn to empty string + expected = DataFrame({"a": ["1", "4"], + "b": ["", "5"], + "c": ["3", "6"]}) + tm.assert_frame_equal(out, expected) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 4ebbd94f5a552..11359e5ef642d 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -383,12 +383,9 @@ def test_reader_dtype_str(self, ext): expected['a'] = expected['a'].astype('float64') expected['b'] = expected['b'].astype('float32') expected['c'] = ['001', '002', '003', '004'] - expected['d'] = ['1', '2', '', '4'] + expected['d'] = ['1', '2', np.nan, '4'] tm.assert_frame_equal(actual, expected) - with pytest.raises(ValueError): - actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'}) - def test_reading_all_sheets(self, ext): # Test reading all sheetnames by setting sheetname to None, # Ensure a dict is returned. diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index a9d4bbc5b8600..44464d5ce5dd0 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -144,11 +144,12 @@ def test_astype_datetime64tz(self): tm.rands(1000)]), Series([string.digits * 10, tm.rands(63), - tm.rands(64), '', 1.0])]) + tm.rands(64), nan, 1.0])]) def test_astype_str_map(self, dtype, series): # see gh-4405 result = series.astype(dtype) expected = series.map(compat.text_type) + expected.replace('nan', np.nan, inplace=True) # see gh-20377 tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", [str, compat.text_type]) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index ec97ccba493b7..0b0d4334c86a3 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -162,16 +162,6 @@ def test_to_csv_compression(self, compression): index_col=0, squeeze=True)) - def test_from_csv_dtype_str(self): - # GH20377 - s = Series([1, 2, np.nan, 4], index=['A', 'B', 'C', 
'D'], - name='X') - with ensure_clean() as filename: - s.to_csv(filename, header=True) - rs = pd.read_csv(filename, dtype=str) - expected = Series(['1.0', '2.0', '', '4.0'], name=s.name) - assert_series_equal(rs.X, expected) - class TestSeriesIO(TestData): From c9f5120a182cefd8b141d0ddbb3ff470bd40b8e1 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Mon, 2 Apr 2018 01:38:31 +0100 Subject: [PATCH 28/33] DOC: updated IO section (#20377) --- doc/source/whatsnew/v0.23.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 1ac09a0302a0c..3ab4fd7303041 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1098,7 +1098,7 @@ I/O - Bug in :func:`read_pickle` when unpickling objects with :class:`TimedeltaIndex` or :class:`Float64Index` created with pandas prior to version 0.20 (:issue:`19939`) - Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values are NoneType (:issue:`20030`) - Bug in ``usecols`` parameter in :func:`pandas.io.read_csv` and :func:`pandas.io.read_table` where error is not raised correctly when passing a string. (:issue:`20529`) -- Bug in :func:`read_excel` and :func:`read_csv` where ``np.nan`` turns to ``'nan'`` with ``dtype=str`` and ``na_filter=True``. Now, it turns to ``np.nan``. (:issue `20377`) +- Bug in :func:`read_excel` and :func:`read_csv` where missing values turned to ``'nan'`` with ``dtype=str`` and ``na_filter=True``. Now, they turn to ``np.nan``. 
(:issue `20377`) Plotting From 571d5c473b0a9d082426f99a0165f285d071f91e Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Mon, 2 Apr 2018 03:53:54 +0100 Subject: [PATCH 29/33] pep8 correction --- pandas/_libs/lib.pyx | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 51adf361cf022..e27074d1cffe0 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -466,7 +466,10 @@ cpdef ndarray[object] astype_unicode(ndarray arr): # we can use the unsafe version because we know `result` is mutable # since it was created from `np.empty` arr_i = arr[i] - util.set_value_at_unsafe(result, i, unicode(arr_i) if arr_i is not np.nan else np.nan) + util.set_value_at_unsafe( + result, + i, + unicode(arr_i) if arr_i is not np.nan else np.nan) return result @@ -480,7 +483,10 @@ cpdef ndarray[object] astype_str(ndarray arr): # we can use the unsafe version because we know `result` is mutable # since it was created from `np.empty` arr_i = arr[i] - util.set_value_at_unsafe(result, i, str(arr_i) if arr_i is not np.nan else np.nan) + util.set_value_at_unsafe( + result, + i, + str(arr_i) if arr_i is not np.nan else np.nan) return result From 47bc10578aae39268cde013c301ea30c1e7df064 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Fri, 6 Apr 2018 00:27:57 +0100 Subject: [PATCH 30/33] DOC: Better explanation (#20377) --- doc/source/whatsnew/v0.23.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 3ab4fd7303041..8829f3fc5bc83 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1098,7 +1098,7 @@ I/O - Bug in :func:`read_pickle` when unpickling objects with :class:`TimedeltaIndex` or :class:`Float64Index` created with pandas prior to version 0.20 (:issue:`19939`) - Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values 
are NoneType (:issue:`20030`) - Bug in ``usecols`` parameter in :func:`pandas.io.read_csv` and :func:`pandas.io.read_table` where error is not raised correctly when passing a string. (:issue:`20529`) -- Bug in :func:`read_excel` and :func:`read_csv` where missing values turned to ``'nan'`` with ``dtype=str`` and ``na_filter=True``. Now, they turn to ``np.nan``. (:issue `20377`) +- Bug in :func:`read_excel` and :func:`read_csv` where missing values turned to ``'nan'`` with ``dtype=str`` and ``na_filter=True``. Now, these missing values are converted to the string missing indicator, ``np.nan``. (:issue:`20377`) Plotting From 3740dfedccc872509c89cedb3f345f1d6b18093f Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Fri, 6 Apr 2018 00:37:57 +0100 Subject: [PATCH 31/33] BUG: use checknull (#20377) --- pandas/_libs/lib.pyx | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e27074d1cffe0..540fc55a2f818 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -461,6 +461,7 @@ cpdef ndarray[object] astype_unicode(ndarray arr): cdef: Py_ssize_t i, n = arr.size ndarray[object] result = np.empty(n, dtype=object) + object arr_i for i in range(n): # we can use the unsafe version because we know `result` is mutable @@ -469,7 +470,8 @@ cpdef ndarray[object] astype_unicode(ndarray arr): util.set_value_at_unsafe( result, i, - unicode(arr_i) if arr_i is not np.nan else np.nan) + unicode(arr_i) if not checknull(arr_i) else np.nan + ) return result @@ -478,6 +480,7 @@ cpdef ndarray[object] astype_str(ndarray arr): cdef: Py_ssize_t i, n = arr.size ndarray[object] result = np.empty(n, dtype=object) + object arr_i for i in range(n): # we can use the unsafe version because we know `result` is mutable @@ -486,7 +489,8 @@ cpdef ndarray[object] astype_str(ndarray arr): util.set_value_at_unsafe( result, i, - str(arr_i) if arr_i is not np.nan else np.nan) + str(arr_i) if not checknull(arr_i) else np.nan + ) 
return result From 7d453bb16bbf9d604108a6c81e25a7a6beae7e85 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Sun, 8 Apr 2018 12:51:22 +0100 Subject: [PATCH 32/33] TST: update tests (#20377) --- pandas/tests/frame/test_dtypes.py | 2 +- pandas/tests/series/test_dtypes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 152159965036d..fb0dd4a0e343a 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -529,7 +529,7 @@ def test_astype_str(self): # consistency in astype(str) for tt in set([str, compat.text_type]): result = DataFrame([np.NaN]).astype(tt) - expected = DataFrame(['nan']) + expected = DataFrame([np.NaN], dtype=object) assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(tt) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 44464d5ce5dd0..19894673e07c8 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -149,7 +149,7 @@ def test_astype_str_map(self, dtype, series): # see gh-4405 result = series.astype(dtype) expected = series.map(compat.text_type) - expected.replace('nan', np.nan, inplace=True) # see gh-20377 + expected = expected.replace('nan', np.nan) # see gh-20377 tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", [str, compat.text_type]) From bcd739daea439070467c3e49664698ee6c5cd4e9 Mon Sep 17 00:00:00 2001 From: Nikos Karagiannakis Date: Sun, 8 Apr 2018 12:54:26 +0100 Subject: [PATCH 33/33] BUG: string nans to np.nan in Series for list data (#20377) --- pandas/core/series.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/series.py b/pandas/core/series.py index 1d6f770d92795..9131aa446396a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4153,4 +4153,8 @@ def _try_cast(arr, take_fast_path): data = np.array(data, dtype=dtype, copy=False) 
subarr = np.array(data, dtype=object, copy=copy) + # GH 20377 + # Turn all 'nan' to np.nan + subarr[subarr == 'nan'] = np.nan + return subarr