From 3f079083796e7323dd7951fec94ab57274d9da3a Mon Sep 17 00:00:00 2001 From: Sven Date: Tue, 5 Dec 2017 13:47:42 +1100 Subject: [PATCH 1/8] Fixed #18413, but test case not passing * Handle all-NaN columns differently when building metadata for categorical axes on saving hdf5 file * Categorical axes fail test case comparison due to type difference (even though there isn't a visible type difference) --- pandas/io/pytables.py | 15 +++++++++++---- pandas/tests/io/test_pytables.py | 19 +++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 2a66aea88f6d9..2e7dd7bad4ee3 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2137,10 +2137,17 @@ def convert(self, values, nan_rep, encoding): # if we have stored a NaN in the categories # then strip it; in theory we could have BOTH # -1s in the codes and nulls :< - mask = isna(categories) - if mask.any(): - categories = categories[~mask] - codes[codes != -1] -= mask.astype(int).cumsum().values + if categories is None: + # Handle case of NaN-only categorical columns in which case + # the categories are an empty array; when this is stored, + # pytables cannot write a zero-len array, so on readback + # the categories would be None and `read_hdf()` would fail. 
+ categories = [] + else: + mask = isna(categories) + if mask.any(): + categories = categories[~mask] + codes[codes != -1] -= mask.astype(int).cumsum().values self.data = Categorical.from_codes(codes, categories=categories, diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 5e5fc6e7eac62..868ea243b1d08 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -4927,6 +4927,25 @@ def test_categorical_conversion(self): result = read_hdf(path, 'df', where='obsids=B') tm.assert_frame_equal(result, expected) + def test_categorical_nan_only_columns(self): + # GH18413 + # Check that read_hdf with categorical columns with NaN-only values can + # be read back. + df = pd.DataFrame({ + 'a': ['a', 'b', 'c', np.nan], + 'b': [np.nan, np.nan, np.nan, np.nan], + 'c': [1, 2, 3, 4] + }) + df['a'] = df.a.astype('category') + df['b'] = df.b.astype('category') + expected = df.copy() + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', format='table', data_columns=True) + result = read_hdf(path, 'df') + print 'result', result.dtypes + print 'expected', expected.dtypes + tm.assert_frame_equal(result, expected) + def test_duplicate_column_name(self): df = DataFrame(columns=["a", "a"], data=[[0, 0]]) From 7a4f93a093ee7573eed5ced4f64e747d65e15bae Mon Sep 17 00:00:00 2001 From: Sven Date: Wed, 6 Dec 2017 11:40:20 +1100 Subject: [PATCH 2/8] Fixed test case (#18413) * Change empty category to `Index([], dtype=np.float64)` instead of `[]`. * Remove printouts in test case. 
--- pandas/io/pytables.py | 2 +- pandas/tests/io/test_pytables.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 2e7dd7bad4ee3..568540389958b 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2142,7 +2142,7 @@ def convert(self, values, nan_rep, encoding): # the categories are an empty array; when this is stored, # pytables cannot write a zero-len array, so on readback # the categories would be None and `read_hdf()` would fail. - categories = [] + categories = Index([], dtype=np.float64) else: mask = isna(categories) if mask.any(): diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 868ea243b1d08..d70966af176d1 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -4942,8 +4942,6 @@ def test_categorical_nan_only_columns(self): with ensure_clean_path(self.path) as path: df.to_hdf(path, 'df', format='table', data_columns=True) result = read_hdf(path, 'df') - print 'result', result.dtypes - print 'expected', expected.dtypes tm.assert_frame_equal(result, expected) def test_duplicate_column_name(self): From 8540a4afd71edaf5a3ff1bf7653037d6a60c8c12 Mon Sep 17 00:00:00 2001 From: Sven Date: Wed, 6 Dec 2017 11:48:06 +1100 Subject: [PATCH 3/8] Removed trailing whitespace --- pandas/io/pytables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 568540389958b..97b57fb9d5e43 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2147,7 +2147,7 @@ def convert(self, values, nan_rep, encoding): mask = isna(categories) if mask.any(): categories = categories[~mask] - codes[codes != -1] -= mask.astype(int).cumsum().values + codes[codes != -1] -= mask.astype(int).cumsum().values self.data = Categorical.from_codes(codes, categories=categories, From 928c258edaa06c2b94e7e8132aab5fe757a73baf Mon Sep 17 00:00:00 2001 From: Sven Date: Wed, 6 Dec 2017 
12:39:05 +1100 Subject: [PATCH 4/8] Removed unnecessary dataframe copy --- pandas/tests/io/test_pytables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index d70966af176d1..b00464240db36 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -4938,7 +4938,7 @@ def test_categorical_nan_only_columns(self): }) df['a'] = df.a.astype('category') df['b'] = df.b.astype('category') - expected = df.copy() + expected = df with ensure_clean_path(self.path) as path: df.to_hdf(path, 'df', format='table', data_columns=True) result = read_hdf(path, 'df') From b3766f7e49979b4df2204cdf257a8295bd5b854f Mon Sep 17 00:00:00 2001 From: Sven Date: Wed, 6 Dec 2017 12:44:23 +1100 Subject: [PATCH 5/8] Update whatsnew section --- doc/source/whatsnew/v0.22.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 52ca05d9a76a9..af50ebf79649f 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -157,7 +157,7 @@ I/O - Bug in :func:`read_msgpack` with a non existent file is passed in Python 2 (:issue:`15296`) - Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`) - Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. 
Now it gives an ``EmptyDataError`` (:issue:`18184`) -- +- Bug when storing NaN-only categorical columns in hdf5 store (:issue:`18413`) - Plotting From 52dc141109005b7458fdff8c1699ae00ea0924fa Mon Sep 17 00:00:00 2001 From: Sven Date: Wed, 6 Dec 2017 13:07:29 +1100 Subject: [PATCH 6/8] Moved whatsnew entry to 0.21.1 --- doc/source/whatsnew/v0.21.1.txt | 1 + doc/source/whatsnew/v0.22.0.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index e307e605687bf..881babcf2c243 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -89,6 +89,7 @@ I/O - Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`). - Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`) - Bug in :func:`read_json` not decoding when reading line deliminted JSON from S3 (:issue:`17200`) +- Bug when storing NaN-only categorical columns in hdf5 store (:issue:`18413`) Plotting ^^^^^^^^ diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index af50ebf79649f..52ca05d9a76a9 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -157,7 +157,7 @@ I/O - Bug in :func:`read_msgpack` with a non existent file is passed in Python 2 (:issue:`15296`) - Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`) - Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. 
Now it gives an ``EmptyDataError`` (:issue:`18184`) -- Bug when storing NaN-only categorical columns in hdf5 store (:issue:`18413`) +- - Plotting From e6aad40a35c1adfe87af5f96a57a92d23b4b2624 Mon Sep 17 00:00:00 2001 From: Sven Date: Fri, 8 Dec 2017 21:08:11 +1100 Subject: [PATCH 7/8] Addressed requested changes * Added additional all-None Series * Provided more detail in whatsnew description --- .gitignore | 1 + doc/source/whatsnew/v0.21.1.txt | 1 + pandas/tests/io/test_pytables.py | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ff0a6aef47163..00004f262e84c 100644 --- a/.gitignore +++ b/.gitignore @@ -106,3 +106,4 @@ doc/build/html/index.html doc/tmp.sv doc/source/styled.xlsx doc/source/templates/ +env/ \ No newline at end of file diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index a80de0a560fac..3b283df9302e8 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -92,6 +92,7 @@ I/O - Bug in :func:`read_json` not decoding when reading line deliminted JSON from S3 (:issue:`17200`) - Bug in :func:`pandas.io.json.json_normalize` to avoid modification of ``meta`` (:issue:`18610`) - Bug when storing NaN-only categorical columns in hdf5 store (:issue:`18413`) +- Bug when reading NaN-only categorical columns in :class:`HDFStore` (:issue:`18413`) Plotting ^^^^^^^^ diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index a3442421761d0..d69f3cb293ad8 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -4935,10 +4935,12 @@ def test_categorical_nan_only_columns(self): df = pd.DataFrame({ 'a': ['a', 'b', 'c', np.nan], 'b': [np.nan, np.nan, np.nan, np.nan], - 'c': [1, 2, 3, 4] + 'c': [1, 2, 3, 4], + 'd': pd.Series([None]* 4, dtype=object) }) df['a'] = df.a.astype('category') df['b'] = df.b.astype('category') + df['d'] = df.b.astype('category') expected = df with ensure_clean_path(self.path) as path: 
df.to_hdf(path, 'df', format='table', data_columns=True) From b2ac7c4d534975eeac49be0ddbfa7e8a3eb89602 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 10 Dec 2017 10:54:25 -0500 Subject: [PATCH 8/8] lint fixes --- .gitignore | 2 +- pandas/tests/io/test_pytables.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 00004f262e84c..b1748ae72b8ba 100644 --- a/.gitignore +++ b/.gitignore @@ -106,4 +106,4 @@ doc/build/html/index.html doc/tmp.sv doc/source/styled.xlsx doc/source/templates/ -env/ \ No newline at end of file +env/ diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 54dc41e390b31..85f24e794f12a 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -4936,7 +4936,7 @@ def test_categorical_nan_only_columns(self): 'a': ['a', 'b', 'c', np.nan], 'b': [np.nan, np.nan, np.nan, np.nan], 'c': [1, 2, 3, 4], - 'd': pd.Series([None]* 4, dtype=object) + 'd': pd.Series([None] * 4, dtype=object) }) df['a'] = df.a.astype('category') df['b'] = df.b.astype('category')