From c6d55e2ad7c69bdf4456d6aef99900f16ac13357 Mon Sep 17 00:00:00 2001 From: Jean-Mathieu Deschenes Date: Mon, 17 Jul 2017 11:30:38 -0400 Subject: [PATCH 1/2] Fix for #16836 * Removed unnecessary conversion to i8 * Fixed failed test (`test_frame_column_inplace_sort_exception`) * Added check to ensure that the test is performing its intended goal(`test_sort_nan`) --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/frame.py | 7 +------ pandas/tests/frame/test_sorting.py | 27 ++++++++++++++++++++++++++- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 2259eb7d89534..3b24177a4f086 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -180,6 +180,7 @@ Bug Fixes - Fixes regression in 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`) - Fixes bug where indexing with ``np.inf`` caused an ``OverflowError`` to be raised (:issue:`16957`) +- Fixes regression when sorting by multiple columns on a datetime array with NaT values (:issue:`16836`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9a79ca1d4eab1..85b3f27410508 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3336,18 +3336,13 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, if len(by) > 1: from pandas.core.sorting import lexsort_indexer - def trans(v): - if needs_i8_conversion(v): - return v.view('i8') - return v - keys = [] for x in by: k = self.xs(x, axis=other_axis).values if k.ndim == 2: raise ValueError('Cannot sort by duplicate column %s' % str(x)) - keys.append(trans(k)) + keys.append(k) indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position) indexer = _ensure_platform_int(indexer) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 891c94b59074a..cbf5cdb381234 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -89,6 +89,22 @@ def test_sort_values(self): with tm.assert_raises_regex(ValueError, msg): frame.sort_values(by=['A', 'B'], axis=0, ascending=[True] * 5) + # GH 16836 + + d1 = [Timestamp(x) for x in ['2016-01-01', '2015-01-01', + np.nan, '2016-01-01']] + d2 = [Timestamp(x) for x in ['2017-01-01', '2014-01-01', + '2016-01-01', '2015-01-01']] + df = pd.DataFrame({'a': d1, 'b': d2}, index=[0, 1, 2, 3]) + + d3 = [Timestamp(x) for x in ['2015-01-01', '2016-01-01', + '2016-01-01', np.nan]] + d4 = [Timestamp(x) for x in ['2014-01-01', '2015-01-01', + '2017-01-01', '2016-01-01']] + expected = pd.DataFrame({'a': d3, 'b': d4}, index=[1, 3, 0, 2]) + sorted_df = df.sort_values(by=['a', 'b'], ) + tm.assert_frame_equal(sorted_df, expected) + def test_sort_values_inplace(self): frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], columns=['A', 'B', 'C', 'D']) @@ -269,6 +285,11 @@ def test_sort_datetimes(self): df2 = df.sort_values(by=['B']) assert_frame_equal(df1, df2) + df1 = df.sort_values(by='B') + + df2 = df.sort_values(by=['C', 'B']) + assert_frame_equal(df1, df2) + def test_frame_column_inplace_sort_exception(self): s = self.frame['A'] with tm.assert_raises_regex(ValueError, "This Series is a view"): @@ -321,7 +342,11 @@ def test_sort_nat_values_in_int_column(self): assert_frame_equal(df_sorted, df_reversed) df_sorted = df.sort_values(["datetime", "float"], na_position="last") - assert_frame_equal(df_sorted, df_reversed) + assert_frame_equal(df_sorted, df) + + # Ascending should not affect the results. + df_sorted = df.sort_values(["datetime", "float"], ascending=False) + assert_frame_equal(df_sorted, df) class TestDataFrameSortIndexKinds(TestData): From 257e10a436b6bac642b3e9ffddee0669c544637a Mon Sep 17 00:00:00 2001 From: Jean-Mathieu Deschenes Date: Tue, 18 Jul 2017 13:57:54 -0400 Subject: [PATCH 2/2] Changes requested by @jreback --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/tests/frame/test_sorting.py | 32 +++++++++++++++--------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 3b24177a4f086..668b0c2b5c3b7 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -180,7 +180,6 @@ Bug Fixes - Fixes regression in 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`) - Fixes bug where indexing with ``np.inf`` caused an ``OverflowError`` to be raised (:issue:`16957`) -- Fixes regression when sorting by multiple columns on a datetime array with NaT values (:issue:`16836`) Conversion ^^^^^^^^^^ @@ -223,6 +222,7 @@ Sparse Reshaping ^^^^^^^^^ - Joining/Merging with a non unique ``PeriodIndex`` raised a TypeError (:issue:`16871`) +- Fixes regression when sorting by multiple columns on a ``datetime64`` dtype ``Series`` with ``NaT`` values (:issue:`16836`) Numeric diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index cbf5cdb381234..c356ed11e9328 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -89,22 +89,6 @@ def test_sort_values(self): with tm.assert_raises_regex(ValueError, msg): frame.sort_values(by=['A', 'B'], axis=0, ascending=[True] * 5) - # GH 16836 - - d1 = [Timestamp(x) for x in ['2016-01-01', '2015-01-01', - np.nan, '2016-01-01']] - d2 = [Timestamp(x) for x in ['2017-01-01', '2014-01-01', - '2016-01-01', '2015-01-01']] - df = pd.DataFrame({'a': d1, 'b': d2}, index=[0, 1, 2, 3]) - - d3 = [Timestamp(x) for x in ['2015-01-01', '2016-01-01', - '2016-01-01', np.nan]] - d4 = [Timestamp(x) for x in ['2014-01-01', '2015-01-01', - '2017-01-01', '2016-01-01']] - expected = pd.DataFrame({'a': d3, 'b': d4}, index=[1, 3, 0, 2]) - sorted_df = df.sort_values(by=['a', 'b'], ) - tm.assert_frame_equal(sorted_df, expected) - def test_sort_values_inplace(self): frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], columns=['A', 'B', 'C', 'D']) @@ -348,6 +332,22 @@ def test_sort_nat_values_in_int_column(self): df_sorted = df.sort_values(["datetime", "float"], ascending=False) assert_frame_equal(df_sorted, df) + # GH 16836 + + d1 = [Timestamp(x) for x in ['2016-01-01', '2015-01-01', + np.nan, '2016-01-01']] + d2 = [Timestamp(x) for x in ['2017-01-01', '2014-01-01', + '2016-01-01', '2015-01-01']] + df = pd.DataFrame({'a': d1, 'b': d2}, index=[0, 1, 2, 3]) + + d3 = [Timestamp(x) for x in ['2015-01-01', '2016-01-01', + '2016-01-01', np.nan]] + d4 = [Timestamp(x) for x in ['2014-01-01', '2015-01-01', + '2017-01-01', '2016-01-01']] + expected = pd.DataFrame({'a': d3, 'b': d4}, index=[1, 3, 0, 2]) + sorted_df = df.sort_values(by=['a', 'b'], ) + tm.assert_frame_equal(sorted_df, expected) + class TestDataFrameSortIndexKinds(TestData):