diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index 4cecf12a27042..4bdf033d6d5c6 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -2,7 +2,7 @@
 import numpy as np
 import pandas.util.testing as tm
 from pandas import (DataFrame, Series, MultiIndex, date_range, period_range,
-                    isnull, NaT)
+                    isnull, NaT, timedelta_range)
 
 from .pandas_vb_common import setup  # noqa
 
@@ -440,6 +440,55 @@ def time_frame_xs(self, axis):
         self.df.xs(self.N / 2, axis=axis)
 
 
+class SortValuesMultipleColumns(object):
+    goal_time = 0.1
+
+    param_names = ['columns']
+    # params = generate_column_combinations(
+    #     column_names=['less_repeated_strings',
+    #                   'repeated_category', 'less_repeated_category',
+    #                   'float', 'int_sorted', 'int_random',
+    #                   'date', 'timedelta'], r=[2, 5], num=20)
+    params = ['repeated_strings|int_same_cardinality_as_repeated_strings',
+              'int_same_cardinality_as_repeated_strings|'
+              'int_same_cardinality_as_repeated_strings_copy']
+
+    def setup(self, columns):
+        N = 1000000
+
+        self.df = DataFrame(
+            {'repeated_strings': Series(tm.makeStringIndex(100).take(
+                np.random.randint(0, 100, size=N))),
+             'less_repeated_strings': Series(
+                 tm.makeStringIndex(10000).take(
+                     np.random.randint(0, 10000, size=N))),
+             'float': np.random.randn(N),
+             'int_sorted': np.arange(N),
+             'int_random': np.random.randint(0, 10000000, N),
+             'date': date_range('20110101', freq='s', periods=N),
+             'timedelta': timedelta_range('1 day', freq='s', periods=N),
+             })
+        self.df['repeated_category'] = \
+            self.df['repeated_strings'].astype('category')
+        self.df['less_repeated_category'] = \
+            self.df['less_repeated_strings'].astype('category')
+        self.df['int_same_cardinality_as_repeated_strings'] = \
+            self.df['repeated_strings'].rank(method='dense')
+        self.df['int_same_cardinality_as_repeated_strings_copy'] = \
+            self.df['repeated_strings'].rank(method='dense')
+        self.df['int_same_cardinality_as_less_repeated_strings'] = \
+            self.df['less_repeated_strings'].rank(method='dense')
+        assert self.df['repeated_strings'].nunique() == \
+            self.df['int_same_cardinality_as_repeated_strings'].nunique()
+        assert self.df['less_repeated_strings'].nunique() == \
+            self.df['int_same_cardinality_as_less_repeated_strings']\
+            .nunique()
+
+    def time_frame_sort_values_by_multiple_columns(self, columns):
+        columns_list = columns.split('|')
+        DataFrame(self.df[columns_list]).sort_values(by=columns_list)
+
+
 class SortValues(object):
 
     goal_time = 0.2
@@ -447,11 +496,14 @@ class SortValues(object):
     param_names = ['ascending']
 
     def setup(self, ascending):
-        self.df = DataFrame(np.random.randn(1000000, 2), columns=list('AB'))
+        self.df = DataFrame(np.random.randn(1000000, 5), columns=list('ABCDE'))
 
     def time_frame_sort_values(self, ascending):
         self.df.sort_values(by='A', ascending=ascending)
 
+    def time_frame_sort_values_two_columns(self, ascending):
+        self.df.sort_values(by=['A', 'B', 'C', 'D', 'E'], ascending=ascending)
+
 
 class SortIndexByColumns(object):
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 4d907180da00a..e699552aa9674 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3637,16 +3637,50 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False,
             raise ValueError('Length of ascending (%d) != length of by (%d)' %
                              (len(ascending), len(by)))
         if len(by) > 1:
-            from pandas.core.sorting import lexsort_indexer
+            if any([is_object_dtype(self._get_label_or_level_values(
+                    x, axis=axis, stacklevel=stacklevel)) for x in by]):
+                from pandas.core.sorting import lexsort_indexer
+
+                keys = []
+                for x in by:
+                    k = self._get_label_or_level_values(x, axis=axis,
+                                                        stacklevel=stacklevel)
+                    keys.append(k)
+                indexer = lexsort_indexer(keys, orders=ascending,
+                                          na_position=na_position)
+                indexer = _ensure_platform_int(indexer)
+
+                new_data = self._data.take(indexer,
+                                           axis=self._get_block_manager_axis(
+                                               axis),
+                                           verify=False)
+            else:
+                if not is_list_like(ascending):
+                    ascending = [ascending] * len(by)
+                ascending = ascending[::-1]
+                new_data = self
+                from pandas.core.sorting import nargsort
+                kind = 'mergesort'
+
+                # Sort by one key at a time, least significant key first.
+                # Each pass must be stable so that earlier (less
+                # significant) orderings survive as tie-breakers.
+                for i, by_step in enumerate(by[::-1]):
+                    # Read the key from the frame as reordered so far: the
+                    # indexer below is applied to ``new_data``, not ``self``.
+                    k = new_data._get_label_or_level_values(
+                        by_step, axis=axis, stacklevel=stacklevel)
+
+                    indexer = nargsort(k, kind=kind, ascending=ascending[i],
+                                       na_position=na_position)
+
+                    new_data = new_data._data.take(
+                        indexer,
+                        axis=self._get_block_manager_axis(axis),
+                        convert=False, verify=False)
+
+                    new_data = self._constructor(new_data).__finalize__(self)
 
-            keys = []
-            for x in by:
-                k = self._get_label_or_level_values(x, axis=axis,
-                                                    stacklevel=stacklevel)
-                keys.append(k)
-            indexer = lexsort_indexer(keys, orders=ascending,
-                                      na_position=na_position)
-            indexer = _ensure_platform_int(indexer)
         else:
             from pandas.core.sorting import nargsort
 
@@ -3660,9 +3694,9 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False,
             indexer = nargsort(k, kind=kind, ascending=ascending,
                                na_position=na_position)
 
-        new_data = self._data.take(indexer,
-                                   axis=self._get_block_manager_axis(axis),
-                                   verify=False)
+            new_data = self._data.take(indexer,
+                                       axis=self._get_block_manager_axis(axis),
+                                       verify=False)
 
         if inplace:
             return self._update_inplace(new_data)