Skip to content

PERF: sort_values speedup for multiple columns with random numeric values #19237

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 54 additions & 2 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import numpy as np
import pandas.util.testing as tm
from pandas import (DataFrame, Series, MultiIndex, date_range, period_range,
isnull, NaT)
isnull, NaT, timedelta_range)

from .pandas_vb_common import setup # noqa

Expand Down Expand Up @@ -440,18 +440,70 @@ def time_frame_xs(self, axis):
self.df.xs(self.N / 2, axis=axis)


class SortValuesMultipleColumns(object):
    """Benchmark ``DataFrame.sort_values`` keyed on several columns at once.

    Each entry of ``params`` is a ``'|'``-separated list of column names;
    the timed method selects those columns from ``self.df`` and sorts by
    all of them. The two parametrizations compare a string key paired with
    an integer key of the same cardinality against two integer keys of
    that same cardinality.
    """

    goal_time = 0.1

    param_names = ['columns']
    params = ['repeated_strings|int_same_cardinality_as_repeated_strings',
              'int_same_cardinality_as_repeated_strings|'
              'int_same_cardinality_as_repeated_strings_copy']

    def setup(self, columns):
        """Build the one-million-row frame holding every candidate column.

        Parameters
        ----------
        columns : str
            The current benchmark parameter. It is not used here: the same
            frame is built for every parametrization.
        """
        N = 1000000

        self.df = DataFrame(
            {'repeated_strings': Series(tm.makeStringIndex(100).take(
                np.random.randint(0, 100, size=N))),
             'less_repeated_strings': Series(
                 tm.makeStringIndex(10000).take(
                     np.random.randint(0, 10000, size=N))),
             'float': np.random.randn(N),
             'int_sorted': np.arange(N),
             'int_random': np.random.randint(0, 10000000, N),
             'date': date_range('20110101', freq='s', periods=N),
             'timedelta': timedelta_range('1 day', freq='s', periods=N),
             })
        self.df['repeated_category'] = \
            self.df['repeated_strings'].astype('category')
        self.df['less_repeated_category'] = \
            self.df['less_repeated_strings'].astype('category')

        # Dense ranks give one distinct value per distinct string, so the
        # derived column has the same cardinality as its source. ``rank``
        # produces floats; cast so the ``int_*`` columns really are
        # integers, as their names promise.
        repeated_codes = self.df['repeated_strings'].rank(
            method='dense').astype('int64')
        less_repeated_codes = self.df['less_repeated_strings'].rank(
            method='dense').astype('int64')

        self.df['int_same_cardinality_as_repeated_strings'] = repeated_codes
        # Independent copy of the same values, used as the second sort key.
        self.df['int_same_cardinality_as_repeated_strings_copy'] = \
            repeated_codes.copy()
        self.df['int_same_cardinality_as_less_repeated_strings'] = \
            less_repeated_codes

        # Sanity checks on the fixture: the integer stand-ins must have
        # exactly as many distinct values as the string columns they mirror.
        assert self.df['repeated_strings'].nunique() == \
            self.df['int_same_cardinality_as_repeated_strings'].nunique()
        assert self.df['less_repeated_strings'].nunique() == \
            self.df['int_same_cardinality_as_less_repeated_strings']\
            .nunique()

    def time_frame_sort_values_by_multiple_columns(self, columns):
        """Time sorting the selected columns by all of those columns.

        Parameters
        ----------
        columns : str
            ``'|'``-separated column names to select and sort by.
        """
        columns_list = columns.split('|')
        # Selecting with a list already yields a DataFrame.
        self.df[columns_list].sort_values(by=columns_list)


class SortValues(object):
    """Benchmark ``DataFrame.sort_values`` on columns of random floats.

    Parametrized over the ``ascending`` flag passed to ``sort_values``.
    """

    goal_time = 0.2
    params = [True, False]
    param_names = ['ascending']

    def setup(self, ascending):
        """Build a 1,000,000 x 5 frame of random floats, columns 'A'-'E'.

        Parameters
        ----------
        ascending : bool
            The current benchmark parameter. It is not used here: the same
            frame shape is built for both values.
        """
        # Build the frame once, at the shape the timed methods need.
        self.df = DataFrame(np.random.randn(1000000, 5),
                            columns=list('ABCDE'))

    def time_frame_sort_values(self, ascending):
        """Time sorting by the single column 'A'."""
        self.df.sort_values(by='A', ascending=ascending)

    def time_frame_sort_values_two_columns(self, ascending):
        """Time sorting by all five columns 'A' through 'E'.

        NOTE: despite the ``two_columns`` suffix this sorts by five keys;
        the method name is left unchanged because it is the benchmark's
        identifier.
        """
        self.df.sort_values(by=['A', 'B', 'C', 'D', 'E'],
                            ascending=ascending)


class SortIndexByColumns(object):

Expand Down
53 changes: 41 additions & 12 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3637,16 +3637,45 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False,
raise ValueError('Length of ascending (%d) != length of by (%d)' %
(len(ascending), len(by)))
if len(by) > 1:
from pandas.core.sorting import lexsort_indexer
if any([is_object_dtype(self._get_label_or_level_values(
x, axis=axis, stacklevel=stacklevel)) for x in by]):
from pandas.core.sorting import lexsort_indexer

keys = []
for x in by:
k = self._get_label_or_level_values(x, axis=axis,
stacklevel=stacklevel)
keys.append(k)
indexer = lexsort_indexer(keys, orders=ascending,
na_position=na_position)
indexer = _ensure_platform_int(indexer)

new_data = self._data.take(indexer,
axis=self._get_block_manager_axis(
axis),
verify=False)
else:
if not is_list_like(ascending):
ascending = [ascending] * len(by)
ascending = ascending[::-1]
new_data = self
from pandas.core.sorting import nargsort
kind = 'mergesort'

for i, by_step in enumerate(by[::-1]):
k = self._get_label_or_level_values(
by_step, axis=axis, stacklevel=stacklevel)

indexer = nargsort(k, kind=kind, ascending=ascending[i],
na_position=na_position)

new_data = new_data._data.take(
indexer,
axis=self._get_block_manager_axis(axis),
convert=False, verify=False)

new_data = self._constructor(new_data).__finalize__(self)

keys = []
for x in by:
k = self._get_label_or_level_values(x, axis=axis,
stacklevel=stacklevel)
keys.append(k)
indexer = lexsort_indexer(keys, orders=ascending,
na_position=na_position)
indexer = _ensure_platform_int(indexer)
else:
from pandas.core.sorting import nargsort

Expand All @@ -3660,9 +3689,9 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False,
indexer = nargsort(k, kind=kind, ascending=ascending,
na_position=na_position)

new_data = self._data.take(indexer,
axis=self._get_block_manager_axis(axis),
verify=False)
new_data = self._data.take(indexer,
axis=self._get_block_manager_axis(axis),
verify=False)

if inplace:
return self._update_inplace(new_data)
Expand Down