[Review] Add fast path for multi-column sorting (#229)

quasiben · charlesbluca · web-flow · commit dcf93ee30cce · 2021-10-07T15:27:01.000-07:00
* add fast path for multi-column sorting

* lint

* Prevent single column Dask dataframes from calling sort_values

* Wrap dask_cudf import in try/except block

* Add test for fast multi column sort

* Move multi_col_sort contents to apply_sort

* Ignore index for dask-cudf sorting

* Fix show tables test for cudf enabled fixture

* Trigger CI

* Add single partition sort case

* Return cudf sorted dataframe without persisting

* Update nan sort test to reflect Pandas' sort_values ordering

* Add comments tracking relevant [dask-]cudf issues

* Move GPU sorting tests to test_sort.py

* Remove unnecessary isin import

Co-authored-by: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com>
diff --git a/dask_sql/physical/utils/sort.py b/dask_sql/physical/utils/sort.py
@@ -2,8 +2,14 @@
 
 import dask.dataframe as dd
 import pandas as pd
+from dask.utils import M
 
-from dask_sql.utils import make_pickable_without_dask_sql, new_temporary_column
+from dask_sql.utils import make_pickable_without_dask_sql
+
+try:
+    import dask_cudf
+except ImportError:
+    dask_cudf = None
 
 
 def apply_sort(
@@ -12,6 +18,46 @@ def apply_sort(
     sort_ascending: List[bool],
     sort_null_first: List[bool],
 ) -> dd.DataFrame:
+    # if we have a single partition, we can sometimes sort with map_partitions
+    if df.npartitions == 1:
+        if dask_cudf is not None and isinstance(df, dask_cudf.DataFrame):
+            # cudf only supports null positioning if `ascending` is a single boolean:
+            # https://github.com/rapidsai/cudf/issues/9400
+            if (all(sort_ascending) or not any(sort_ascending)) and not any(
+                sort_null_first[1:]
+            ):
+                return df.map_partitions(
+                    M.sort_values,
+                    by=sort_columns,
+                    ascending=all(sort_ascending),
+                    na_position="first" if sort_null_first[0] else "last",
+                )
+            if not any(sort_null_first):
+                return df.map_partitions(
+                    M.sort_values, by=sort_columns, ascending=sort_ascending
+                )
+        elif not any(sort_null_first[1:]):
+            return df.map_partitions(
+                M.sort_values,
+                by=sort_columns,
+                ascending=sort_ascending,
+                na_position="first" if sort_null_first[0] else "last",
+            )
+
+    # dask-cudf only supports ascending sort / nulls last:
+    # https://github.com/rapidsai/cudf/pull/9250
+    # https://github.com/rapidsai/cudf/pull/9264
+    if (
+        dask_cudf is not None
+        and isinstance(df, dask_cudf.DataFrame)
+        and all(sort_ascending)
+        and not any(sort_null_first)
+    ):
+        try:
+            return df.sort_values(sort_columns, ignore_index=True)
+        except ValueError:
+            pass
+
     # Split the first column. We need to handle this one with set_index
     first_sort_column = sort_columns[0]
     first_sort_ascending = sort_ascending[0]
diff --git a/tests/integration/fixtures.py b/tests/integration/fixtures.py
@@ -9,6 +9,11 @@
 from dask.distributed import Client
 from pandas.testing import assert_frame_equal
 
+try:
+    import cudf
+except ImportError:
+    cudf = None
+
 
 @pytest.fixture()
 def timeseries_df(c):
@@ -86,6 +91,21 @@ def datetime_table():
     )
 
 
+@pytest.fixture()
+def gpu_user_table_1(user_table_1):
+    return cudf.from_pandas(user_table_1) if cudf else None
+
+
+@pytest.fixture()
+def gpu_df(df):
+    return cudf.from_pandas(df) if cudf else None
+
+
+@pytest.fixture()
+def gpu_long_table(long_table):
+    return cudf.from_pandas(long_table) if cudf else None
+
+
 @pytest.fixture()
 def c(
     df_simple,
@@ -97,6 +117,9 @@ def c(
     user_table_nan,
     string_table,
     datetime_table,
+    gpu_user_table_1,
+    gpu_df,
+    gpu_long_table,
 ):
     dfs = {
         "df_simple": df_simple,
@@ -108,13 +131,18 @@ def c(
         "user_table_nan": user_table_nan,
         "string_table": string_table,
         "datetime_table": datetime_table,
+        "gpu_user_table_1": gpu_user_table_1,
+        "gpu_df": gpu_df,
+        "gpu_long_table": gpu_long_table,
     }
 
     # Lazy import, otherwise the pytest framework has problems
     from dask_sql.context import Context
 
     c = Context()
     for df_name, df in dfs.items():
+        if df is None:
+            continue
         dask_df = dd.from_pandas(df, npartitions=3)
         c.create_table(df_name, dask_df)
 
diff --git a/tests/integration/test_show.py b/tests/integration/test_show.py
@@ -2,6 +2,11 @@
 import pytest
 from pandas.testing import assert_frame_equal
 
+try:
+    import cudf
+except ImportError:
+    cudf = None
+
 
 def test_schemas(c):
     df = c.sql("SHOW SCHEMAS")
@@ -36,6 +41,21 @@ def test_tables(c):
                 "string_table",
                 "datetime_table",
             ]
+            if cudf is None
+            else [
+                "df",
+                "df_simple",
+                "user_table_1",
+                "user_table_2",
+                "long_table",
+                "user_table_inf",
+                "user_table_nan",
+                "string_table",
+                "datetime_table",
+                "gpu_user_table_1",
+                "gpu_df",
+                "gpu_long_table",
+            ]
         }
     )
 
diff --git a/tests/integration/test_sort.py b/tests/integration/test_sort.py