Commit de3f755

Avoid schema enforcement from meta on Arrow data in P2P shuffling (#8235)
1 parent 310d2f0 commit de3f755
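
In short: read_from_disk no longer rebuilds each shard against a schema derived from the DataFrame meta (pyarrow_schema_dispatch(meta, preserve_index=True)); it keeps the schema of the Arrow tables read back from disk. A meta-derived schema can disagree with the data actually written, notably for object-dtype columns holding numpy arrays (the case the new test covers). A minimal illustration of such a mismatch, using plain pandas/pyarrow conversions as a stand-in for the dask dispatch rather than the code path touched here:

import numpy as np
import pandas as pd
import pyarrow as pa

# meta: empty frame with the same dtypes as the real data (object column "b") ...
meta = pd.DataFrame({"b": pd.Series([], dtype=object)})
# ... while a real partition holds numpy arrays inside that object column.
df = pd.DataFrame({"b": [np.asarray([1, 2, 3]), np.asarray([4, 5, 6])]})

schema_from_meta = pa.Schema.from_pandas(meta, preserve_index=True)
schema_from_data = pa.Table.from_pandas(df, preserve_index=True).schema

# The empty object column and the populated one are inferred as different
# Arrow types, so forcing the meta-derived schema onto the real shards can
# fail; keeping each table's own schema sidesteps this.
print(schema_from_meta.field("b").type)  # e.g. null
print(schema_from_data.field("b").type)  # e.g. list<item: int64>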

3 files changed: +41 -16 lines changed

distributed/shuffle/_arrow.py

Lines changed: 6 additions & 9 deletions
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING
 
 from packaging.version import parse
 
@@ -80,15 +80,12 @@ def deserialize_table(buffer: bytes) -> pa.Table:
     return reader.read_all()
 
 
-def read_from_disk(path: Path, meta: pd.DataFrame) -> tuple[Any, int]:
+def read_from_disk(path: Path) -> tuple[list[pa.Table], int]:
     import pyarrow as pa
 
-    from dask.dataframe.dispatch import pyarrow_schema_dispatch
-
     batch_size = parse_bytes("1 MiB")
     batch = []
     shards = []
-    schema = pyarrow_schema_dispatch(meta, preserve_index=True)
 
     with pa.OSFile(str(path), mode="rb") as f:
         size = f.seek(0, whence=2)
@@ -103,17 +100,17 @@ def read_from_disk(path: Path, meta: pd.DataFrame) -> tuple[Any, int]:
 
             if offset - prev >= batch_size:
                 table = pa.concat_tables(batch)
-                shards.append(_copy_table(table, schema))
+                shards.append(_copy_table(table))
                 batch = []
                 prev = offset
         if batch:
             table = pa.concat_tables(batch)
-            shards.append(_copy_table(table, schema))
+            shards.append(_copy_table(table))
     return shards, size
 
 
-def _copy_table(table: pa.Table, schema: pa.Schema) -> pa.Table:
+def _copy_table(table: pa.Table) -> pa.Table:
     import pyarrow as pa
 
     arrs = [pa.concat_arrays(column.chunks) for column in table.columns]
-    return pa.table(data=arrs, schema=schema)
+    return pa.table(data=arrs, schema=table.schema)
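
For reference, the new _copy_table defragments each column and rebuilds the table with its own schema rather than one passed in from meta. A small standalone sketch of that behavior (pyarrow only; the example table is illustrative):

import pyarrow as pa

def _copy_table(table: pa.Table) -> pa.Table:
    # Collapse each (possibly chunked) column into one contiguous array and
    # rebuild the table, reusing the table's own schema.
    arrs = [pa.concat_arrays(column.chunks) for column in table.columns]
    return pa.table(data=arrs, schema=table.schema)

t = pa.concat_tables(
    [pa.table({"a": [1, 2]}), pa.table({"a": [3]})]
)  # column "a" now has two chunks
copied = _copy_table(t)
assert copied.schema == t.schema           # schema preserved as-is
assert copied.column("a").num_chunks == 1  # chunks were consolidated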

distributed/shuffle/_shuffle.py

Lines changed: 2 additions & 2 deletions
@@ -499,8 +499,8 @@ def _(partition_id: int, meta: pd.DataFrame) -> pd.DataFrame:
     def _get_assigned_worker(self, id: int) -> str:
         return self.worker_for[id]
 
-    def read(self, path: Path) -> tuple[Any, int]:
-        return read_from_disk(path, self.meta)
+    def read(self, path: Path) -> tuple[pa.Table, int]:
+        return read_from_disk(path)
 
 
 @dataclass(frozen=True)

distributed/shuffle/tests/test_shuffle.py

Lines changed: 33 additions & 5 deletions
@@ -1129,7 +1129,7 @@ def __init__(self, value: int) -> None:
 
     out = {}
     for k in range(npartitions):
-        shards, _ = read_from_disk(tmp_path / str(k), meta)
+        shards, _ = read_from_disk(tmp_path / str(k))
         out[k] = convert_shards(shards, meta)
 
     shuffled_df = pd.concat(df for df in out.values())
@@ -2100,7 +2100,7 @@ async def test_replace_stale_shuffle(c, s, a, b):
 
 
 @gen_cluster(client=True)
-async def test_handle_null_partitions_p2p_shuffling(c, s, *workers):
+async def test_handle_null_partitions_p2p_shuffling(c, s, a, b):
     data = [
         {"companies": [], "id": "a", "x": None},
         {"companies": [{"id": 3}, {"id": 5}], "id": "b", "x": None},
@@ -2113,8 +2113,8 @@ async def test_handle_null_partitions_p2p_shuffling(c, s, *workers):
     result = await c.compute(ddf)
     dd.assert_eq(result, df)
 
-    await c.close()
-    await asyncio.gather(*[check_worker_cleanup(w) for w in workers])
+    await check_worker_cleanup(a)
+    await check_worker_cleanup(b)
     await check_scheduler_cleanup(s)
 
 
@@ -2133,7 +2133,35 @@ def make_partition(i):
     result = await result
     expected = await expected
     dd.assert_eq(result, expected)
-    del result
+
+    await check_worker_cleanup(a)
+    await check_worker_cleanup(b)
+    await check_scheduler_cleanup(s)
+
+
+@gen_cluster(client=True)
+async def test_handle_object_columns_p2p(c, s, a, b):
+    with dask.config.set({"dataframe.convert-string": False}):
+        df = pd.DataFrame(
+            {
+                "a": [1, 2, 3],
+                "b": [
+                    np.asarray([1, 2, 3]),
+                    np.asarray([4, 5, 6]),
+                    np.asarray([7, 8, 9]),
+                ],
+                "c": ["foo", "bar", "baz"],
+            }
+        )
+
+        ddf = dd.from_pandas(
+            df,
+            npartitions=2,
+        )
+        shuffled = ddf.shuffle(on="a")
+
+        result = await c.compute(shuffled)
+        dd.assert_eq(result, df)
 
     await check_worker_cleanup(a)
     await check_worker_cleanup(b)