Skip to content

Commit 25d9a28

Browse files
dsgreen2, pre-commit-ci[bot], Illviljan, dcherian
authored
Implement DataArray.to_dask_dataframe() (pydata#7635)
* Add feature to convert dataarray to dask dataframe. This is for the issue pydata#7409 * Add test for new method dataarray.to_dask_dataframe() * Changes after review * Corrections in docstring and import * docstring correction * Remove name parameter * Add feature to convert dataarray to dask dataframe. This is for the issue pydata#7409 * Add test for new method dataarray.to_dask_dataframe() * Changes after review * Corrections in docstring and import * docstring correction * Remove name parameter * Corrected doc/whats-new.rst * Update whats-new.rst * Space corrections in docstring * Whitespace correction in docstring * Add white space in docstring line * Whitespace correction * Update line npartitions=1 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Revert "Update line npartitions=1" This reverts commit 4bae82c. Reverting commit . * Add whitespace in npartitions=1 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Change example in docstring * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Change example in docstring * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update xarray/core/dataarray.py Co-authored-by: Illviljan <[email protected]> * Update doc/whats-new.rst Co-authored-by: Illviljan <[email protected]> * Add name check * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add test for unnamed dataarray. 
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove scalar array test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Change error message * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update xarray/tests/test_dataarray.py * Update whats-new.rst * Update doc/whats-new.rst --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Illviljan <[email protected]> Co-authored-by: Deepak Cherian <[email protected]>
1 parent 087ebbb commit 25d9a28

File tree

4 files changed

+105
-0
lines changed

4 files changed

+105
-0
lines changed

doc/api.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -632,6 +632,7 @@ DataArray methods
632632
DataArray.from_iris
633633
DataArray.from_series
634634
DataArray.to_cdms2
635+
DataArray.to_dask_dataframe
635636
DataArray.to_dataframe
636637
DataArray.to_dataset
637638
DataArray.to_dict

doc/whats-new.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ v2023.05.0 (unreleased)
2222

2323
New Features
2424
~~~~~~~~~~~~
25+
- Added new method :py:meth:`DataArray.to_dask_dataframe` to convert a DataArray into a dask DataFrame (:issue:`7409`).
26+
By `Deeksha <https://github.com/dsgreen2>`_.
2527
- Add support for lshift and rshift binary operators (`<<`, `>>`) on
2628
:py:class:`xr.DataArray` of type :py:class:`int` (:issue:`7727` , :pull:`7741`).
2729
By `Alan Brammer <https://github.com/abrammer>`_.

xarray/core/dataarray.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@
5757

5858
from numpy.typing import ArrayLike
5959

60+
try:
61+
from dask.dataframe import DataFrame as DaskDataFrame
62+
except ImportError:
63+
DaskDataFrame = None # type: ignore
6064
try:
6165
from dask.delayed import Delayed
6266
except ImportError:
@@ -6888,6 +6892,71 @@ def resample(
68886892
**indexer_kwargs,
68896893
)
68906894

6895+
def to_dask_dataframe(
    self,
    dim_order: Sequence[Hashable] | None = None,
    set_index: bool = False,
) -> DaskDataFrame:
    """Convert this array into a dask.dataframe.DataFrame.

    The array is first wrapped in a single-variable dataset keyed by
    ``self.name``; the conversion itself is delegated to that dataset's
    ``to_dask_dataframe`` method.

    Parameters
    ----------
    dim_order : Sequence of Hashable or None, optional
        Hierarchical dimension order for the resulting dataframe.
        Array content is transposed to this order and then written out as flat
        vectors in contiguous order, so the last dimension in this list
        will be contiguous in the resulting DataFrame. This has a major influence
        on which operations are efficient on the resulting dask dataframe.
    set_index : bool, default: False
        If set_index=True, the dask DataFrame is indexed by this array's
        coordinate. Since dask DataFrames do not support multi-indexes,
        set_index only works if the array only contains one dimension.

    Returns
    -------
    dask.dataframe.DataFrame

    Raises
    ------
    ValueError
        If this array is unnamed (``self.name is None``); the name is needed
        to key the data variable in the intermediate dataset.

    Examples
    --------
    >>> da = xr.DataArray(
    ...     np.arange(4 * 2 * 2).reshape(4, 2, 2),
    ...     dims=("time", "lat", "lon"),
    ...     coords={
    ...         "time": np.arange(4),
    ...         "lat": [-30, -20],
    ...         "lon": [120, 130],
    ...     },
    ...     name="eg_dataarray",
    ...     attrs={"units": "Celsius", "description": "Random temperature data"},
    ... )
    >>> da.to_dask_dataframe(["lat", "lon", "time"]).compute()
        lat  lon  time  eg_dataarray
    0   -30  120     0             0
    1   -30  120     1             4
    2   -30  120     2             8
    3   -30  120     3            12
    4   -30  130     0             1
    5   -30  130     1             5
    6   -30  130     2             9
    7   -30  130     3            13
    8   -20  120     0             2
    9   -20  120     1             6
    10  -20  120     2            10
    11  -20  120     3            14
    12  -20  130     0             3
    13  -20  130     1             7
    14  -20  130     2            11
    15  -20  130     3            15
    """
    # Read the name once so the check and the dataset key use the same value.
    name = self.name
    if name is None:
        raise ValueError(
            "Cannot convert an unnamed DataArray to a "
            "dask dataframe : use the ``.rename`` method to assign a name."
        )
    # NOTE(review): the array's own name is used as the dataset key;
    # presumably this fails when the name equals a coordinate name -- confirm.
    ds = self._to_dataset_whole(name, shallow_copy=False)
    return ds.to_dask_dataframe(dim_order, set_index)
6959+
68916960
# this needs to be at the end, or mypy will confuse with `str`
68926961
# https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names
68936962
str = utils.UncachedAccessor(StringAccessor["DataArray"])

xarray/tests/test_dataarray.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3245,6 +3245,39 @@ def test_to_dataframe_0length(self) -> None:
32453245
assert len(actual) == 0
32463246
assert_array_equal(actual.index.names, list("ABC"))
32473247

3248+
@requires_dask
def test_to_dask_dataframe(self) -> None:
    """Exercise DataArray.to_dask_dataframe: values, dim order, coords, errors."""
    values = np.arange(3 * 4).reshape(3, 4)
    arr = DataArray(values, [("B", [1, 2, 3]), ("A", list("cdef"))], name="foo")

    # With the default dimension order the "foo" column must match the
    # flattened pandas Series of the same array.
    series = arr.to_series()
    column = arr.to_dask_dataframe()["foo"]
    assert_array_equal(column.values, series.values)

    # An explicit dim_order transposes the data before flattening.
    column = arr.to_dask_dataframe(dim_order=["A", "B"])["foo"]
    assert_array_equal(values.transpose().reshape(-1), column.values)

    # regression test for coords with different dimensions
    arr.coords["C"] = ("B", [-1, -2, -3])
    wanted = arr.to_series().to_frame()
    wanted["C"] = [-1] * 4 + [-2] * 4 + [-3] * 4
    wanted = wanted[["C", "foo"]]
    frame = arr.to_dask_dataframe()[["C", "foo"]]

    assert_array_equal(wanted.values, frame.values)
    assert_array_equal(wanted.columns.values, frame.columns.values)

    # "C" is a coordinate, not a dimension, so it is rejected in dim_order.
    with pytest.raises(ValueError, match="does not match the set of dimensions"):
        arr.to_dask_dataframe(dim_order=["B", "A", "C"])

    # An unnamed array cannot be converted.
    arr.name = None
    with pytest.raises(ValueError, match="Cannot convert an unnamed DataArray"):
        arr.to_dask_dataframe()
3280+
32483281
def test_to_pandas_name_matches_coordinate(self) -> None:
32493282
# coordinate with same name as array
32503283
arr = DataArray([1, 2, 3], dims="x", name="x")

0 commit comments

Comments
 (0)