From 3e0b352e3c79de9b2b1bdd69bbcbc0ec5e82456d Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 16 Mar 2018 08:05:16 +0900 Subject: [PATCH 1/5] Make constructing slices lazily. --- doc/whats-new.rst | 4 ++++ xarray/core/rolling.py | 25 ++++++------------------- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 40e206aaa86..1a252b15fd0 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -37,6 +37,10 @@ Documentation Enhancements ~~~~~~~~~~~~ + - Some speed improvement to construct :py:class:`~xarray.DataArrayRolling` + object (:issue:`1993`) + By `Keisuke Fujii <https://github.com/fujiisoup>`_. + Bug fixes ~~~~~~~~~ diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index fb09c9e0df3..6f65be51a0c 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -151,34 +151,21 @@ def __init__(self, obj, min_periods=None, center=False, **windows): """ super(DataArrayRolling, self).__init__(obj, min_periods=min_periods, center=center, **windows) - self.window_indices = None - self.window_labels = None - self._setup_windows() + self.window_labels = self.obj[self.dim] + self._stops = np.arange(len(self.window_labels)) + 1 + self._starts = np.maximum(self._stops - int(self.window), 0) def __iter__(self): - for (label, indices) in zip(self.window_labels, self.window_indices): - window = self.obj.isel(**{self.dim: indices}) + for (label, start, stop) in zip(self.window_labels, self._starts, + self._stops): + window = self.obj.isel(**{self.dim: slice(start, stop)}) counts = window.count(dim=self.dim) window = window.where(counts >= self._min_periods) yield (label, window) - def _setup_windows(self): - """ - Find the indices and labels for each window - """ - self.window_labels = self.obj[self.dim] - window = int(self.window) - dim_size = self.obj[self.dim].size - - stops = np.arange(dim_size) + 1 - starts = np.maximum(stops - window, 0) - - self.window_indices = [slice(start, stop) - for start, stop in zip(starts, stops)] 
- def construct(self, window_dim, stride=1, fill_value=dtypes.NA): """ Convert this rolling object to xr.DataArray, From ef35ae6d013bd0673d83f12c927bd6d7bf81eafb Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 16 Mar 2018 08:14:27 +0900 Subject: [PATCH 2/5] Additional speedup --- xarray/core/rolling.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 6f65be51a0c..efb67297dfe 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -153,8 +153,9 @@ def __init__(self, obj, min_periods=None, center=False, **windows): center=center, **windows) self.window_labels = self.obj[self.dim] - self._stops = np.arange(len(self.window_labels)) + 1 - self._starts = np.maximum(self._stops - int(self.window), 0) + self._stops = np.arange(1, len(self.window_labels) + 1) + self._starts = self._stops - int(self.window) + self._starts[:int(self.window)] = 0 def __iter__(self): for (label, start, stop) in zip(self.window_labels, self._starts, From 151adfb121339b9333eebba6bf414f941592ab4d Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 16 Mar 2018 09:12:58 +0900 Subject: [PATCH 3/5] Move some lines in DataArrayRolling into __iter__. Added a benchmark for long arrays. --- asv_bench/benchmarks/rolling.py | 20 +++++++++++++++++--- xarray/core/rolling.py | 9 ++++----- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 52814ad3481..fd01b0e23f2 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -8,27 +8,40 @@ from . 
import parameterized, randn, requires_dask nx = 3000 +long_nx = 30000200 ny = 2000 nt = 1000 window = 20 +randn_xy = randn((nx, ny), frac_nan=0.1) +randn_xt = randn((nx, nt)) +randn_t = randn((nt, )) +randn_long = randn((long_nx, ), frac_nan=0.1) + class Rolling(object): def setup(self, *args, **kwargs): self.ds = xr.Dataset( - {'var1': (('x', 'y'), randn((nx, ny), frac_nan=0.1)), - 'var2': (('x', 't'), randn((nx, nt))), - 'var3': (('t', ), randn(nt))}, + {'var1': (('x', 'y'), randn_xy), + 'var2': (('x', 't'), randn_xt), + 'var3': (('t', ), randn_t)}, coords={'x': np.arange(nx), 'y': np.linspace(0, 1, ny), 't': pd.date_range('1970-01-01', periods=nt, freq='D'), 'x_coords': ('x', np.linspace(1.1, 2.1, nx))}) + self.da_long = xr.DataArray(randn_long, dims='x', + coords={'x': np.arange(long_nx) * 0.1}) @parameterized(['func', 'center'], (['mean', 'count'], [True, False])) def time_rolling(self, func, center): getattr(self.ds.rolling(x=window, center=center), func)() + @parameterized(['func', 'center'], + (['mean', 'count'], [True, False])) + def time_rolling_long(self, func, center): + getattr(self.da_long.rolling(x=window, center=center), func)() + @parameterized(['window_', 'min_periods'], ([20, 40], [5, None])) def time_rolling_np(self, window_, min_periods): @@ -47,3 +60,4 @@ def setup(self, *args, **kwargs): requires_dask() super(RollingDask, self).setup(**kwargs) self.ds = self.ds.chunk({'x': 100, 'y': 50, 't': 50}) + self.da_long = self.da_long.chunk({'x': 100, 'y': 50, 't': 50}) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index efb67297dfe..046f2874a56 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -153,13 +153,12 @@ def __init__(self, obj, min_periods=None, center=False, **windows): center=center, **windows) self.window_labels = self.obj[self.dim] - self._stops = np.arange(1, len(self.window_labels) + 1) - self._starts = self._stops - int(self.window) - self._starts[:int(self.window)] = 0 def __iter__(self): - for (label, 
start, stop) in zip(self.window_labels, self._starts, - self._stops): + _stops = np.arange(1, len(self.window_labels) + 1) + _starts = _stops - int(self.window) + _starts[:int(self.window)] = 0 + for (label, start, stop) in zip(self.window_labels, _starts, _stops): window = self.obj.isel(**{self.dim: slice(start, stop)}) counts = window.count(dim=self.dim) From 5e290d26527251490a64d5b4f9dfe334eedd7046 Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Fri, 16 Mar 2018 09:33:28 +0900 Subject: [PATCH 4/5] Bugfix in benchmark --- asv_bench/benchmarks/rolling.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index fd01b0e23f2..3f2a38104de 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -8,7 +8,7 @@ from . import parameterized, randn, requires_dask nx = 3000 -long_nx = 30000200 +long_nx = 30000000 ny = 2000 nt = 1000 window = 20 @@ -37,10 +37,14 @@ def setup(self, *args, **kwargs): def time_rolling(self, func, center): getattr(self.ds.rolling(x=window, center=center), func)() - @parameterized(['func', 'center'], + @parameterized(['func', 'pandas'], (['mean', 'count'], [True, False])) - def time_rolling_long(self, func, center): - getattr(self.da_long.rolling(x=window, center=center), func)() + def time_rolling_long(self, func, pandas): + if pandas: + se = self.da_long.to_series() + getattr(se.rolling(window=window), func)() + else: + getattr(self.da_long.rolling(x=window), func)() @parameterized(['window_', 'min_periods'], ([20, 40], [5, None])) @@ -60,4 +64,4 @@ def setup(self, *args, **kwargs): requires_dask() super(RollingDask, self).setup(**kwargs) self.ds = self.ds.chunk({'x': 100, 'y': 50, 't': 50}) - self.da_long = self.da_long.chunk({'x': 100, 'y': 50, 't': 50}) + self.da_long = self.da_long.chunk({'x': 10000}) From 3f8aad371b8b06ebe2e620952954e6568b345fb2 Mon Sep 17 00:00:00 2001 From: fujiisoup Date: Sun, 18 Mar 2018 
16:48:25 +0900 Subject: [PATCH 5/5] remove underscores. --- xarray/core/rolling.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 046f2874a56..079c60f35a7 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -155,10 +155,10 @@ def __init__(self, obj, min_periods=None, center=False, **windows): self.window_labels = self.obj[self.dim] def __iter__(self): - _stops = np.arange(1, len(self.window_labels) + 1) - _starts = _stops - int(self.window) - _starts[:int(self.window)] = 0 - for (label, start, stop) in zip(self.window_labels, _starts, _stops): + stops = np.arange(1, len(self.window_labels) + 1) + starts = stops - int(self.window) + starts[:int(self.window)] = 0 + for (label, start, stop) in zip(self.window_labels, starts, stops): window = self.obj.isel(**{self.dim: slice(start, stop)}) counts = window.count(dim=self.dim)