From ee2c034bd3910abe03cd5ddbc1058116ae95c56a Mon Sep 17 00:00:00 2001 From: simon Date: Mon, 28 May 2018 15:33:13 -0700 Subject: [PATCH 01/19] pct change bug issue 21200 --- pandas/core/groupby/groupby.py | 9 +++++-- pandas/tests/groupby/test_transform.py | 35 +++++++++++++++++++------- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index df7a5dc9dc173..5d4b9012a42bf 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2070,7 +2070,7 @@ def shift(self, periods=1, freq=None, axis=0): def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, axis=0): """Calcuate pct_change of each value to previous entry in group""" - if freq is not None or axis != 0: + if (freq is not None or axis != 0) or not self.grouper.is_monotonic: return self.apply(lambda x: x.pct_change(periods=periods, fill_method=fill_method, limit=limit, freq=freq, @@ -3942,7 +3942,12 @@ def _apply_to_column_groupbys(self, func): return func(self) def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None): - """Calculate percent change of each value to previous entry in group""" + """Calcuate pct_change of each value to previous entry in group""" + if not self.grouper.is_monotonic: + return self.apply(lambda x: x.pct_change(periods=periods, + fill_method=fill_method, + limit=limit, freq=freq)) + filled = getattr(self, fill_method)(limit=limit) shifted = filled.shift(periods=periods, freq=freq) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 626057c1ea760..2ff95e03e605d 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -722,19 +722,29 @@ def interweave(list_obj): @pytest.mark.parametrize("test_series", [True, False]) +@pytest.mark.parametrize("shuffle", [True, False]) @pytest.mark.parametrize("periods,fill_method,limit", [ (1, 'ffill', None), (1, 'ffill', 1), (1, 'bfill', None), (1, 'bfill', 1), (-1, 'ffill', None), (-1, 'ffill', 1), (-1, 'bfill', None), (-1, 'bfill', 1)]) -def test_pct_change(test_series, periods, fill_method, limit): +def test_pct_change(test_series, shuffle, periods, fill_method, limit): + # Groupby pct change uses an apply if monotonic and a vectorized operation if non-monotonic + # Shuffle parameter tests each vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan] - exp_vals = Series(vals).pct_change(periods=periods, - fill_method=fill_method, - limit=limit).tolist() - - df = DataFrame({'key': ['a'] * len(vals) + ['b'] * len(vals), + keys = ['a', 'b'] + df = DataFrame({'key': [k for j in list(map(lambda x: [x] * len(vals), keys)) for k in j], 'vals': vals * 2}) + if shuffle: + df = df.reindex(np.random.permutation(len(df))).reset_index(drop=True) + + manual_apply = [] + for k in keys: + manual_apply.append(Series(df.loc[df.key == k, 'vals'].values).pct_change(periods=periods, + fill_method=fill_method, + limit=limit)) + exp_vals = pd.concat(manual_apply).reset_index(drop=True) + exp = pd.DataFrame(exp_vals, columns=['_pct_change']) grp = df.groupby('key') def get_result(grp_obj): @@ -742,15 +752,22 @@ def get_result(grp_obj): fill_method=fill_method, limit=limit) + # Specifically test when monotonic and not monotonic + if test_series: - exp = pd.Series(exp_vals * 2) - exp.name = 'vals' + exp = exp.loc[:, '_pct_change'] grp = grp['vals'] result = get_result(grp) + # Resort order by keys to compare to expected values + df.insert(0, '_pct_change', result) + result = 
df.sort_values(by='key') + result = result.loc[:, '_pct_change'] + result = result.reset_index(drop=True) tm.assert_series_equal(result, exp) else: - exp = DataFrame({'vals': exp_vals * 2}) result = get_result(grp) + result.reset_index(drop=True, inplace=True) + result.columns = ['_pct_change'] tm.assert_frame_equal(result, exp) From 47b5908deeb19079c89196cadcf3565771627083 Mon Sep 17 00:00:00 2001 From: simon Date: Mon, 28 May 2018 15:33:13 -0700 Subject: [PATCH 02/19] add whatsnew --- doc/source/whatsnew/v0.23.1.txt | 1 + pandas/core/groupby/groupby.py | 9 +++++-- pandas/tests/groupby/test_transform.py | 35 +++++++++++++++++++------- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index f204fce3a525f..39097b7e8f5c5 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -50,6 +50,7 @@ Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`) +- Bug in `DataFrame.pct_change() and `Series.pct_change() where percent change on non-monotonic groups were calculated in a vectorized way (:issue:`21200`) Strings ^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index df7a5dc9dc173..5d4b9012a42bf 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2070,7 +2070,7 @@ def shift(self, periods=1, freq=None, axis=0): def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, axis=0): """Calcuate pct_change of each value to previous entry in group""" - if freq is not None or axis != 0: + if (freq is not None or axis != 0) or not self.grouper.is_monotonic: return self.apply(lambda x: x.pct_change(periods=periods, fill_method=fill_method, limit=limit, freq=freq, @@ -3942,7 +3942,12 @@ def _apply_to_column_groupbys(self, func): return func(self) def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None): - """Calculate percent change of each value to previous entry in group""" + """Calcuate pct_change of each value to previous entry in group""" + if not self.grouper.is_monotonic: + return self.apply(lambda x: x.pct_change(periods=periods, + fill_method=fill_method, + limit=limit, freq=freq)) + filled = getattr(self, fill_method)(limit=limit) shifted = filled.shift(periods=periods, freq=freq) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 626057c1ea760..2ff95e03e605d 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -722,19 +722,29 @@ def interweave(list_obj): @pytest.mark.parametrize("test_series", [True, False]) +@pytest.mark.parametrize("shuffle", [True, False]) @pytest.mark.parametrize("periods,fill_method,limit", [ (1, 'ffill', None), (1, 'ffill', 1), (1, 'bfill', None), (1, 'bfill', 1), (-1, 'ffill', None), (-1, 'ffill', 1), (-1, 'bfill', None), (-1, 'bfill', 1)]) -def test_pct_change(test_series, periods, fill_method, limit): +def test_pct_change(test_series, shuffle, periods, fill_method, limit): + # Groupby pct change uses an apply if monotonic and a vectorized operation if non-monotonic + # Shuffle parameter tests each vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan] - exp_vals = Series(vals).pct_change(periods=periods, - fill_method=fill_method, - limit=limit).tolist() - - df = DataFrame({'key': ['a'] * len(vals) + ['b'] * 
len(vals), + keys = ['a', 'b'] + df = DataFrame({'key': [k for j in list(map(lambda x: [x] * len(vals), keys)) for k in j], 'vals': vals * 2}) + if shuffle: + df = df.reindex(np.random.permutation(len(df))).reset_index(drop=True) + + manual_apply = [] + for k in keys: + manual_apply.append(Series(df.loc[df.key == k, 'vals'].values).pct_change(periods=periods, + fill_method=fill_method, + limit=limit)) + exp_vals = pd.concat(manual_apply).reset_index(drop=True) + exp = pd.DataFrame(exp_vals, columns=['_pct_change']) grp = df.groupby('key') def get_result(grp_obj): @@ -742,15 +752,22 @@ def get_result(grp_obj): fill_method=fill_method, limit=limit) + # Specifically test when monotonic and not monotonic + if test_series: - exp = pd.Series(exp_vals * 2) - exp.name = 'vals' + exp = exp.loc[:, '_pct_change'] grp = grp['vals'] result = get_result(grp) + # Resort order by keys to compare to expected values + df.insert(0, '_pct_change', result) + result = df.sort_values(by='key') + result = result.loc[:, '_pct_change'] + result = result.reset_index(drop=True) tm.assert_series_equal(result, exp) else: - exp = DataFrame({'vals': exp_vals * 2}) result = get_result(grp) + result.reset_index(drop=True, inplace=True) + result.columns = ['_pct_change'] tm.assert_frame_equal(result, exp) From 41beb4aaa2cca7ca4990a51a777351a8f7a2fddb Mon Sep 17 00:00:00 2001 From: simon Date: Mon, 28 May 2018 16:35:14 -0700 Subject: [PATCH 03/19] fix pep8 issues --- pandas/tests/groupby/test_transform.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 2ff95e03e605d..ee5edf26feaaf 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -729,20 +729,22 @@ def interweave(list_obj): (-1, 'ffill', None), (-1, 'ffill', 1), (-1, 'bfill', None), (-1, 'bfill', 1)]) def test_pct_change(test_series, shuffle, periods, fill_method, limit): - # Groupby pct change uses an apply if monotonic and a vectorized operation if non-monotonic + # Groupby pct change uses an apply if monotonic + # and a vectorized operation if non-monotonic # Shuffle parameter tests each vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan] keys = ['a', 'b'] - df = DataFrame({'key': [k for j in list(map(lambda x: [x] * len(vals), keys)) for k in j], - 'vals': vals * 2}) + key_v = [k for j in list(map(lambda x: [x] * len(vals), keys)) for k in j] + df = DataFrame({'key': key_v, 'vals': vals * 2}) if shuffle: df = df.reindex(np.random.permutation(len(df))).reset_index(drop=True) manual_apply = [] for k in keys: - manual_apply.append(Series(df.loc[df.key == k, 'vals'].values).pct_change(periods=periods, - fill_method=fill_method, - limit=limit)) + subgroup = Series(df.loc[df.key == k, 'vals'].values) + manual_apply.append(subgroup.pct_change(periods=periods, + fill_method=fill_method, + limit=limit)) exp_vals = pd.concat(manual_apply).reset_index(drop=True) exp = pd.DataFrame(exp_vals, columns=['_pct_change']) grp = df.groupby('key') From 25efd37689aa831bcb04846a05711eb44175f821 Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 29 May 2018 21:46:21 -0700 Subject: [PATCH 04/19] Removed vectorization from groupby pct change --- pandas/core/groupby/groupby.py | 27 +++++++------------------- pandas/tests/groupby/test_transform.py | 21 ++++++++------------ 2 files changed, 15 insertions(+), 33 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5d4b9012a42bf..24e61e7723808 
100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2070,17 +2070,10 @@ def shift(self, periods=1, freq=None, axis=0): def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, axis=0): """Calcuate pct_change of each value to previous entry in group""" - if (freq is not None or axis != 0) or not self.grouper.is_monotonic: - return self.apply(lambda x: x.pct_change(periods=periods, - fill_method=fill_method, - limit=limit, freq=freq, - axis=axis)) - - filled = getattr(self, fill_method)(limit=limit).drop( - self.grouper.names, axis=1) - shifted = filled.shift(periods=periods, freq=freq) - - return (filled / shifted) - 1 + return self.apply(lambda x: x.pct_change(periods=periods, + fill_method=fill_method, + limit=limit, freq=freq, + axis=axis)) @Substitution(name='groupby') @Appender(_doc_template) @@ -3943,15 +3936,9 @@ def _apply_to_column_groupbys(self, func): def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None): """Calcuate pct_change of each value to previous entry in group""" - if not self.grouper.is_monotonic: - return self.apply(lambda x: x.pct_change(periods=periods, - fill_method=fill_method, - limit=limit, freq=freq)) - - filled = getattr(self, fill_method)(limit=limit) - shifted = filled.shift(periods=periods, freq=freq) - - return (filled / shifted) - 1 + return self.apply(lambda x: x.pct_change(periods=periods, + fill_method=fill_method, + limit=limit, freq=freq)) class NDFrameGroupBy(GroupBy): diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index ee5edf26feaaf..a0f559e395f82 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -729,15 +729,13 @@ def interweave(list_obj): (-1, 'ffill', None), (-1, 'ffill', 1), (-1, 'bfill', None), (-1, 'bfill', 1)]) def test_pct_change(test_series, shuffle, periods, fill_method, limit): - # Groupby pct change uses an apply if monotonic - # and a vectorized operation if non-monotonic - # Shuffle parameter tests each - vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan] + vals = [3, np.nan, 1, 2, 4, 10, np.nan, np.nan] keys = ['a', 'b'] key_v = [k for j in list(map(lambda x: [x] * len(vals), keys)) for k in j] df = DataFrame({'key': key_v, 'vals': vals * 2}) if shuffle: - df = df.reindex(np.random.permutation(len(df))).reset_index(drop=True) + order = np.random.RandomState(seed=42).permutation(len(df)) + df = df.reindex(order).reset_index(drop=True) manual_apply = [] for k in keys: @@ -746,7 +744,7 @@ def test_pct_change(test_series, shuffle, periods, fill_method, limit): fill_method=fill_method, limit=limit)) exp_vals = pd.concat(manual_apply).reset_index(drop=True) - exp = pd.DataFrame(exp_vals, columns=['_pct_change']) + exp = pd.DataFrame(exp_vals, columns=['A']) grp = df.groupby('key') def get_result(grp_obj): @@ -754,22 +752,19 @@ def get_result(grp_obj): fill_method=fill_method, limit=limit) - # Specifically test when monotonic and not monotonic - if test_series: - exp = exp.loc[:, '_pct_change'] + exp = exp.loc[:, 'A'] grp = grp['vals'] result = get_result(grp) - # Resort order by keys to compare to expected values - df.insert(0, '_pct_change', result) + df.insert(0, 'A', result) result = df.sort_values(by='key') - result = result.loc[:, '_pct_change'] + result = result.loc[:, 'A'] result = result.reset_index(drop=True) tm.assert_series_equal(result, exp) else: result = get_result(grp) result.reset_index(drop=True, inplace=True) - result.columns = ['_pct_change'] + 
result.columns = ['A'] tm.assert_frame_equal(result, exp) From 849fac48dc38f5b6c3726178e42748895f18d71b Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 3 Jun 2018 17:55:11 -0700 Subject: [PATCH 05/19] try using cache arguments to keep vectorization --- pandas/core/groupby/groupby.py | 36 +++++++++++++++++++++++--- pandas/tests/groupby/test_transform.py | 29 +++++++++++++-------- 2 files changed, 50 insertions(+), 15 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 24e61e7723808..b0b23384d3cef 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3935,10 +3935,38 @@ def _apply_to_column_groupbys(self, func): return func(self) def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None): - """Calcuate pct_change of each value to previous entry in group""" - return self.apply(lambda x: x.pct_change(periods=periods, - fill_method=fill_method, - limit=limit, freq=freq)) + """Calculate pct_change of each value to previous entry in group""" + grouper = self.grouper + cache_exist = getattr(grouper, '_cache', False) + if cache_exist: + in_cache = True if 'is_monotonic' in cache_exist.keys() else False + else: + in_cache = False + m = grouper.is_monotonic if in_cache else False + if not m or fill_method is None: + return self.apply(lambda x: x.pct_change(periods=periods, + fill_method=fill_method, + limit=limit, freq=freq)) + + def get_invalid_index(x): + if periods == 0: + return x + elif periods > 0: + ax = Index(np.arange(min(x), min(x) + periods)) + return ax + elif periods < 0: + ax = Index(np.arange(max(x), max(x) + periods, -1)) + return ax + + filled = getattr(self, fill_method)(limit=limit) + shifted = filled.shift(periods=periods, freq=freq) + pct_change = (filled / shifted) - 1 + + invalid_index = Index([]) + for i in [get_invalid_index(v) for k, v in self.indices.items()]: + invalid_index = invalid_index.union(i) + pct_change.iloc[invalid_index] = np.nan + return pct_change class NDFrameGroupBy(GroupBy): diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index a0f559e395f82..0a975d2d37339 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -723,15 +723,19 @@ def interweave(list_obj): @pytest.mark.parametrize("test_series", [True, False]) @pytest.mark.parametrize("shuffle", [True, False]) +@pytest.mark.parametrize("activate_cache", [True, False]) @pytest.mark.parametrize("periods,fill_method,limit", [ (1, 'ffill', None), (1, 'ffill', 1), (1, 'bfill', None), (1, 'bfill', 1), (-1, 'ffill', None), (-1, 'ffill', 1), - (-1, 'bfill', None), (-1, 'bfill', 1)]) -def test_pct_change(test_series, shuffle, periods, fill_method, limit): - vals = [3, np.nan, 1, 2, 4, 10, np.nan, np.nan] + (-1, 'bfill', None), (-1, 'bfill', 1), + (-1, None, None), (-1, None, 1), + (-1, None, None), (-1, None, 1) +]) +def test_pct_change(test_series, shuffle, activate_cache, periods, fill_method, limit): + vals = [3, np.nan, 1, 2, 4, 10, np.nan, 9] keys = ['a', 'b'] - key_v = [k for j in list(map(lambda x: [x] * len(vals), keys)) for k in j] + key_v = np.repeat(keys, len(vals)) df = DataFrame({'key': key_v, 'vals': vals * 2}) if shuffle: order = np.random.RandomState(seed=42).permutation(len(df)) @@ -739,14 +743,17 @@ def test_pct_change(test_series, shuffle, periods, fill_method, limit): manual_apply = [] for k in keys: - subgroup = Series(df.loc[df.key == k, 'vals'].values) - manual_apply.append(subgroup.pct_change(periods=periods, - 
fill_method=fill_method, - limit=limit)) - exp_vals = pd.concat(manual_apply).reset_index(drop=True) - exp = pd.DataFrame(exp_vals, columns=['A']) + ind = df.loc[df.key == k, 'vals'] + manual_apply.append(ind.pct_change(periods=periods, + fill_method=fill_method, + limit=limit)) + exp_vals = pd.concat(manual_apply, ignore_index=True) + exp = pd.DataFrame(exp_vals.values, columns=['A']) grp = df.groupby('key') + if activate_cache: + grp.grouper.is_monotonic + def get_result(grp_obj): return grp_obj.pct_change(periods=periods, fill_method=fill_method, @@ -763,7 +770,7 @@ def get_result(grp_obj): tm.assert_series_equal(result, exp) else: result = get_result(grp) - result.reset_index(drop=True, inplace=True) + result = result.reset_index(drop=True) result.columns = ['A'] tm.assert_frame_equal(result, exp) From eaede342b41b233f67a41534b8dc843b5197cad3 Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 9 Jun 2018 16:02:44 -0700 Subject: [PATCH 06/19] BUG,TST: Remove case where vectorization fails in pct_change groupby method, improve test. #21235 --- pandas/core/groupby/groupby.py | 36 +++----------------------- pandas/tests/groupby/test_transform.py | 8 +----- 2 files changed, 5 insertions(+), 39 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b0b23384d3cef..24e61e7723808 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3935,38 +3935,10 @@ def _apply_to_column_groupbys(self, func): return func(self) def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None): - """Calculate pct_change of each value to previous entry in group""" - grouper = self.grouper - cache_exist = getattr(grouper, '_cache', False) - if cache_exist: - in_cache = True if 'is_monotonic' in cache_exist.keys() else False - else: - in_cache = False - m = grouper.is_monotonic if in_cache else False - if not m or fill_method is None: - return self.apply(lambda x: x.pct_change(periods=periods, - fill_method=fill_method, - limit=limit, freq=freq)) - - def get_invalid_index(x): - if periods == 0: - return x - elif periods > 0: - ax = Index(np.arange(min(x), min(x) + periods)) - return ax - elif periods < 0: - ax = Index(np.arange(max(x), max(x) + periods, -1)) - return ax - - filled = getattr(self, fill_method)(limit=limit) - shifted = filled.shift(periods=periods, freq=freq) - pct_change = (filled / shifted) - 1 - - invalid_index = Index([]) - for i in [get_invalid_index(v) for k, v in self.indices.items()]: - invalid_index = invalid_index.union(i) - pct_change.iloc[invalid_index] = np.nan - return pct_change + """Calcuate pct_change of each value to previous entry in group""" + return self.apply(lambda x: x.pct_change(periods=periods, + fill_method=fill_method, + limit=limit, freq=freq)) class NDFrameGroupBy(GroupBy): diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 0a975d2d37339..6e5dea968abad 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -723,16 +723,13 @@ def interweave(list_obj): @pytest.mark.parametrize("test_series", [True, False]) @pytest.mark.parametrize("shuffle", [True, False]) -@pytest.mark.parametrize("activate_cache", [True, False]) @pytest.mark.parametrize("periods,fill_method,limit", [ (1, 'ffill', None), (1, 'ffill', 1), (1, 'bfill', None), (1, 'bfill', 1), (-1, 'ffill', None), (-1, 'ffill', 1), (-1, 'bfill', None), (-1, 'bfill', 1), - (-1, None, None), (-1, None, 1), - (-1, None, None), (-1, None, 1) ]) -def 
test_pct_change(test_series, shuffle, activate_cache, periods, fill_method, limit): +def test_pct_change(test_series, shuffle, periods, fill_method, limit): vals = [3, np.nan, 1, 2, 4, 10, np.nan, 9] keys = ['a', 'b'] key_v = np.repeat(keys, len(vals)) @@ -751,9 +748,6 @@ def test_pct_change(test_series, shuffle, activate_cache, periods, fill_method, exp = pd.DataFrame(exp_vals.values, columns=['A']) grp = df.groupby('key') - if activate_cache: - grp.grouper.is_monotonic - def get_result(grp_obj): return grp_obj.pct_change(periods=periods, fill_method=fill_method, From 64097ccd308db909dc99cbb88eeb6206beedb820 Mon Sep 17 00:00:00 2001 From: simon Date: Mon, 18 Jun 2018 22:53:22 -0700 Subject: [PATCH 07/19] BUG,TST: Remove case where vectorization fails in pct_change groupby method. Incorporate CR suggestion. --- pandas/core/groupby/groupby.py | 34 ++++++++++++++++++++------ pandas/tests/groupby/test_transform.py | 7 ++++-- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 24e61e7723808..c9c82c0b496e9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2070,10 +2070,24 @@ def shift(self, periods=1, freq=None, axis=0): def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, axis=0): """Calcuate pct_change of each value to previous entry in group""" - return self.apply(lambda x: x.pct_change(periods=periods, - fill_method=fill_method, - limit=limit, freq=freq, - axis=axis)) + if freq is not None or axis != 0: + return self.apply(lambda x: x.pct_change(periods=periods, + fill_method=fill_method, + limit=limit, freq=freq, + axis=axis)) + if fill_method: + new = DataFrameGroupBy(self._obj_with_exclusions, + grouper=self.grouper) + new.obj = getattr(new, fill_method)(limit=limit) + new._reset_cache() + else: + new = self + + obj = new.obj.drop(self.grouper.names, axis=1) + shifted = new.shift(periods=periods, freq=freq).\ + drop(self.grouper.names, axis=1) + return (obj / shifted) - 1 + @Substitution(name='groupby') @Appender(_doc_template) @@ -3936,9 +3950,15 @@ def _apply_to_column_groupbys(self, func): def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None): """Calcuate pct_change of each value to previous entry in group""" - return self.apply(lambda x: x.pct_change(periods=periods, - fill_method=fill_method, - limit=limit, freq=freq)) + if fill_method: + new = SeriesGroupBy(self.obj, grouper=self.grouper) + new.obj = getattr(new, fill_method)(limit=limit) + new._reset_cache() + else: + new = self + + shifted = new.shift(periods=periods, freq=freq) + return (new.obj / shifted) - 1 class NDFrameGroupBy(GroupBy): diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 6e5dea968abad..b14ffd0463aeb 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -730,7 +730,7 @@ def interweave(list_obj): (-1, 'bfill', None), (-1, 'bfill', 1), ]) def test_pct_change(test_series, shuffle, periods, fill_method, limit): - vals = [3, np.nan, 1, 2, 4, 10, np.nan, 9] + vals = [3, np.nan, 1, 2, 4, 10, np.nan, 4] keys = ['a', 'b'] key_v = np.repeat(keys, len(vals)) df = DataFrame({'key': key_v, 'vals': vals * 2}) @@ -765,7 +765,10 @@ def get_result(grp_obj): else: result = get_result(grp) result = result.reset_index(drop=True) - result.columns = ['A'] + df.insert(0, 'A', result) + result = df.sort_values(by='key') + result = result.loc[:, ['A']] + result = 
result.reset_index(drop=True) tm.assert_frame_equal(result, exp) From fcd7183044c378718dbe8e306c8bc7caac65a81f Mon Sep 17 00:00:00 2001 From: simon Date: Mon, 18 Jun 2018 22:53:22 -0700 Subject: [PATCH 08/19] BUG,TST: Remove case where vectorization fails in pct_change groupby method. Incorporate CR suggestion. --- pandas/core/groupby/groupby.py | 34 ++++++++++++++++++++------ pandas/tests/groupby/test_transform.py | 7 ++++-- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 24e61e7723808..c9c82c0b496e9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2070,10 +2070,24 @@ def shift(self, periods=1, freq=None, axis=0): def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, axis=0): """Calcuate pct_change of each value to previous entry in group""" - return self.apply(lambda x: x.pct_change(periods=periods, - fill_method=fill_method, - limit=limit, freq=freq, - axis=axis)) + if freq is not None or axis != 0: + return self.apply(lambda x: x.pct_change(periods=periods, + fill_method=fill_method, + limit=limit, freq=freq, + axis=axis)) + if fill_method: + new = DataFrameGroupBy(self._obj_with_exclusions, + grouper=self.grouper) + new.obj = getattr(new, fill_method)(limit=limit) + new._reset_cache() + else: + new = self + + obj = new.obj.drop(self.grouper.names, axis=1) + shifted = new.shift(periods=periods, freq=freq).\ + drop(self.grouper.names, axis=1) + return (obj / shifted) - 1 + @Substitution(name='groupby') @Appender(_doc_template) @@ -3936,9 +3950,15 @@ def _apply_to_column_groupbys(self, func): def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None): """Calcuate pct_change of each value to previous entry in group""" - return self.apply(lambda x: x.pct_change(periods=periods, - fill_method=fill_method, - limit=limit, freq=freq)) + if fill_method: + new = SeriesGroupBy(self.obj, grouper=self.grouper) + new.obj = getattr(new, fill_method)(limit=limit) + new._reset_cache() + else: + new = self + + shifted = new.shift(periods=periods, freq=freq) + return (new.obj / shifted) - 1 class NDFrameGroupBy(GroupBy): diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 6e5dea968abad..b14ffd0463aeb 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -730,7 +730,7 @@ def interweave(list_obj): (-1, 'bfill', None), (-1, 'bfill', 1), ]) def test_pct_change(test_series, shuffle, periods, fill_method, limit): - vals = [3, np.nan, 1, 2, 4, 10, np.nan, 9] + vals = [3, np.nan, 1, 2, 4, 10, np.nan, 4] keys = ['a', 'b'] key_v = np.repeat(keys, len(vals)) df = DataFrame({'key': key_v, 'vals': vals * 2}) @@ -765,7 +765,10 @@ def get_result(grp_obj): else: result = get_result(grp) result = result.reset_index(drop=True) - result.columns = ['A'] + df.insert(0, 'A', result) + result = df.sort_values(by='key') + result = result.loc[:, ['A']] + result = result.reset_index(drop=True) tm.assert_frame_equal(result, exp) From 77c39359b478cebadbc2324096c751d0f7706386 Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 23 Jun 2018 18:38:11 -0700 Subject: [PATCH 09/19] BUG,TST: Remove case where vectorization fails in pct_change groupby method. Incorporate CR suggestion. 
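
This patch builds the test's expected values with the per-group vectorized
recipe the fix is built around: fill within each group, shift within each
group, then take the ratio. Below is a minimal standalone sketch of that
recipe, outside the test suite; the toy frame, column names and interleaved
group layout are illustrative only and not part of this patch.

    import numpy as np
    import pandas as pd

    # Toy frame with interleaved (non-monotonic) groups, the case the
    # original vectorized path handled incorrectly.
    df = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'a', 'b'],
                       'vals': [1.0, 10.0, np.nan, 20.0, 4.0, 40.0]})

    grouped = df.groupby('key')['vals']

    # Per-group recipe: fill within the group, shift within the group,
    # then ratio minus one.
    filled = grouped.ffill()
    shifted = filled.groupby(df['key']).shift(1)
    result = filled / shifted - 1

    # Reference: the same computation done group by group via transform.
    expected = grouped.transform(lambda s: s.ffill() / s.ffill().shift(1) - 1)
    pd.testing.assert_series_equal(result, expected, check_names=False)

    # A plain whole-frame shift would instead mix values across groups:
    wrong = filled / filled.shift(1) - 1
    assert not wrong.equals(expected)

The last two lines show why the shift has to respect group boundaries: with
interleaved groups, a whole-frame shift pairs each value with the previous
row of a different group, which is the bug reported in GH 21200.
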
--- pandas/tests/groupby/test_transform.py | 37 +++++++------------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index b14ffd0463aeb..8a3423c116b9a 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -738,37 +738,20 @@ def test_pct_change(test_series, shuffle, periods, fill_method, limit): order = np.random.RandomState(seed=42).permutation(len(df)) df = df.reindex(order).reset_index(drop=True) - manual_apply = [] - for k in keys: - ind = df.loc[df.key == k, 'vals'] - manual_apply.append(ind.pct_change(periods=periods, - fill_method=fill_method, - limit=limit)) - exp_vals = pd.concat(manual_apply, ignore_index=True) - exp = pd.DataFrame(exp_vals.values, columns=['A']) + df = getattr(df.groupby('key'), fill_method)(limit=limit) grp = df.groupby('key') - - def get_result(grp_obj): - return grp_obj.pct_change(periods=periods, - fill_method=fill_method, - limit=limit) + exp = grp['vals'].shift(0) / grp['vals'].shift(periods) - 1 + exp = exp.to_frame('vals') if test_series: - exp = exp.loc[:, 'A'] - grp = grp['vals'] - result = get_result(grp) - df.insert(0, 'A', result) - result = df.sort_values(by='key') - result = result.loc[:, 'A'] - result = result.reset_index(drop=True) - tm.assert_series_equal(result, exp) + result = grp['vals'].pct_change(periods=periods, + fill_method=fill_method, + limit=limit) + tm.assert_series_equal(result, exp.loc[:, 'vals']) else: - result = get_result(grp) - result = result.reset_index(drop=True) - df.insert(0, 'A', result) - result = df.sort_values(by='key') - result = result.loc[:, ['A']] - result = result.reset_index(drop=True) + result = grp.pct_change(periods=periods, + fill_method=fill_method, + limit=limit) tm.assert_frame_equal(result, exp) From d037e65def37dca77cf3a2bee7b1b7cde13a3739 Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 29 Jul 2018 15:29:43 -0700 Subject: [PATCH 10/19] Update test to address WillAyds code suggestion --- pandas/core/groupby/groupby.py | 7 ++---- pandas/tests/groupby/test_transform.py | 32 +++++++++++++++----------- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c9c82c0b496e9..f869dc5bed362 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2076,19 +2076,16 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, limit=limit, freq=freq, axis=axis)) if fill_method: - new = DataFrameGroupBy(self._obj_with_exclusions, - grouper=self.grouper) + new = copy.copy(self) new.obj = getattr(new, fill_method)(limit=limit) new._reset_cache() else: new = self obj = new.obj.drop(self.grouper.names, axis=1) - shifted = new.shift(periods=periods, freq=freq).\ - drop(self.grouper.names, axis=1) + shifted = new.shift(periods=periods, freq=freq) return (obj / shifted) - 1 - @Substitution(name='groupby') @Appender(_doc_template) def head(self, n=5): diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 8a3423c116b9a..00221aac079cb 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -724,13 +724,15 @@ def interweave(list_obj): @pytest.mark.parametrize("test_series", [True, False]) @pytest.mark.parametrize("shuffle", [True, False]) @pytest.mark.parametrize("periods,fill_method,limit", [ + (1, None, None), (1, None, 1), (1, 'ffill', None), (1, 'ffill', 1), (1, 'bfill', 
None), (1, 'bfill', 1), (-1, 'ffill', None), (-1, 'ffill', 1), (-1, 'bfill', None), (-1, 'bfill', 1), ]) def test_pct_change(test_series, shuffle, periods, fill_method, limit): - vals = [3, np.nan, 1, 2, 4, 10, np.nan, 4] + # GH 21200, 21621 + vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4] keys = ['a', 'b'] key_v = np.repeat(keys, len(vals)) df = DataFrame({'key': key_v, 'vals': vals * 2}) @@ -738,21 +740,25 @@ def test_pct_change(test_series, shuffle, periods, fill_method, limit): order = np.random.RandomState(seed=42).permutation(len(df)) df = df.reindex(order).reset_index(drop=True) - df = getattr(df.groupby('key'), fill_method)(limit=limit) - grp = df.groupby('key') - exp = grp['vals'].shift(0) / grp['vals'].shift(periods) - 1 - exp = exp.to_frame('vals') + if fill_method: + df_g = getattr(df.groupby('key'), fill_method)(limit=limit) + grp = df_g.groupby('key') + else: + grp = df.groupby('key') + + expected = grp['vals'].obj / grp['vals'].shift(periods) - 1 + expected = expected.to_frame('vals') if test_series: - result = grp['vals'].pct_change(periods=periods, - fill_method=fill_method, - limit=limit) - tm.assert_series_equal(result, exp.loc[:, 'vals']) + result = df.groupby('key')['vals'].pct_change(periods=periods, + fill_method=fill_method, + limit=limit) + tm.assert_series_equal(result, expected.loc[:, 'vals']) else: - result = grp.pct_change(periods=periods, - fill_method=fill_method, - limit=limit) - tm.assert_frame_equal(result, exp) + result = df.groupby('key').pct_change(periods=periods, + fill_method=fill_method, + limit=limit) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("func", [np.any, np.all]) From a4fcb33ce1b7559fb367cb222c259fff81e2e18f Mon Sep 17 00:00:00 2001 From: simon Date: Wed, 26 Sep 2018 23:07:41 -0700 Subject: [PATCH 11/19] incorporate Will CRs --- pandas/core/groupby/groupby.py | 14 +++++--------- pandas/tests/groupby/test_transform.py | 11 +++-------- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f869dc5bed362..c9633b00178db 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2076,9 +2076,8 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, limit=limit, freq=freq, axis=axis)) if fill_method: - new = copy.copy(self) + new = self new.obj = getattr(new, fill_method)(limit=limit) - new._reset_cache() else: new = self @@ -3948,14 +3947,11 @@ def _apply_to_column_groupbys(self, func): def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None): """Calcuate pct_change of each value to previous entry in group""" if fill_method: - new = SeriesGroupBy(self.obj, grouper=self.grouper) - new.obj = getattr(new, fill_method)(limit=limit) - new._reset_cache() - else: - new = self + self.obj = getattr(self, fill_method)(limit=limit) + self._reset_cache('_selected_obj') - shifted = new.shift(periods=periods, freq=freq) - return (new.obj / shifted) - 1 + shifted = self.shift(periods=periods, freq=freq) + return (self.obj / shifted) - 1 class NDFrameGroupBy(GroupBy): diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 00221aac079cb..a6b52943642ca 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -722,7 +722,6 @@ def interweave(list_obj): @pytest.mark.parametrize("test_series", [True, False]) -@pytest.mark.parametrize("shuffle", [True, False]) 
@pytest.mark.parametrize("periods,fill_method,limit", [ (1, None, None), (1, None, 1), (1, 'ffill', None), (1, 'ffill', 1), @@ -730,15 +729,12 @@ def interweave(list_obj): (-1, 'ffill', None), (-1, 'ffill', 1), (-1, 'bfill', None), (-1, 'bfill', 1), ]) -def test_pct_change(test_series, shuffle, periods, fill_method, limit): +def test_pct_change(test_series, periods, fill_method, limit): # GH 21200, 21621 vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4] keys = ['a', 'b'] key_v = np.repeat(keys, len(vals)) df = DataFrame({'key': key_v, 'vals': vals * 2}) - if shuffle: - order = np.random.RandomState(seed=42).permutation(len(df)) - df = df.reindex(order).reset_index(drop=True) if fill_method: df_g = getattr(df.groupby('key'), fill_method)(limit=limit) @@ -747,18 +743,17 @@ def test_pct_change(test_series, shuffle, periods, fill_method, limit): grp = df.groupby('key') expected = grp['vals'].obj / grp['vals'].shift(periods) - 1 - expected = expected.to_frame('vals') if test_series: result = df.groupby('key')['vals'].pct_change(periods=periods, fill_method=fill_method, limit=limit) - tm.assert_series_equal(result, expected.loc[:, 'vals']) + tm.assert_series_equal(result, expected) else: result = df.groupby('key').pct_change(periods=periods, fill_method=fill_method, limit=limit) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected.to_frame('vals')) @pytest.mark.parametrize("func", [np.any, np.all]) From 64d2e4f4565edd6522437481cf5df10eced02b78 Mon Sep 17 00:00:00 2001 From: simon Date: Thu, 27 Sep 2018 23:54:35 -0700 Subject: [PATCH 12/19] cr update --- pandas/core/groupby/groupby.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c9633b00178db..cf4e541d7d318 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2075,15 +2075,15 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, fill_method=fill_method, limit=limit, freq=freq, axis=axis)) - if fill_method: - new = self - new.obj = getattr(new, fill_method)(limit=limit) - else: - new = self - obj = new.obj.drop(self.grouper.names, axis=1) - shifted = new.shift(periods=periods, freq=freq) - return (obj / shifted) - 1 + with _group_selection_context(self) as new: + if fill_method: + new = copy.copy(new) + new.obj = getattr(new, fill_method)(limit=limit) + + obj = new.obj.drop(self.grouper.names, axis=1) + shifted = new.shift(periods=periods, freq=freq) + return (obj / shifted) - 1 @Substitution(name='groupby') @Appender(_doc_template) @@ -3946,12 +3946,14 @@ def _apply_to_column_groupbys(self, func): def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None): """Calcuate pct_change of each value to previous entry in group""" - if fill_method: - self.obj = getattr(self, fill_method)(limit=limit) - self._reset_cache('_selected_obj') - - shifted = self.shift(periods=periods, freq=freq) - return (self.obj / shifted) - 1 + with _group_selection_context(self) as new: + if fill_method: + new = copy.copy(new) + new.obj = getattr(new, fill_method)(limit=limit) + new._reset_cache('_selected_obj') + + shifted = new.shift(periods=periods, freq=freq) + return (new.obj / shifted) - 1 class NDFrameGroupBy(GroupBy): From 5d3fc2dd6752a04ddeb56989d397ea9ee85c8228 Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 24 Nov 2018 22:25:25 -0800 Subject: [PATCH 13/19] BUG,TST: Remove case where vectorization fails in pct_change groupby method --- 
pandas/core/groupby/generic.py | 15 ++++++++------- pandas/core/groupby/groupby.py | 12 +++++------- pandas/tests/groupby/test_transform.py | 19 ++++++++++++++----- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 531ddc73a8fe6..419d522a34762 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1222,13 +1222,14 @@ def _apply_to_column_groupbys(self, func): def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None): """Calcuate pct_change of each value to previous entry in group""" - with _group_selection_context(self) as new: - if fill_method: - new = copy.copy(new) - new.obj = getattr(new, fill_method)(limit=limit) - new._reset_cache('_selected_obj') - shifted = new.shift(periods=periods, freq=freq) - return (new.obj / shifted) - 1 + if freq: + return self.apply(lambda x: x.pct_change(periods=periods, + fill_method=fill_method, + limit=limit, freq=freq)) + filled = getattr(self, fill_method)(limit=limit) + fill_grp = filled.groupby(self.grouper.labels) + shifted = fill_grp.shift(periods=periods, freq=freq) + return (filled / shifted) - 1 class DataFrameGroupBy(NDFrameGroupBy): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f8d543c87ea17..00516c25a7e00 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1997,13 +1997,11 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, fill_method=fill_method, limit=limit, freq=freq, axis=axis)) - with _group_selection_context(self) as new: - if fill_method: - new = copy.copy(new) - new.obj = getattr(new, fill_method)(limit=limit) - obj = new.obj.drop(self.grouper.names, axis=1) - shifted = new.shift(periods=periods, freq=freq) - return (obj / shifted) - 1 + filled = getattr(self, fill_method)(limit=limit) + filled = filled.drop(self.grouper.names, axis=1) + fill_grp = filled.groupby(self.grouper.labels) + shifted = fill_grp.shift(periods=periods, freq=freq) + return (filled/shifted)-1 @Substitution(name='groupby') @Appender(_doc_template) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 2bb8a811f546d..d0d729ad4bf10 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -765,20 +765,28 @@ def test_pad_stable_sorting(fill_method): @pytest.mark.parametrize("test_series", [True, False]) +@pytest.mark.parametrize("test_freq", [True, False]) @pytest.mark.parametrize("periods,fill_method,limit", [ - (1, None, None), (1, None, 1), (1, 'ffill', None), (1, 'ffill', 1), (1, 'bfill', None), (1, 'bfill', 1), (-1, 'ffill', None), (-1, 'ffill', 1), (-1, 'bfill', None), (-1, 'bfill', 1), ]) -def test_pct_change(test_series, periods, fill_method, limit): +def test_pct_change(test_series, test_freq, periods, fill_method, limit): # GH 21200, 21621 vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4] keys = ['a', 'b'] key_v = np.repeat(keys, len(vals)) df = DataFrame({'key': key_v, 'vals': vals * 2}) + if test_freq: + freq = 'D' + dt_idx = pd.DatetimeIndex(start='2010-01-01', freq=freq, + periods=len(vals)) + df.index = np.concatenate([dt_idx.values]*2) + else: + freq = None + if fill_method: df_g = getattr(df.groupby('key'), fill_method)(limit=limit) grp = df_g.groupby('key') @@ -790,16 +798,17 @@ def test_pct_change(test_series, periods, fill_method, limit): if test_series: result = df.groupby('key')['vals'].pct_change(periods=periods, 
fill_method=fill_method, - limit=limit) + limit=limit, + freq=freq) tm.assert_series_equal(result, expected) else: result = df.groupby('key').pct_change(periods=periods, fill_method=fill_method, - limit=limit) + limit=limit, + freq=freq) tm.assert_frame_equal(result, expected.to_frame('vals')) - @pytest.mark.parametrize("func", [np.any, np.all]) def test_any_all_np_func(func): # GH 20653 From a3c95cb795fa57997ff68130acfd3d36f321a533 Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 2 Dec 2018 14:13:37 -0800 Subject: [PATCH 14/19] BUG,TST: GH21235 groupby pct_change whatsnew --- doc/source/whatsnew/v0.23.5.txt | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.5.txt b/doc/source/whatsnew/v0.23.5.txt index 8f4b1a13c2e9d..14450ef4aff82 100644 --- a/doc/source/whatsnew/v0.23.5.txt +++ b/doc/source/whatsnew/v0.23.5.txt @@ -42,7 +42,33 @@ Bug Fixes **Groupby/Resample/Rolling** - Bug in :meth:`DataFrame.resample` when resampling ``NaT`` in ``TimeDeltaIndex`` (:issue:`13223`). -- +- Bug where calling :func:`SeriesGroupBy.pct_change` or :func:`DataFrameGroupBy.pct_change` would ignore groups when calculating the percent change (:issue:`21235`) + +.. ipython:: python + + df = pd.DataFrame({'a': [1.0, 1.1, 2.2], 'grp': ['a', 'a', 'b']}) + +**Previous behavior**: + +.. code-block:: ipython + + >>> df.groupby('grp').pct_change() + a + 0 NaN + 1 0.1 + 2 1.0 + +**New behavior**: + +Grouping respected + +.. code-block:: ipython + + >>> df.groupby('grp').pct_change() + a + 0 NaN + 1 0.1 + 2 NaN **Missing** From e5289a24f1421ad37138af5c87b0cc4c00cd135d Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 8 Dec 2018 16:40:37 -0800 Subject: [PATCH 15/19] BUG, TST: GH 21235 groupby pct_change whatsnew --- doc/source/whatsnew/v0.23.5.txt | 43 ++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v0.23.5.txt b/doc/source/whatsnew/v0.23.5.txt index 14450ef4aff82..7e7be1cfb1450 100644 --- a/doc/source/whatsnew/v0.23.5.txt +++ b/doc/source/whatsnew/v0.23.5.txt @@ -42,39 +42,42 @@ Bug Fixes **Groupby/Resample/Rolling** - Bug in :meth:`DataFrame.resample` when resampling ``NaT`` in ``TimeDeltaIndex`` (:issue:`13223`). -- Bug where calling :func:`SeriesGroupBy.pct_change` or :func:`DataFrameGroupBy.pct_change` would ignore groups when calculating the percent change (:issue:`21235`) -.. ipython:: python +**Missing** - df = pd.DataFrame({'a': [1.0, 1.1, 2.2], 'grp': ['a', 'a', 'b']}) +- +- -**Previous behavior**: +**I/O** -.. code-block:: ipython +- Bug in :func:`read_csv` that caused it to raise ``OverflowError`` when trying to use 'inf' as ``na_value`` with integer index column (:issue:`17128`) - >>> df.groupby('grp').pct_change() - a - 0 NaN - 1 0.1 - 2 1.0 +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Bug where calling :func:`SeriesGroupBy.pct_change` or :func:`DataFrameGroupBy.pct_change` would ignore groups when calculating the percent change (:issue:`21235`) **New behavior**: -Grouping respected +.. ipython:: python + + import pandas as pd + df = pd.DataFrame({'grp': ['a', 'a', 'b'], 'foo': [1.0, 1.1, 2.2]}) + df + + df.groupby('grp').pct_change() + + +**Previous behavior**: .. 
code-block:: ipython - >>> df.groupby('grp').pct_change() - a + In [1]: df.groupby('grp').pct_change() + Out[1]: + foo 0 NaN 1 0.1 - 2 NaN - -**Missing** + 2 1.0 -- -- -**I/O** -- Bug in :func:`read_csv` that caused it to raise ``OverflowError`` when trying to use 'inf' as ``na_value`` with integer index column (:issue:`17128`) From c26cf60bdb875c2f37eebcc62e50fa59a886cbf9 Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 9 Dec 2018 00:35:47 -0800 Subject: [PATCH 16/19] BUG, TST: GH 21235 groupby pct_change whatsnew --- doc/source/whatsnew/v0.24.0.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index b1780181d8a9c..4648bba8cefe2 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -378,6 +378,29 @@ Backwards incompatible API changes - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) - :meth:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`) +- Bug where calling :func:`SeriesGroupBy.pct_change` or :func:`DataFrameGroupBy.pct_change` would ignore groups when calculating the percent change (:issue:`21235`) + +**New behavior**: + +.. ipython:: python + + import pandas as pd + df = pd.DataFrame({'grp': ['a', 'a', 'b'], 'foo': [1.0, 1.1, 2.2]}) + df + + df.groupby('grp').pct_change() + + +**Previous behavior**: + +.. code-block:: ipython + + In [1]: df.groupby('grp').pct_change() + Out[1]: + foo + 0 NaN + 1 0.1 + 2 1.0 .. _whatsnew_0240.api_breaking.deps: From 877b4337bd6ececf553d5b2589b1f09658bb1ede Mon Sep 17 00:00:00 2001 From: simon Date: Sun, 9 Dec 2018 14:45:02 -0800 Subject: [PATCH 17/19] BUG, TST: GH 21235 groupby pct_change whatsnew --- doc/source/whatsnew/v0.24.0.rst | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 4648bba8cefe2..e34b0c2fa7e13 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -378,20 +378,18 @@ Backwards incompatible API changes - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) - :meth:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`) -- Bug where calling :func:`SeriesGroupBy.pct_change` or :func:`DataFrameGroupBy.pct_change` would ignore groups when calculating the percent change (:issue:`21235`) -**New behavior**: +Percentage change called on groupby now returns correct values +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Fixed a bug where calling :func:`SeriesGroupBy.pct_change` or :func:`DataFrameGroupBy.pct_change` would ignore groups when calculating the percent change (:issue:`21235`). .. ipython:: python - import pandas as pd df = pd.DataFrame({'grp': ['a', 'a', 'b'], 'foo': [1.0, 1.1, 2.2]}) df - df.groupby('grp').pct_change() - - -**Previous behavior**: +Previous behavior: .. 
code-block:: ipython @@ -402,6 +400,12 @@ Backwards incompatible API changes 1 0.1 2 1.0 +New behavior: + +.. ipython:: python + + df.groupby('grp').pct_change() + .. _whatsnew_0240.api_breaking.deps: Dependencies have increased minimum versions From ad934b749ec47478724756cac82412d290cc61cc Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 11 Dec 2018 21:29:48 -0800 Subject: [PATCH 18/19] BUG, TST: GH 21235 groupby pct_change whatsnew --- doc/source/whatsnew/v0.24.0.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index ce36ca2703481..80317d6806346 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -382,10 +382,10 @@ Backwards incompatible API changes - :meth:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`) - The column order of the resultant :class:`DataFrame` from :meth:`MultiIndex.to_frame` is now guaranteed to match the :attr:`MultiIndex.names` order. (:issue:`22420`) -Percentage change called on groupby now returns correct values -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Percentage change on groupby changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Fixed a bug where calling :func:`SeriesGroupBy.pct_change` or :func:`DataFrameGroupBy.pct_change` would ignore groups when calculating the percent change (:issue:`21235`). +Fixed a bug where calling :func:`SeriesGroupBy.pct_change` or :func:`DataFrameGroupBy.pct_change` would previously work across groups when calculating the percent change, where it now correctly works per group (:issue:`21200`, :issue:`21235`). .. ipython:: python From 01d705fa246d792f9d0e52a614f9b68a0a7f6f3b Mon Sep 17 00:00:00 2001 From: simon Date: Tue, 11 Dec 2018 23:07:44 -0800 Subject: [PATCH 19/19] BUG, TST: GH 21235 groupby pct_change test parametrize --- pandas/tests/groupby/test_transform.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 7ae4d5420f3b6..b6361b4ad76a0 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -765,7 +765,10 @@ def test_pad_stable_sorting(fill_method): @pytest.mark.parametrize("test_series", [True, False]) -@pytest.mark.parametrize("freq", [None, 'D']) +@pytest.mark.parametrize("freq", [ + None, + pytest.param('D', marks=pytest.mark.xfail( + reason='GH#23918 before method uses freq in vectorized approach'))]) @pytest.mark.parametrize("periods,fill_method,limit", [ (1, 'ffill', None), (1, 'ffill', 1), (1, 'bfill', None), (1, 'bfill', 1), @@ -774,10 +777,6 @@ def test_pad_stable_sorting(fill_method): ]) def test_pct_change(test_series, freq, periods, fill_method, limit): # GH 21200, 21621 - if freq == 'D': - pytest.xfail("'freq' test not necessary until #23918 completed and" - "freq is used in the vectorized approach") - vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4] keys = ['a', 'b'] key_v = np.repeat(keys, len(vals))