From 665a522067ee92d3616ed8d96025fe5ff6bd9249 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 25 Sep 2020 19:45:30 -0700 Subject: [PATCH 01/38] CLN: dont special-case DatetimeArray indexing --- pandas/core/arrays/datetimelike.py | 5 ++++- pandas/core/indexes/datetimelike.py | 8 +++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c90610bdd920c..2990a1997f30c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -530,7 +530,6 @@ def _validate_getitem_key(self, key): key = np.asarray(key, dtype=bool) key = check_array_indexer(self, key) - key = lib.maybe_booleans_to_slice(key.view(np.uint8)) elif isinstance(key, list) and len(key) == 1 and isinstance(key[0], slice): # see https://github.com/pandas-dev/pandas/issues/31299, need to allow # this for now (would otherwise raise in check_array_indexer) @@ -560,6 +559,10 @@ def _get_getitem_freq(self, key): # GH#21282 indexing with Ellipsis is similar to a full slice, # should preserve `freq` attribute freq = self.freq + elif com.is_bool_indexer(key): + new_key = lib.maybe_booleans_to_slice(key.view(np.uint8)) + if isinstance(new_key, slice): + return self._get_getitem_freq(new_key) return freq def __setitem__( diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index e2f59ceb41db5..922af8810a5ca 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -167,12 +167,14 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): indices = ensure_int64(indices) maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) - if isinstance(maybe_slice, slice): - return self[maybe_slice] - return ExtensionIndex.take( + result = ExtensionIndex.take( self, indices, axis, allow_fill, fill_value, **kwargs ) + if isinstance(maybe_slice, slice): + freq = self._data._get_getitem_freq(maybe_slice) + result._data._freq = freq + return result @doc(IndexOpsMixin.searchsorted, klass="Datetime-like Index") def searchsorted(self, value, side="left", sorter=None): From bdea70bd4c9b03ef50e2db51eda00677c85eea3e Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 26 Sep 2020 10:30:28 -0700 Subject: [PATCH 02/38] use parent class _validate_getitem_key --- pandas/core/arrays/datetimelike.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 2990a1997f30c..081ae069a19d8 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -55,7 +55,7 @@ from pandas.core.arrays.base import ExtensionOpsMixin import pandas.core.common as com from pandas.core.construction import array, extract_array -from pandas.core.indexers import check_array_indexer, check_setitem_lengths +from pandas.core.indexers import check_setitem_lengths from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import invalid_comparison, make_invalid_op @@ -522,22 +522,6 @@ def __getitem__(self, key): result._freq = self._get_getitem_freq(key) return result - def _validate_getitem_key(self, key): - if com.is_bool_indexer(key): - # first convert to boolean, because check_array_indexer doesn't - # allow object dtype - if is_object_dtype(key): - key = np.asarray(key, dtype=bool) - - key = check_array_indexer(self, key) - elif isinstance(key, list) and len(key) == 1 and isinstance(key[0], slice): - # see 
https://github.com/pandas-dev/pandas/issues/31299, need to allow - # this for now (would otherwise raise in check_array_indexer) - pass - else: - key = super()._validate_getitem_key(key) - return key - def _get_getitem_freq(self, key): """ Find the `freq` attribute to assign to the result of a __getitem__ lookup. From 42e8e1573c6ee0966484a33363acb4770fe90487 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 2 Oct 2020 19:59:44 -0700 Subject: [PATCH 03/38] test, whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/tests/series/indexing/test_boolean.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index cb0858fd678f8..aad21dfb19a74 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -358,6 +358,7 @@ Indexing - Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) - Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where int64 arrays are returned instead of intp. (:issue:`36359`) - Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result. (:issue:`32334`) +- Bug in indexing with boolean masks on datetime-like values sometimes returning a view instead of a copy (:issue:`36210`) Missing ^^^^^^^ diff --git a/pandas/tests/series/indexing/test_boolean.py b/pandas/tests/series/indexing/test_boolean.py index e2b71b1f2f412..28bebd764a866 100644 --- a/pandas/tests/series/indexing/test_boolean.py +++ b/pandas/tests/series/indexing/test_boolean.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas import Index, Series +from pandas import Index, Series, date_range import pandas._testing as tm from pandas.core.indexing import IndexingError @@ -128,3 +128,14 @@ def test_get_set_boolean_different_order(string_series): sel = string_series[ordered > 0] exp = string_series[string_series > 0] tm.assert_series_equal(sel, exp) + + +def test_getitem_boolean_dt64_copies(): + # GH#36210 + dti = date_range("2016-01-01", periods=4, tz="US/Pacific") + key = np.array([True, True, False, False]) + + ser = Series(dti._data) + + res = ser[key] + assert res._values._data.base is None From b80fcd87867bca002c32de9ce99aae71a8a8204f Mon Sep 17 00:00:00 2001 From: John Karasinski Date: Sat, 3 Oct 2020 07:25:28 -0700 Subject: [PATCH 04/38] DOC: update code style for development doc and user guide #36777 (#36821) --- doc/source/development/extending.rst | 28 +- doc/source/user_guide/computation.rst | 186 +++++------ doc/source/user_guide/dsintro.rst | 170 +++++----- doc/source/user_guide/visualization.rst | 395 ++++++++++++------------ 4 files changed, 409 insertions(+), 370 deletions(-) diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index c708ebb361ed1..46960140d3a8c 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -34,7 +34,7 @@ decorate a class, providing the name of attribute to add. 
The class's @staticmethod def _validate(obj): # verify there is a column latitude and a column longitude - if 'latitude' not in obj.columns or 'longitude' not in obj.columns: + if "latitude" not in obj.columns or "longitude" not in obj.columns: raise AttributeError("Must have 'latitude' and 'longitude'.") @property @@ -176,6 +176,7 @@ your ``MyExtensionArray`` class, as follows: from pandas.api.extensions import ExtensionArray, ExtensionScalarOpsMixin + class MyExtensionArray(ExtensionArray, ExtensionScalarOpsMixin): pass @@ -271,6 +272,7 @@ included as a column in a pandas DataFrame): def __arrow_array__(self, type=None): # convert the underlying array values to a pyarrow Array import pyarrow + return pyarrow.array(..., type=type) The ``ExtensionDtype.__from_arrow__`` method then controls the conversion @@ -347,7 +349,6 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame .. code-block:: python class SubclassedSeries(pd.Series): - @property def _constructor(self): return SubclassedSeries @@ -358,7 +359,6 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame class SubclassedDataFrame(pd.DataFrame): - @property def _constructor(self): return SubclassedDataFrame @@ -377,7 +377,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> type(to_framed) - >>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df = SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df A B C 0 1 4 7 @@ -387,7 +387,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> type(df) - >>> sliced1 = df[['A', 'B']] + >>> sliced1 = df[["A", "B"]] >>> sliced1 A B 0 1 4 @@ -397,7 +397,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> type(sliced1) - >>> sliced2 = df['A'] + >>> sliced2 = df["A"] >>> sliced2 0 1 1 2 @@ -422,11 +422,11 @@ Below is an example to define two original properties, "internal_cache" as a tem class SubclassedDataFrame2(pd.DataFrame): # temporary properties - _internal_names = pd.DataFrame._internal_names + ['internal_cache'] + _internal_names = pd.DataFrame._internal_names + ["internal_cache"] _internal_names_set = set(_internal_names) # normal properties - _metadata = ['added_property'] + _metadata = ["added_property"] @property def _constructor(self): @@ -434,15 +434,15 @@ Below is an example to define two original properties, "internal_cache" as a tem .. code-block:: python - >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df = SubclassedDataFrame2({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df A B C 0 1 4 7 1 2 5 8 2 3 6 9 - >>> df.internal_cache = 'cached' - >>> df.added_property = 'property' + >>> df.internal_cache = "cached" + >>> df.added_property = "property" >>> df.internal_cache cached @@ -450,11 +450,11 @@ Below is an example to define two original properties, "internal_cache" as a tem property # properties defined in _internal_names is reset after manipulation - >>> df[['A', 'B']].internal_cache + >>> df[["A", "B"]].internal_cache AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache' # properties defined in _metadata are retained - >>> df[['A', 'B']].added_property + >>> df[["A", "B"]].added_property property .. _extending.plotting-backends: @@ -468,7 +468,7 @@ one based on Matplotlib. For example: .. 
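A backend module itself only needs to expose a plotting entry point. As a
minimal sketch (the module name ``pandas_dummy_backend`` and the printed
message are made up for illustration; as described below, pandas hands the
object being plotted to the backend's top-level ``plot`` callable):

.. code-block:: python

   # pandas_dummy_backend.py - hypothetical third-party backend module.
   def plot(data, kind=None, **kwargs):
       # ``data`` is the Series or DataFrame being plotted and ``kind`` is
       # the requested plot type ("line", "bar", ...); a real backend would
       # build and return a figure object here.
       print(f"plotting a {type(data).__name__} as {kind!r}")

With such a module installed and made discoverable to pandas, switching
backends is a one-line option change: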
code-block:: python - >>> pd.set_option('plotting.backend', 'backend.module') + >>> pd.set_option("plotting.backend", "backend.module") >>> pd.Series([1, 2, 3]).plot() This would be more or less equivalent to: diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index e7edda90610b5..2f6ac6b06d85e 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -63,8 +63,7 @@ series in the DataFrame, also excluding NA/null values. .. ipython:: python - frame = pd.DataFrame(np.random.randn(1000, 5), - columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"]) frame.cov() ``DataFrame.cov`` also supports an optional ``min_periods`` keyword that @@ -73,9 +72,9 @@ in order to have a valid result. .. ipython:: python - frame = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) - frame.loc[frame.index[:5], 'a'] = np.nan - frame.loc[frame.index[5:10], 'b'] = np.nan + frame = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"]) + frame.loc[frame.index[:5], "a"] = np.nan + frame.loc[frame.index[5:10], "b"] = np.nan frame.cov() @@ -116,13 +115,12 @@ Wikipedia has articles covering the above correlation coefficients: .. ipython:: python - frame = pd.DataFrame(np.random.randn(1000, 5), - columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"]) frame.iloc[::2] = np.nan # Series with Series - frame['a'].corr(frame['b']) - frame['a'].corr(frame['b'], method='spearman') + frame["a"].corr(frame["b"]) + frame["a"].corr(frame["b"], method="spearman") # Pairwise correlation of DataFrame columns frame.corr() @@ -134,9 +132,9 @@ Like ``cov``, ``corr`` also supports the optional ``min_periods`` keyword: .. ipython:: python - frame = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) - frame.loc[frame.index[:5], 'a'] = np.nan - frame.loc[frame.index[5:10], 'b'] = np.nan + frame = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"]) + frame.loc[frame.index[:5], "a"] = np.nan + frame.loc[frame.index[5:10], "b"] = np.nan frame.corr() @@ -154,8 +152,8 @@ compute the correlation based on histogram intersection: # histogram intersection def histogram_intersection(a, b): - return np.minimum(np.true_divide(a, a.sum()), - np.true_divide(b, b.sum())).sum() + return np.minimum(np.true_divide(a, a.sum()), np.true_divide(b, b.sum())).sum() + frame.corr(method=histogram_intersection) @@ -165,8 +163,8 @@ DataFrame objects. .. ipython:: python - index = ['a', 'b', 'c', 'd', 'e'] - columns = ['one', 'two', 'three', 'four'] + index = ["a", "b", "c", "d", "e"] + columns = ["one", "two", "three", "four"] df1 = pd.DataFrame(np.random.randn(5, 4), index=index, columns=columns) df2 = pd.DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns) df1.corrwith(df2) @@ -182,8 +180,8 @@ assigned the mean of the ranks (by default) for the group: .. ipython:: python - s = pd.Series(np.random.randn(5), index=list('abcde')) - s['d'] = s['b'] # so there's a tie + s = pd.Series(np.random.randn(5), index=list("abcde")) + s["d"] = s["b"] # so there's a tie s.rank() :meth:`~DataFrame.rank` is also a DataFrame method and can rank either the rows @@ -243,8 +241,7 @@ objects, :class:`~pandas.core.window.Rolling`, :class:`~pandas.core.window.Expan .. 
ipython:: python - s = pd.Series(np.random.randn(1000), - index=pd.date_range('1/1/2000', periods=1000)) + s = pd.Series(np.random.randn(1000), index=pd.date_range("1/1/2000", periods=1000)) s = s.cumsum() s @@ -279,24 +276,26 @@ We can then call methods on these ``rolling`` objects. These return like-indexed .. ipython:: python - s.plot(style='k--') + s.plot(style="k--") @savefig rolling_mean_ex.png - r.mean().plot(style='k') + r.mean().plot(style="k") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") They can also be applied to DataFrame objects. This is really just syntactic sugar for applying the moving window operator to all of the DataFrame's columns: .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 4), - index=pd.date_range('1/1/2000', periods=1000), - columns=['A', 'B', 'C', 'D']) + df = pd.DataFrame( + np.random.randn(1000, 4), + index=pd.date_range("1/1/2000", periods=1000), + columns=["A", "B", "C", "D"], + ) df = df.cumsum() @savefig rolling_mean_frame.png @@ -368,7 +367,7 @@ compute the mean absolute deviation on a rolling basis: return np.fabs(x - x.mean()).mean() @savefig rolling_apply_ex.png - s.rolling(window=60).apply(mad, raw=True).plot(style='k') + s.rolling(window=60).apply(mad, raw=True).plot(style="k") Using the Numba engine ~~~~~~~~~~~~~~~~~~~~~~ @@ -453,23 +452,22 @@ The list of recognized types are the `scipy.signal window functions .. ipython:: python - ser = pd.Series(np.random.randn(10), - index=pd.date_range('1/1/2000', periods=10)) + ser = pd.Series(np.random.randn(10), index=pd.date_range("1/1/2000", periods=10)) - ser.rolling(window=5, win_type='triang').mean() + ser.rolling(window=5, win_type="triang").mean() Note that the ``boxcar`` window is equivalent to :meth:`~Rolling.mean`. .. ipython:: python - ser.rolling(window=5, win_type='boxcar').mean() + ser.rolling(window=5, win_type="boxcar").mean() ser.rolling(window=5).mean() For some windowing functions, additional parameters must be specified: .. ipython:: python - ser.rolling(window=5, win_type='gaussian').mean(std=0.1) + ser.rolling(window=5, win_type="gaussian").mean(std=0.1) .. _stats.moments.normalization: @@ -498,10 +496,10 @@ This can be particularly useful for a non-regular time frequency index. .. ipython:: python - dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=pd.date_range('20130101 09:00:00', - periods=5, - freq='s')) + dft = pd.DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=pd.date_range("20130101 09:00:00", periods=5, freq="s"), + ) dft This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. @@ -515,20 +513,26 @@ Specifying an offset allows a more intuitive specification of the rolling freque .. ipython:: python - dft.rolling('2s').sum() + dft.rolling("2s").sum() Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation. .. 
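As a small sketch of the difference (toy data, with a deliberate three-day
gap in the index):

.. code-block:: python

   import pandas as pd

   idx = pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-05"])
   s = pd.Series([1.0, 2.0, 3.0], index=idx)

   # An integer window counts rows and ignores the gap ...
   s.rolling(2).sum()
   # ... while an offset window spans actual elapsed time, so the value
   # after the gap starts a fresh window.
   s.rolling("2D").sum()

The same contrast on a slightly larger irregular index: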
ipython:: python - dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=pd.Index([pd.Timestamp('20130101 09:00:00'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:05'), - pd.Timestamp('20130101 09:00:06')], - name='foo')) + dft = pd.DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=pd.Index( + [ + pd.Timestamp("20130101 09:00:00"), + pd.Timestamp("20130101 09:00:02"), + pd.Timestamp("20130101 09:00:03"), + pd.Timestamp("20130101 09:00:05"), + pd.Timestamp("20130101 09:00:06"), + ], + name="foo", + ), + ) dft dft.rolling(2).sum() @@ -537,7 +541,7 @@ Using the time-specification generates variable windows for this sparse data. .. ipython:: python - dft.rolling('2s').sum() + dft.rolling("2s").sum() Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the default of the index) in a DataFrame. @@ -546,7 +550,7 @@ default of the index) in a DataFrame. dft = dft.reset_index() dft - dft.rolling('2s', on='foo').sum() + dft.rolling("2s", on="foo").sum() .. _stats.custom_rolling_window: @@ -569,7 +573,7 @@ For example, if we have the following ``DataFrame``: use_expanding = [True, False, True, False, True] use_expanding - df = pd.DataFrame({'values': range(5)}) + df = pd.DataFrame({"values": range(5)}) df and we want to use an expanding window where ``use_expanding`` is ``True`` otherwise a window of size @@ -615,7 +619,8 @@ rolling operations over a non-fixed offset like a ``BusinessDay``. .. ipython:: python from pandas.api.indexers import VariableOffsetWindowIndexer - df = pd.DataFrame(range(10), index=pd.date_range('2020', periods=10)) + + df = pd.DataFrame(range(10), index=pd.date_range("2020", periods=10)) offset = pd.offsets.BDay(1) indexer = VariableOffsetWindowIndexer(index=df.index, offset=offset) df @@ -657,17 +662,21 @@ from present information back to past information. This allows the rolling windo .. ipython:: python - df = pd.DataFrame({'x': 1}, - index=[pd.Timestamp('20130101 09:00:01'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:04'), - pd.Timestamp('20130101 09:00:06')]) - - df["right"] = df.rolling('2s', closed='right').x.sum() # default - df["both"] = df.rolling('2s', closed='both').x.sum() - df["left"] = df.rolling('2s', closed='left').x.sum() - df["neither"] = df.rolling('2s', closed='neither').x.sum() + df = pd.DataFrame( + {"x": 1}, + index=[ + pd.Timestamp("20130101 09:00:01"), + pd.Timestamp("20130101 09:00:02"), + pd.Timestamp("20130101 09:00:03"), + pd.Timestamp("20130101 09:00:04"), + pd.Timestamp("20130101 09:00:06"), + ], + ) + + df["right"] = df.rolling("2s", closed="right").x.sum() # default + df["both"] = df.rolling("2s", closed="both").x.sum() + df["left"] = df.rolling("2s", closed="left").x.sum() + df["neither"] = df.rolling("2s", closed="neither").x.sum() df @@ -745,13 +754,15 @@ For example: .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 4), - index=pd.date_range('1/1/2000', periods=1000), - columns=['A', 'B', 'C', 'D']) + df = pd.DataFrame( + np.random.randn(1000, 4), + index=pd.date_range("1/1/2000", periods=1000), + columns=["A", "B", "C", "D"], + ) df = df.cumsum() df2 = df[:20] - df2.rolling(window=5).corr(df2['B']) + df2.rolling(window=5).corr(df2["B"]) .. _stats.moments.corr_pairwise: @@ -776,14 +787,13 @@ can even be omitted: .. 
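On a toy frame, the two forms of the ``pairwise`` argument look like this
(random data, so only the shape of the output matters here):

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame(np.random.randn(6, 2), columns=["A", "B"])

   # pairwise=False matches columns by name and returns a DataFrame ...
   df.rolling(window=3).corr(df, pairwise=False)
   # ... while pairwise=True computes every column pair and stacks the
   # results into a MultiIndexed DataFrame.
   df.rolling(window=3).corr(df, pairwise=True)

The same idea on the larger example frame: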
ipython:: python - covs = (df[['B', 'C', 'D']].rolling(window=50) - .cov(df[['A', 'B', 'C']], pairwise=True)) - covs.loc['2002-09-22':] + covs = df[["B", "C", "D"]].rolling(window=50).cov(df[["A", "B", "C"]], pairwise=True) + covs.loc["2002-09-22":] .. ipython:: python correls = df.rolling(window=50).corr() - correls.loc['2002-09-22':] + correls.loc["2002-09-22":] You can efficiently retrieve the time series of correlations between two columns by reshaping and indexing: @@ -791,12 +801,12 @@ columns by reshaping and indexing: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. ipython:: python @savefig rolling_corr_pairwise_ex.png - correls.unstack(1)[('A', 'C')].plot() + correls.unstack(1)[("A", "C")].plot() .. _stats.aggregate: @@ -810,9 +820,11 @@ perform multiple computations on the data. These operations are similar to the : .. ipython:: python - dfa = pd.DataFrame(np.random.randn(1000, 3), - index=pd.date_range('1/1/2000', periods=1000), - columns=['A', 'B', 'C']) + dfa = pd.DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2000", periods=1000), + columns=["A", "B", "C"], + ) r = dfa.rolling(window=60, min_periods=1) r @@ -823,9 +835,9 @@ Series (or multiple Series) via standard ``__getitem__``. r.aggregate(np.sum) - r['A'].aggregate(np.sum) + r["A"].aggregate(np.sum) - r[['A', 'B']].aggregate(np.sum) + r[["A", "B"]].aggregate(np.sum) As you can see, the result of the aggregation will have the selected columns, or all columns if none are selected. @@ -840,7 +852,7 @@ aggregation with, outputting a DataFrame: .. ipython:: python - r['A'].agg([np.sum, np.mean, np.std]) + r["A"].agg([np.sum, np.mean, np.std]) On a windowed DataFrame, you can pass a list of functions to apply to each column, which produces an aggregated result with a hierarchical index: @@ -860,20 +872,20 @@ columns of a ``DataFrame``: .. ipython:: python - r.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)}) + r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid it must be implemented on the windowed object .. ipython:: python - r.agg({'A': 'sum', 'B': 'std'}) + r.agg({"A": "sum", "B": "std"}) Furthermore you can pass a nested dict to indicate different aggregations on different columns. .. ipython:: python - r.agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) + r.agg({"A": ["sum", "std"], "B": ["mean", "std"]}) .. _stats.moments.expanding: @@ -967,7 +979,7 @@ all accept are: sn.expanding().sum() sn.cumsum() - sn.cumsum().fillna(method='ffill') + sn.cumsum().fillna(method="ffill") An expanding window statistic will be more stable (and less responsive) than @@ -978,14 +990,14 @@ relative impact of an individual data point. As an example, here is the .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. ipython:: python - s.plot(style='k--') + s.plot(style="k--") @savefig expanding_mean_frame.png - s.expanding().mean().plot(style='k') + s.expanding().mean().plot(style="k") .. _stats.moments.exponentially_weighted: @@ -1115,10 +1127,10 @@ of ``times``. .. 
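The weights behind this are simple to reproduce by hand: each observation's
weight halves for every ``halflife`` of time between it and the newest
observation. A sketch checking the final mean against that rule (assuming
the default ``adjust=True`` and no missing values):

.. code-block:: python

   import numpy as np
   import pandas as pd

   times = pd.to_datetime(["2020-01-01", "2020-01-03", "2020-01-10"])
   s = pd.Series([0.0, 1.0, 2.0])
   halflife = pd.Timedelta("4 days")

   # Age of each point relative to the newest one, in units of halflife.
   age = np.asarray((times[-1] - times) / halflife, dtype=float)
   weights = 0.5 ** age
   manual = (weights * s.to_numpy()).sum() / weights.sum()

   result = s.ewm(halflife="4 days", times=times).mean().iloc[-1]
   np.isclose(manual, result)  # True

A version of the same computation with a missing value: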
ipython:: python - df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) df - times = ['2020-01-01', '2020-01-03', '2020-01-10', '2020-01-15', '2020-01-17'] - df.ewm(halflife='4 days', times=pd.DatetimeIndex(times)).mean() + times = ["2020-01-01", "2020-01-03", "2020-01-10", "2020-01-15", "2020-01-17"] + df.ewm(halflife="4 days", times=pd.DatetimeIndex(times)).mean() The following formula is used to compute exponentially weighted mean with an input vector of times: @@ -1130,10 +1142,10 @@ Here is an example for a univariate time series: .. ipython:: python - s.plot(style='k--') + s.plot(style="k--") @savefig ewma_ex.png - s.ewm(span=20).mean().plot(style='k') + s.ewm(span=20).mean().plot(style="k") ExponentialMovingWindow has a ``min_periods`` argument, which has the same meaning it does for all the ``.expanding`` and ``.rolling`` methods: diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index c27c73d439a0c..d698b316d321e 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -51,7 +51,7 @@ index is passed, one will be created having values ``[0, ..., len(data) - 1]``. .. ipython:: python - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) s s.index @@ -71,7 +71,7 @@ Series can be instantiated from dicts: .. ipython:: python - d = {'b': 1, 'a': 0, 'c': 2} + d = {"b": 1, "a": 0, "c": 2} pd.Series(d) .. note:: @@ -92,9 +92,9 @@ index will be pulled out. .. ipython:: python - d = {'a': 0., 'b': 1., 'c': 2.} + d = {"a": 0.0, "b": 1.0, "c": 2.0} pd.Series(d) - pd.Series(d, index=['b', 'c', 'd', 'a']) + pd.Series(d, index=["b", "c", "d", "a"]) .. note:: @@ -107,7 +107,7 @@ provided. The value will be repeated to match the length of **index**. .. ipython:: python - pd.Series(5., index=['a', 'b', 'c', 'd', 'e']) + pd.Series(5.0, index=["a", "b", "c", "d", "e"]) Series is ndarray-like ~~~~~~~~~~~~~~~~~~~~~~ @@ -173,26 +173,26 @@ label: .. ipython:: python - s['a'] - s['e'] = 12. + s["a"] + s["e"] = 12.0 s - 'e' in s - 'f' in s + "e" in s + "f" in s If a label is not contained, an exception is raised: .. code-block:: python - >>> s['f'] + >>> s["f"] KeyError: 'f' Using the ``get`` method, a missing label will return None or specified default: .. ipython:: python - s.get('f') + s.get("f") - s.get('f', np.nan) + s.get("f", np.nan) See also the :ref:`section on attribute access`. @@ -244,7 +244,7 @@ Series can also have a ``name`` attribute: .. ipython:: python - s = pd.Series(np.random.randn(5), name='something') + s = pd.Series(np.random.randn(5), name="something") s s.name @@ -306,13 +306,15 @@ keys. .. ipython:: python - d = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']), - 'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])} + d = { + "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]), + "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]), + } df = pd.DataFrame(d) df - pd.DataFrame(d, index=['d', 'b', 'a']) - pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three']) + pd.DataFrame(d, index=["d", "b", "a"]) + pd.DataFrame(d, index=["d", "b", "a"], columns=["two", "three"]) The row and column labels can be accessed respectively by accessing the **index** and **columns** attributes: @@ -336,10 +338,9 @@ result will be ``range(n)``, where ``n`` is the array length. .. 
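If the lists do not share a common length there is nothing sensible to
align, so construction fails. A quick sketch of that failure mode:

.. code-block:: python

   import pandas as pd

   try:
       pd.DataFrame({"one": [1.0, 2.0, 3.0], "two": [4.0, 3.0]})
   except ValueError as err:
       # Unequal column lengths raise a ValueError at construction time.
       print(err)

With equal-length lists, construction works with or without an explicit
index: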
ipython:: python - d = {'one': [1., 2., 3., 4.], - 'two': [4., 3., 2., 1.]} + d = {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} pd.DataFrame(d) - pd.DataFrame(d, index=['a', 'b', 'c', 'd']) + pd.DataFrame(d, index=["a", "b", "c", "d"]) From structured or record array ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -348,12 +349,12 @@ This case is handled identically to a dict of arrays. .. ipython:: python - data = np.zeros((2, ), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10')]) - data[:] = [(1, 2., 'Hello'), (2, 3., "World")] + data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "a10")]) + data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] pd.DataFrame(data) - pd.DataFrame(data, index=['first', 'second']) - pd.DataFrame(data, columns=['C', 'A', 'B']) + pd.DataFrame(data, index=["first", "second"]) + pd.DataFrame(data, columns=["C", "A", "B"]) .. note:: @@ -367,10 +368,10 @@ From a list of dicts .. ipython:: python - data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}] + data2 = [{"a": 1, "b": 2}, {"a": 5, "b": 10, "c": 20}] pd.DataFrame(data2) - pd.DataFrame(data2, index=['first', 'second']) - pd.DataFrame(data2, columns=['a', 'b']) + pd.DataFrame(data2, index=["first", "second"]) + pd.DataFrame(data2, columns=["a", "b"]) .. _basics.dataframe.from_dict_of_tuples: @@ -382,11 +383,15 @@ dictionary. .. ipython:: python - pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2}, - ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4}, - ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6}, - ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8}, - ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}}) + pd.DataFrame( + { + ("a", "b"): {("A", "B"): 1, ("A", "C"): 2}, + ("a", "a"): {("A", "C"): 3, ("A", "B"): 4}, + ("a", "c"): {("A", "B"): 5, ("A", "C"): 6}, + ("b", "a"): {("A", "C"): 7, ("A", "B"): 8}, + ("b", "b"): {("A", "D"): 9, ("A", "B"): 10}, + } + ) .. _basics.dataframe.from_series: @@ -414,11 +419,11 @@ first ``namedtuple``, a ``ValueError`` is raised. from collections import namedtuple - Point = namedtuple('Point', 'x y') + Point = namedtuple("Point", "x y") pd.DataFrame([Point(0, 0), Point(0, 3), (2, 3)]) - Point3D = namedtuple('Point3D', 'x y z') + Point3D = namedtuple("Point3D", "x y z") pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)]) @@ -468,15 +473,18 @@ set to ``'index'`` in order to use the dict keys as row labels. .. ipython:: python - pd.DataFrame.from_dict(dict([('A', [1, 2, 3]), ('B', [4, 5, 6])])) + pd.DataFrame.from_dict(dict([("A", [1, 2, 3]), ("B", [4, 5, 6])])) If you pass ``orient='index'``, the keys will be the row labels. In this case, you can also pass the desired column names: .. ipython:: python - pd.DataFrame.from_dict(dict([('A', [1, 2, 3]), ('B', [4, 5, 6])]), - orient='index', columns=['one', 'two', 'three']) + pd.DataFrame.from_dict( + dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]), + orient="index", + columns=["one", "two", "three"], + ) .. _basics.dataframe.from_records: @@ -490,7 +498,7 @@ dtype. For example: .. ipython:: python data - pd.DataFrame.from_records(data, index='C') + pd.DataFrame.from_records(data, index="C") .. _basics.dataframe.sel_add_del: @@ -503,17 +511,17 @@ the analogous dict operations: .. ipython:: python - df['one'] - df['three'] = df['one'] * df['two'] - df['flag'] = df['one'] > 2 + df["one"] + df["three"] = df["one"] * df["two"] + df["flag"] = df["one"] > 2 df Columns can be deleted or popped like with a dict: .. 
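When the original frame should be left untouched, :meth:`~DataFrame.drop`
is the non-mutating counterpart, as in this small sketch:

.. code-block:: python

   import pandas as pd

   df2 = pd.DataFrame({"one": [1.0, 2.0], "two": [3.0, 4.0]})

   # ``drop`` returns a new DataFrame; ``df2`` itself is unchanged.
   trimmed = df2.drop(columns=["two"])

The dict-style, in-place equivalents look like this: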
ipython:: python - del df['two'] - three = df.pop('three') + del df["two"] + three = df.pop("three") df When inserting a scalar value, it will naturally be propagated to fill the @@ -521,7 +529,7 @@ column: .. ipython:: python - df['foo'] = 'bar' + df["foo"] = "bar" df When inserting a Series that does not have the same index as the DataFrame, it @@ -529,7 +537,7 @@ will be conformed to the DataFrame's index: .. ipython:: python - df['one_trunc'] = df['one'][:2] + df["one_trunc"] = df["one"][:2] df You can insert raw ndarrays but their length must match the length of the @@ -540,7 +548,7 @@ available to insert at a particular location in the columns: .. ipython:: python - df.insert(1, 'bar', df['one']) + df.insert(1, "bar", df["one"]) df .. _dsintro.chained_assignment: @@ -556,17 +564,16 @@ derived from existing columns. .. ipython:: python - iris = pd.read_csv('data/iris.data') + iris = pd.read_csv("data/iris.data") iris.head() - (iris.assign(sepal_ratio=iris['SepalWidth'] / iris['SepalLength']) - .head()) + iris.assign(sepal_ratio=iris["SepalWidth"] / iris["SepalLength"]).head() In the example above, we inserted a precomputed value. We can also pass in a function of one argument to be evaluated on the DataFrame being assigned to. .. ipython:: python - iris.assign(sepal_ratio=lambda x: (x['SepalWidth'] / x['SepalLength'])).head() + iris.assign(sepal_ratio=lambda x: (x["SepalWidth"] / x["SepalLength"])).head() ``assign`` **always** returns a copy of the data, leaving the original DataFrame untouched. @@ -580,10 +587,14 @@ greater than 5, calculate the ratio, and plot: .. ipython:: python @savefig basics_assign.png - (iris.query('SepalLength > 5') - .assign(SepalRatio=lambda x: x.SepalWidth / x.SepalLength, - PetalRatio=lambda x: x.PetalWidth / x.PetalLength) - .plot(kind='scatter', x='SepalRatio', y='PetalRatio')) + ( + iris.query("SepalLength > 5") + .assign( + SepalRatio=lambda x: x.SepalWidth / x.SepalLength, + PetalRatio=lambda x: x.PetalWidth / x.PetalLength, + ) + .plot(kind="scatter", x="SepalRatio", y="PetalRatio") + ) Since a function is passed in, the function is computed on the DataFrame being assigned to. Importantly, this is the DataFrame that's been filtered @@ -603,10 +614,8 @@ to a column created earlier in the same :meth:`~DataFrame.assign`. .. ipython:: python - dfa = pd.DataFrame({"A": [1, 2, 3], - "B": [4, 5, 6]}) - dfa.assign(C=lambda x: x['A'] + x['B'], - D=lambda x: x['A'] + x['C']) + dfa = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + dfa.assign(C=lambda x: x["A"] + x["B"], D=lambda x: x["A"] + x["C"]) In the second expression, ``x['C']`` will refer to the newly created column, that's equal to ``dfa['A'] + dfa['B']``. @@ -631,7 +640,7 @@ DataFrame: .. ipython:: python - df.loc['b'] + df.loc["b"] df.iloc[2] For a more exhaustive treatment of sophisticated label-based indexing and @@ -650,8 +659,8 @@ union of the column and row labels. .. ipython:: python - df = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D']) - df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C']) + df = pd.DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"]) + df2 = pd.DataFrame(np.random.randn(7, 3), columns=["A", "B", "C"]) df + df2 When doing an operation between DataFrame and Series, the default behavior is @@ -680,8 +689,8 @@ Boolean operators work as well: .. 
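Note that these are the *bitwise* operators ``&``, ``|``, ``^`` and ``~``;
the Python keywords ``and`` / ``or`` do not work elementwise, because the
truth value of a whole DataFrame is ambiguous. A short sketch:

.. code-block:: python

   import pandas as pd

   left = pd.DataFrame({"a": [True, False]})
   right = pd.DataFrame({"a": [False, False]})

   ~left          # elementwise negation
   left & right   # elementwise "and"
   # ``left and right`` would raise a ValueError instead.

On slightly larger frames: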
ipython:: python - df1 = pd.DataFrame({'a': [1, 0, 1], 'b': [0, 1, 1]}, dtype=bool) - df2 = pd.DataFrame({'a': [0, 1, 1], 'b': [1, 1, 0]}, dtype=bool) + df1 = pd.DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}, dtype=bool) + df2 = pd.DataFrame({"a": [0, 1, 1], "b": [1, 1, 0]}, dtype=bool) df1 & df2 df1 | df2 df1 ^ df2 @@ -737,8 +746,8 @@ on two :class:`Series` with differently ordered labels will align before the ope .. ipython:: python - ser1 = pd.Series([1, 2, 3], index=['a', 'b', 'c']) - ser2 = pd.Series([1, 3, 5], index=['b', 'a', 'c']) + ser1 = pd.Series([1, 2, 3], index=["a", "b", "c"]) + ser2 = pd.Series([1, 3, 5], index=["b", "a", "c"]) ser1 ser2 np.remainder(ser1, ser2) @@ -748,7 +757,7 @@ with missing values. .. ipython:: python - ser3 = pd.Series([2, 4, 6], index=['b', 'c', 'd']) + ser3 = pd.Series([2, 4, 6], index=["b", "c", "d"]) ser3 np.remainder(ser1, ser3) @@ -778,11 +787,11 @@ R package): :suppress: # force a summary to be printed - pd.set_option('display.max_rows', 5) + pd.set_option("display.max_rows", 5) .. ipython:: python - baseball = pd.read_csv('data/baseball.csv') + baseball = pd.read_csv("data/baseball.csv") print(baseball) baseball.info() @@ -791,7 +800,7 @@ R package): :okwarning: # restore GlobalPrintConfig - pd.reset_option(r'^display\.') + pd.reset_option(r"^display\.") However, using ``to_string`` will return a string representation of the DataFrame in tabular form, though it won't always fit the console width: @@ -812,7 +821,7 @@ option: .. ipython:: python - pd.set_option('display.width', 40) # default is 80 + pd.set_option("display.width", 40) # default is 80 pd.DataFrame(np.random.randn(3, 12)) @@ -820,21 +829,25 @@ You can adjust the max width of the individual columns by setting ``display.max_ .. ipython:: python - datafile = {'filename': ['filename_01', 'filename_02'], - 'path': ["media/user_name/storage/folder_01/filename_01", - "media/user_name/storage/folder_02/filename_02"]} + datafile = { + "filename": ["filename_01", "filename_02"], + "path": [ + "media/user_name/storage/folder_01/filename_01", + "media/user_name/storage/folder_02/filename_02", + ], + } - pd.set_option('display.max_colwidth', 30) + pd.set_option("display.max_colwidth", 30) pd.DataFrame(datafile) - pd.set_option('display.max_colwidth', 100) + pd.set_option("display.max_colwidth", 100) pd.DataFrame(datafile) .. ipython:: python :suppress: - pd.reset_option('display.width') - pd.reset_option('display.max_colwidth') + pd.reset_option("display.width") + pd.reset_option("display.max_colwidth") You can also disable this feature via the ``expand_frame_repr`` option. This will print the table in one block. @@ -847,8 +860,7 @@ accessed like an attribute: .. ipython:: python - df = pd.DataFrame({'foo1': np.random.randn(5), - 'foo2': np.random.randn(5)}) + df = pd.DataFrame({"foo1": np.random.randn(5), "foo2": np.random.randn(5)}) df df.foo1 diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index f41912445455d..46ab29a52747a 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -11,7 +11,8 @@ We use the standard convention for referencing the matplotlib API: .. ipython:: python import matplotlib.pyplot as plt - plt.close('all') + + plt.close("all") We provide the basics in pandas to easily create decent looking plots. See the :ref:`ecosystem ` section for visualization @@ -39,8 +40,7 @@ The ``plot`` method on Series and DataFrame is just a simple wrapper around .. 
ipython:: python - ts = pd.Series(np.random.randn(1000), - index=pd.date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), index=pd.date_range("1/1/2000", periods=1000)) ts = ts.cumsum() @savefig series_plot_basic.png @@ -54,18 +54,17 @@ On DataFrame, :meth:`~DataFrame.plot` is a convenience to plot all of the column .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 4), - index=ts.index, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list("ABCD")) df = df.cumsum() plt.figure(); @savefig frame_plot_basic.png - df.plot(); + df.plot() You can plot one column versus another using the ``x`` and ``y`` keywords in :meth:`~DataFrame.plot`: @@ -73,17 +72,17 @@ You can plot one column versus another using the ``x`` and ``y`` keywords in .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() np.random.seed(123456) .. ipython:: python - df3 = pd.DataFrame(np.random.randn(1000, 2), columns=['B', 'C']).cumsum() - df3['A'] = pd.Series(list(range(len(df)))) + df3 = pd.DataFrame(np.random.randn(1000, 2), columns=["B", "C"]).cumsum() + df3["A"] = pd.Series(list(range(len(df)))) @savefig df_plot_xy.png - df3.plot(x='A', y='B') + df3.plot(x="A", y="B") .. note:: @@ -93,7 +92,7 @@ You can plot one column versus another using the ``x`` and ``y`` keywords in .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.other: @@ -120,7 +119,7 @@ For example, a bar plot can be created the following way: plt.figure(); @savefig bar_plot_ex.png - df.iloc[5].plot(kind='bar'); + df.iloc[5].plot(kind="bar") You can also create these other plots using the methods ``DataFrame.plot.`` instead of providing the ``kind`` keyword argument. This makes it easier to discover plot methods and the specific arguments they use: @@ -164,7 +163,7 @@ For labeled, non-time series data, you may wish to produce a bar plot: @savefig bar_plot_ex.png df.iloc[5].plot.bar() - plt.axhline(0, color='k'); + plt.axhline(0, color="k") Calling a DataFrame's :meth:`plot.bar() ` method produces a multiple bar plot: @@ -172,42 +171,42 @@ bar plot: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() np.random.seed(123456) .. ipython:: python - df2 = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd']) + df2 = pd.DataFrame(np.random.rand(10, 4), columns=["a", "b", "c", "d"]) @savefig bar_plot_multi_ex.png - df2.plot.bar(); + df2.plot.bar() To produce a stacked bar plot, pass ``stacked=True``: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python @savefig bar_plot_stacked_ex.png - df2.plot.bar(stacked=True); + df2.plot.bar(stacked=True) To get horizontal bar plots, use the ``barh`` method: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python @savefig barh_plot_stacked_ex.png - df2.plot.barh(stacked=True); + df2.plot.barh(stacked=True) .. _visualization.hist: @@ -218,8 +217,14 @@ Histograms can be drawn by using the :meth:`DataFrame.plot.hist` and :meth:`Seri .. 
ipython:: python - df4 = pd.DataFrame({'a': np.random.randn(1000) + 1, 'b': np.random.randn(1000), - 'c': np.random.randn(1000) - 1}, columns=['a', 'b', 'c']) + df4 = pd.DataFrame( + { + "a": np.random.randn(1000) + 1, + "b": np.random.randn(1000), + "c": np.random.randn(1000) - 1, + }, + columns=["a", "b", "c"], + ) plt.figure(); @@ -230,7 +235,7 @@ Histograms can be drawn by using the :meth:`DataFrame.plot.hist` and :meth:`Seri .. ipython:: python :suppress: - plt.close('all') + plt.close("all") A histogram can be stacked using ``stacked=True``. Bin size can be changed using the ``bins`` keyword. @@ -245,7 +250,7 @@ using the ``bins`` keyword. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") You can pass other keywords supported by matplotlib ``hist``. For example, horizontal and cumulative histograms can be drawn by @@ -256,12 +261,12 @@ horizontal and cumulative histograms can be drawn by plt.figure(); @savefig hist_new_kwargs.png - df4['a'].plot.hist(orientation='horizontal', cumulative=True) + df4["a"].plot.hist(orientation="horizontal", cumulative=True) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") See the :meth:`hist ` method and the `matplotlib hist documentation `__ for more. @@ -274,12 +279,12 @@ The existing interface ``DataFrame.hist`` to plot histogram still can be used. plt.figure(); @savefig hist_plot_ex.png - df['A'].diff().hist() + df["A"].diff().hist() .. ipython:: python :suppress: - plt.close('all') + plt.close("all") :meth:`DataFrame.hist` plots the histograms of the columns on multiple subplots: @@ -289,7 +294,7 @@ subplots: plt.figure() @savefig frame_hist_ex.png - df.diff().hist(color='k', alpha=0.5, bins=50) + df.diff().hist(color="k", alpha=0.5, bins=50) The ``by`` keyword can be specified to plot grouped histograms: @@ -297,7 +302,7 @@ The ``by`` keyword can be specified to plot grouped histograms: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() np.random.seed(123456) @@ -323,12 +328,12 @@ a uniform random variable on [0,1). .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. ipython:: python - df = pd.DataFrame(np.random.rand(10, 5), columns=['A', 'B', 'C', 'D', 'E']) + df = pd.DataFrame(np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"]) @savefig box_plot_new.png df.plot.box() @@ -348,16 +353,20 @@ more complicated colorization, you can get each drawn artists by passing .. ipython:: python - color = {'boxes': 'DarkGreen', 'whiskers': 'DarkOrange', - 'medians': 'DarkBlue', 'caps': 'Gray'} + color = { + "boxes": "DarkGreen", + "whiskers": "DarkOrange", + "medians": "DarkBlue", + "caps": "Gray", + } @savefig box_new_colorize.png - df.plot.box(color=color, sym='r+') + df.plot.box(color=color, sym="r+") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Also, you can pass other keywords supported by matplotlib ``boxplot``. For example, horizontal and custom-positioned boxplot can be drawn by @@ -378,7 +387,7 @@ The existing interface ``DataFrame.boxplot`` to plot boxplot still can be used. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. ipython:: python @@ -396,19 +405,19 @@ groupings. For instance, .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. 
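The idea in its smallest form (made-up labels, one box per group):

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame({"value": np.random.randn(8), "group": ["A", "B"] * 4})

   # Draw one box of ``value`` per unique entry of ``group``.
   df.boxplot(column="value", by="group")

And with randomly generated data over two value columns: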
ipython:: python :okwarning: - df = pd.DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2']) - df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) + df = pd.DataFrame(np.random.rand(10, 2), columns=["Col1", "Col2"]) + df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"]) - plt.figure(); + plt.figure() @savefig box_plot_ex2.png - bp = df.boxplot(by='X') + bp = df.boxplot(by="X") You can also pass a subset of columns to plot, as well as group by multiple columns: @@ -416,25 +425,25 @@ columns: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. ipython:: python :okwarning: - df = pd.DataFrame(np.random.rand(10, 3), columns=['Col1', 'Col2', 'Col3']) - df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) - df['Y'] = pd.Series(['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B']) + df = pd.DataFrame(np.random.rand(10, 3), columns=["Col1", "Col2", "Col3"]) + df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"]) + df["Y"] = pd.Series(["A", "B", "A", "B", "A", "B", "A", "B", "A", "B"]) plt.figure(); @savefig box_plot_ex3.png - bp = df.boxplot(column=['Col1', 'Col2'], by=['X', 'Y']) + bp = df.boxplot(column=["Col1", "Col2"], by=["X", "Y"]) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.box.return: @@ -462,16 +471,16 @@ keyword, will affect the output type as well: np.random.seed(1234) df_box = pd.DataFrame(np.random.randn(50, 2)) - df_box['g'] = np.random.choice(['A', 'B'], size=50) - df_box.loc[df_box['g'] == 'B', 1] += 3 + df_box["g"] = np.random.choice(["A", "B"], size=50) + df_box.loc[df_box["g"] == "B", 1] += 3 @savefig boxplot_groupby.png - bp = df_box.boxplot(by='g') + bp = df_box.boxplot(by="g") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") The subplots above are split by the numeric columns first, then the value of the ``g`` column. Below the subplots are first split by the value of ``g``, @@ -481,12 +490,12 @@ then by the numeric columns. :okwarning: @savefig groupby_boxplot_vis.png - bp = df_box.groupby('g').boxplot() + bp = df_box.groupby("g").boxplot() .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.area_plot: @@ -506,23 +515,23 @@ When input data contains ``NaN``, it will be automatically filled by 0. If you w .. ipython:: python - df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd']) + df = pd.DataFrame(np.random.rand(10, 4), columns=["a", "b", "c", "d"]) @savefig area_plot_stacked.png - df.plot.area(); + df.plot.area() To produce an unstacked plot, pass ``stacked=False``. Alpha value is set to 0.5 unless otherwise specified: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python @savefig area_plot_unstacked.png - df.plot.area(stacked=False); + df.plot.area(stacked=False) .. _visualization.scatter: @@ -537,29 +546,29 @@ These can be specified by the ``x`` and ``y`` keywords. :suppress: np.random.seed(123456) - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python - df = pd.DataFrame(np.random.rand(50, 4), columns=['a', 'b', 'c', 'd']) + df = pd.DataFrame(np.random.rand(50, 4), columns=["a", "b", "c", "d"]) @savefig scatter_plot.png - df.plot.scatter(x='a', y='b'); + df.plot.scatter(x="a", y="b") To plot multiple column groups in a single axes, repeat ``plot`` method specifying target ``ax``. It is recommended to specify ``color`` and ``label`` keywords to distinguish each groups. .. 
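When there are more than a couple of groups, the same pattern is usually
written as a loop over ``groupby``, reusing the returned axes each time (a
sketch with an invented ``label`` column):

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame(np.random.rand(20, 2), columns=["x", "y"])
   df["label"] = ["one", "two"] * 10

   colors = {"one": "DarkBlue", "two": "DarkGreen"}
   ax = None
   for name, group in df.groupby("label"):
       # The first call creates the axes; later calls draw into them.
       ax = group.plot.scatter(x="x", y="y", label=name, color=colors[name], ax=ax)

For two groups, doing it by hand is just as clear: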
ipython:: python - ax = df.plot.scatter(x='a', y='b', color='DarkBlue', label='Group 1'); + ax = df.plot.scatter(x="a", y="b", color="DarkBlue", label="Group 1") @savefig scatter_plot_repeated.png - df.plot.scatter(x='c', y='d', color='DarkGreen', label='Group 2', ax=ax); + df.plot.scatter(x="c", y="d", color="DarkGreen", label="Group 2", ax=ax) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") The keyword ``c`` may be given as the name of a column to provide colors for each point: @@ -567,13 +576,13 @@ each point: .. ipython:: python @savefig scatter_plot_colored.png - df.plot.scatter(x='a', y='b', c='c', s=50); + df.plot.scatter(x="a", y="b", c="c", s=50) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") You can pass other keywords supported by matplotlib :meth:`scatter `. The example below shows a @@ -582,12 +591,12 @@ bubble chart using a column of the ``DataFrame`` as the bubble size. .. ipython:: python @savefig scatter_plot_bubble.png - df.plot.scatter(x='a', y='b', s=df['c'] * 200); + df.plot.scatter(x="a", y="b", s=df["c"] * 200) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") See the :meth:`scatter ` method and the `matplotlib scatter documentation `__ for more. @@ -609,11 +618,11 @@ too dense to plot each point individually. .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 2), columns=['a', 'b']) - df['b'] = df['b'] + np.arange(1000) + df = pd.DataFrame(np.random.randn(1000, 2), columns=["a", "b"]) + df["b"] = df["b"] + np.arange(1000) @savefig hexbin_plot.png - df.plot.hexbin(x='a', y='b', gridsize=25) + df.plot.hexbin(x="a", y="b", gridsize=25) A useful keyword argument is ``gridsize``; it controls the number of hexagons @@ -631,23 +640,23 @@ given by column ``z``. The bins are aggregated with NumPy's ``max`` function. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() np.random.seed(123456) .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 2), columns=['a', 'b']) - df['b'] = df['b'] = df['b'] + np.arange(1000) - df['z'] = np.random.uniform(0, 3, 1000) + df = pd.DataFrame(np.random.randn(1000, 2), columns=["a", "b"]) + df["b"] = df["b"] = df["b"] + np.arange(1000) + df["z"] = np.random.uniform(0, 3, 1000) @savefig hexbin_plot_agg.png - df.plot.hexbin(x='a', y='b', C='z', reduce_C_function=np.max, gridsize=25) + df.plot.hexbin(x="a", y="b", C="z", reduce_C_function=np.max, gridsize=25) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") See the :meth:`hexbin ` method and the `matplotlib hexbin documentation `__ for more. @@ -670,8 +679,7 @@ A ``ValueError`` will be raised if there are any negative values in your data. .. ipython:: python :okwarning: - series = pd.Series(3 * np.random.rand(4), - index=['a', 'b', 'c', 'd'], name='series') + series = pd.Series(3 * np.random.rand(4), index=["a", "b", "c", "d"], name="series") @savefig series_pie_plot.png series.plot.pie(figsize=(6, 6)) @@ -679,7 +687,7 @@ A ``ValueError`` will be raised if there are any negative values in your data. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") For pie plots it's best to use square figures, i.e. a figure aspect ratio 1. You can create the figure with equal width and height, or force the aspect ratio @@ -700,8 +708,9 @@ drawn in each pie plots by default; specify ``legend=False`` to hide it. .. 
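For example, suppressing the legends while keeping one pie per column:

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame(
       np.random.rand(4, 2), index=["a", "b", "c", "d"], columns=["x", "y"]
   )

   # One pie per column, without the per-axes legends.
   df.plot.pie(subplots=True, legend=False, figsize=(8, 4))

With the legends left in place: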
ipython:: python - df = pd.DataFrame(3 * np.random.rand(4, 2), - index=['a', 'b', 'c', 'd'], columns=['x', 'y']) + df = pd.DataFrame( + 3 * np.random.rand(4, 2), index=["a", "b", "c", "d"], columns=["x", "y"] + ) @savefig df_pie_plot.png df.plot.pie(subplots=True, figsize=(8, 4)) @@ -709,7 +718,7 @@ drawn in each pie plots by default; specify ``legend=False`` to hide it. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") You can use the ``labels`` and ``colors`` keywords to specify the labels and colors of each wedge. @@ -731,21 +740,26 @@ Also, other keywords supported by :func:`matplotlib.pyplot.pie` can be used. .. ipython:: python @savefig series_pie_plot_options.png - series.plot.pie(labels=['AA', 'BB', 'CC', 'DD'], colors=['r', 'g', 'b', 'c'], - autopct='%.2f', fontsize=20, figsize=(6, 6)) + series.plot.pie( + labels=["AA", "BB", "CC", "DD"], + colors=["r", "g", "b", "c"], + autopct="%.2f", + fontsize=20, + figsize=(6, 6), + ) If you pass values whose sum total is less than 1.0, matplotlib draws a semicircle. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python :okwarning: - series = pd.Series([0.1] * 4, index=['a', 'b', 'c', 'd'], name='series2') + series = pd.Series([0.1] * 4, index=["a", "b", "c", "d"], name="series2") @savefig series_pie_plot_semi.png series.plot.pie(figsize=(6, 6)) @@ -755,7 +769,7 @@ See the `matplotlib pie documentation `__ for more. @@ -1560,12 +1574,12 @@ To use the cubehelix colormap, we can pass ``colormap='cubehelix'``. plt.figure() @savefig cubehelix.png - df.plot(colormap='cubehelix') + df.plot(colormap="cubehelix") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Alternatively, we can pass the colormap itself: @@ -1581,7 +1595,7 @@ Alternatively, we can pass the colormap itself: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Colormaps can also be used other plot types, like bar charts: @@ -1598,12 +1612,12 @@ Colormaps can also be used other plot types, like bar charts: plt.figure() @savefig greens.png - dd.plot.bar(colormap='Greens') + dd.plot.bar(colormap="Greens") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Parallel coordinates charts: @@ -1612,12 +1626,12 @@ Parallel coordinates charts: plt.figure() @savefig parallel_gist_rainbow.png - parallel_coordinates(data, 'Name', colormap='gist_rainbow') + parallel_coordinates(data, "Name", colormap="gist_rainbow") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Andrews curves charts: @@ -1626,12 +1640,12 @@ Andrews curves charts: plt.figure() @savefig andrews_curve_winter.png - andrews_curves(data, 'Name', colormap='winter') + andrews_curves(data, "Name", colormap="winter") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Plotting directly with matplotlib --------------------------------- @@ -1655,23 +1669,24 @@ when plotting a large number of points. .. 
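pandas objects can be handed straight to matplotlib, since a Series behaves
like the arrays matplotlib expects and its index supplies the x axis. A
minimal sketch:

.. code-block:: python

   import matplotlib.pyplot as plt
   import numpy as np
   import pandas as pd

   s = pd.Series(
       np.random.randn(100).cumsum(), index=pd.date_range("2000-01-01", periods=100)
   )

   # Plot the raw series with the plain matplotlib API.
   plt.plot(s.index, s, "k")

The fuller example below combines this with rolling statistics to draw
Bollinger bands: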
ipython:: python - price = pd.Series(np.random.randn(150).cumsum(), - index=pd.date_range('2000-1-1', periods=150, freq='B')) + price = pd.Series( + np.random.randn(150).cumsum(), + index=pd.date_range("2000-1-1", periods=150, freq="B"), + ) ma = price.rolling(20).mean() mstd = price.rolling(20).std() plt.figure() - plt.plot(price.index, price, 'k') - plt.plot(ma.index, ma, 'b') + plt.plot(price.index, price, "k") + plt.plot(ma.index, ma, "b") @savefig bollinger.png - plt.fill_between(mstd.index, ma - 2 * mstd, ma + 2 * mstd, - color='b', alpha=0.2) + plt.fill_between(mstd.index, ma - 2 * mstd, ma + 2 * mstd, color="b", alpha=0.2) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Plotting backends ----------------- @@ -1685,21 +1700,21 @@ function. For example: .. code-block:: python - >>> Series([1, 2, 3]).plot(backend='backend.module') + >>> Series([1, 2, 3]).plot(backend="backend.module") Alternatively, you can also set this option globally, do you don't need to specify the keyword in each ``plot`` call. For example: .. code-block:: python - >>> pd.set_option('plotting.backend', 'backend.module') + >>> pd.set_option("plotting.backend", "backend.module") >>> pd.Series([1, 2, 3]).plot() Or: .. code-block:: python - >>> pd.options.plotting.backend = 'backend.module' + >>> pd.options.plotting.backend = "backend.module" >>> pd.Series([1, 2, 3]).plot() This would be more or less equivalent to: From f3125ab3da1f5185f22b41a28e8bf0c9d878d122 Mon Sep 17 00:00:00 2001 From: John Karasinski Date: Sat, 3 Oct 2020 07:30:49 -0700 Subject: [PATCH 05/38] DOC: update code style for user guide for #36777 (#36823) --- doc/source/user_guide/groupby.rst | 463 ++++++++-------- doc/source/user_guide/io.rst | 12 +- doc/source/user_guide/missing_data.rst | 137 ++--- doc/source/user_guide/scale.rst | 35 +- doc/source/user_guide/timeseries.rst | 713 +++++++++++++------------ 5 files changed, 717 insertions(+), 643 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 52342de98de79..9696f14f03b56 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -68,19 +68,23 @@ object (more on what the GroupBy object is later), you may do the following: .. ipython:: python - df = pd.DataFrame([('bird', 'Falconiformes', 389.0), - ('bird', 'Psittaciformes', 24.0), - ('mammal', 'Carnivora', 80.2), - ('mammal', 'Primates', np.nan), - ('mammal', 'Carnivora', 58)], - index=['falcon', 'parrot', 'lion', 'monkey', 'leopard'], - columns=('class', 'order', 'max_speed')) + df = pd.DataFrame( + [ + ("bird", "Falconiformes", 389.0), + ("bird", "Psittaciformes", 24.0), + ("mammal", "Carnivora", 80.2), + ("mammal", "Primates", np.nan), + ("mammal", "Carnivora", 58), + ], + index=["falcon", "parrot", "lion", "monkey", "leopard"], + columns=("class", "order", "max_speed"), + ) df # default is axis=0 - grouped = df.groupby('class') - grouped = df.groupby('order', axis='columns') - grouped = df.groupby(['class', 'order']) + grouped = df.groupby("class") + grouped = df.groupby("order", axis="columns") + grouped = df.groupby(["class", "order"]) The mapping can be specified many different ways: @@ -103,12 +107,14 @@ consider the following ``DataFrame``: .. 
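The split-apply-combine idea fits in a couple of lines (a toy frame with a
single ``key`` column):

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({"key": ["a", "b", "a", "b"], "value": [1, 2, 3, 4]})

   # Split on ``key``, apply ``sum`` per group, combine into a Series.
   df.groupby("key")["value"].sum()

On a richer frame, grouping works the same way: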
ipython:: python - df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) df On a DataFrame, we obtain a GroupBy object by calling :meth:`~DataFrame.groupby`. @@ -116,8 +122,8 @@ We could naturally group by either the ``A`` or ``B`` columns, or both: .. ipython:: python - grouped = df.groupby('A') - grouped = df.groupby(['A', 'B']) + grouped = df.groupby("A") + grouped = df.groupby(["A", "B"]) .. versionadded:: 0.24 @@ -126,8 +132,8 @@ but the specified columns .. ipython:: python - df2 = df.set_index(['A', 'B']) - grouped = df2.groupby(level=df2.index.names.difference(['B'])) + df2 = df.set_index(["A", "B"]) + grouped = df2.groupby(level=df2.index.names.difference(["B"])) grouped.sum() These will split the DataFrame on its index (rows). We could also split by the @@ -181,9 +187,9 @@ By default the group keys are sorted during the ``groupby`` operation. You may h .. ipython:: python - df2 = pd.DataFrame({'X': ['B', 'B', 'A', 'A'], 'Y': [1, 2, 3, 4]}) - df2.groupby(['X']).sum() - df2.groupby(['X'], sort=False).sum() + df2 = pd.DataFrame({"X": ["B", "B", "A", "A"], "Y": [1, 2, 3, 4]}) + df2.groupby(["X"]).sum() + df2.groupby(["X"], sort=False).sum() Note that ``groupby`` will preserve the order in which *observations* are sorted *within* each group. @@ -191,10 +197,10 @@ For example, the groups created by ``groupby()`` below are in the order they app .. ipython:: python - df3 = pd.DataFrame({'X': ['A', 'B', 'A', 'B'], 'Y': [1, 4, 3, 2]}) - df3.groupby(['X']).get_group('A') + df3 = pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}) + df3.groupby(["X"]).get_group("A") - df3.groupby(['X']).get_group('B') + df3.groupby(["X"]).get_group("B") .. _groupby.dropna: @@ -236,7 +242,7 @@ above example we have: .. ipython:: python - df.groupby('A').groups + df.groupby("A").groups df.groupby(get_letter_type, axis=1).groups Calling the standard Python ``len`` function on the GroupBy object just returns @@ -244,7 +250,7 @@ the length of the ``groups`` dict, so it is largely just a convenience: .. ipython:: python - grouped = df.groupby(['A', 'B']) + grouped = df.groupby(["A", "B"]) grouped.groups len(grouped) @@ -259,15 +265,14 @@ the length of the ``groups`` dict, so it is largely just a convenience: n = 10 weight = np.random.normal(166, 20, size=n) height = np.random.normal(60, 10, size=n) - time = pd.date_range('1/1/2000', periods=n) - gender = np.random.choice(['male', 'female'], size=n) - df = pd.DataFrame({'height': height, 'weight': weight, - 'gender': gender}, index=time) + time = pd.date_range("1/1/2000", periods=n) + gender = np.random.choice(["male", "female"], size=n) + df = pd.DataFrame({"height": height, "weight": weight, "gender": gender}, index=time) .. ipython:: python df - gb = df.groupby('gender') + gb = df.groupby("gender") .. ipython:: @@ -291,9 +296,11 @@ Let's create a Series with a two-level ``MultiIndex``. .. 
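In miniature, grouping by an index level can be requested either by level
number or by level name:

.. code-block:: python

   import pandas as pd

   idx = pd.MultiIndex.from_tuples(
       [("bar", "one"), ("bar", "two"), ("baz", "one")], names=["first", "second"]
   )
   s = pd.Series([1.0, 2.0, 3.0], index=idx)

   # Equivalent ways to group on the outermost level.
   s.groupby(level=0).sum()
   s.groupby(level="first").sum()

The longer example below walks through the same machinery step by step: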
ipython:: python - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] - index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second']) + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"]) s = pd.Series(np.random.randn(8), index=index) s @@ -309,7 +316,7 @@ number: .. ipython:: python - s.groupby(level='second').sum() + s.groupby(level="second").sum() The aggregation functions such as ``sum`` will take the level parameter directly. Additionally, the resulting index will be named according to the @@ -317,30 +324,32 @@ chosen level: .. ipython:: python - s.sum(level='second') + s.sum(level="second") Grouping with multiple levels is supported. .. ipython:: python :suppress: - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['doo', 'doo', 'bee', 'bee', 'bop', 'bop', 'bop', 'bop'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["doo", "doo", "bee", "bee", "bop", "bop", "bop", "bop"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] tuples = list(zip(*arrays)) - index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second', 'third']) + index = pd.MultiIndex.from_tuples(tuples, names=["first", "second", "third"]) s = pd.Series(np.random.randn(8), index=index) .. ipython:: python s - s.groupby(level=['first', 'second']).sum() + s.groupby(level=["first", "second"]).sum() Index level names may be supplied as keys. .. ipython:: python - s.groupby(['first', 'second']).sum() + s.groupby(["first", "second"]).sum() More on the ``sum`` function and aggregation later. @@ -352,14 +361,14 @@ objects. .. ipython:: python - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] - index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second']) + index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"]) - df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 3, 3], - 'B': np.arange(8)}, - index=index) + df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 3, 3], "B": np.arange(8)}, index=index) df @@ -368,19 +377,19 @@ the ``A`` column. .. ipython:: python - df.groupby([pd.Grouper(level=1), 'A']).sum() + df.groupby([pd.Grouper(level=1), "A"]).sum() Index levels may also be specified by name. .. ipython:: python - df.groupby([pd.Grouper(level='second'), 'A']).sum() + df.groupby([pd.Grouper(level="second"), "A"]).sum() Index level names may be specified as keys directly to ``groupby``. .. ipython:: python - df.groupby(['second', 'A']).sum() + df.groupby(["second", "A"]).sum() DataFrame column selection in GroupBy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -392,24 +401,26 @@ getting a column from a DataFrame, you can do: .. 
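
A compact check of what this sugar amounts to, using an illustrative frame
(the more verbose equivalent is spelled out just below):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"A": ["x", "y", "x"], "C": [1.0, 2.0, 3.0]})
    # selecting a column from the GroupBy reuses the grouping already computed
    left = df.groupby("A")["C"].sum()
    right = df["C"].groupby(df["A"]).sum()
    print(left.equals(right))  # True

.. 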
ipython:: python :suppress: - df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) .. ipython:: python - grouped = df.groupby(['A']) - grouped_C = grouped['C'] - grouped_D = grouped['D'] + grouped = df.groupby(["A"]) + grouped_C = grouped["C"] + grouped_D = grouped["D"] This is mainly syntactic sugar for the alternative and much more verbose: .. ipython:: python - df['C'].groupby(df['A']) + df["C"].groupby(df["A"]) Additionally this method avoids recomputing the internal grouping information derived from the passed key. @@ -450,13 +461,13 @@ A single group can be selected using .. ipython:: python - grouped.get_group('bar') + grouped.get_group("bar") Or for an object grouped on multiple columns: .. ipython:: python - df.groupby(['A', 'B']).get_group(('bar', 'one')) + df.groupby(["A", "B"]).get_group(("bar", "one")) .. _groupby.aggregate: @@ -474,10 +485,10 @@ An obvious one is aggregation via the .. ipython:: python - grouped = df.groupby('A') + grouped = df.groupby("A") grouped.aggregate(np.sum) - grouped = df.groupby(['A', 'B']) + grouped = df.groupby(["A", "B"]) grouped.aggregate(np.sum) As you can see, the result of the aggregation will have the group names as the @@ -487,17 +498,17 @@ changed by using the ``as_index`` option: .. ipython:: python - grouped = df.groupby(['A', 'B'], as_index=False) + grouped = df.groupby(["A", "B"], as_index=False) grouped.aggregate(np.sum) - df.groupby('A', as_index=False).sum() + df.groupby("A", as_index=False).sum() Note that you could use the ``reset_index`` DataFrame function to achieve the same result as the column names are stored in the resulting ``MultiIndex``: .. ipython:: python - df.groupby(['A', 'B']).sum().reset_index() + df.groupby(["A", "B"]).sum().reset_index() Another simple aggregation example is to compute the size of each group. This is included in GroupBy as the ``size`` method. It returns a Series whose @@ -559,8 +570,8 @@ aggregation with, outputting a DataFrame: .. ipython:: python - grouped = df.groupby('A') - grouped['C'].agg([np.sum, np.mean, np.std]) + grouped = df.groupby("A") + grouped["C"].agg([np.sum, np.mean, np.std]) On a grouped ``DataFrame``, you can pass a list of functions to apply to each column, which produces an aggregated result with a hierarchical index: @@ -575,19 +586,21 @@ need to rename, then you can add in a chained operation for a ``Series`` like th .. ipython:: python - (grouped['C'].agg([np.sum, np.mean, np.std]) - .rename(columns={'sum': 'foo', - 'mean': 'bar', - 'std': 'baz'})) + ( + grouped["C"] + .agg([np.sum, np.mean, np.std]) + .rename(columns={"sum": "foo", "mean": "bar", "std": "baz"}) + ) For a grouped ``DataFrame``, you can rename in a similar manner: .. ipython:: python - (grouped.agg([np.sum, np.mean, np.std]) - .rename(columns={'sum': 'foo', - 'mean': 'bar', - 'std': 'baz'})) + ( + grouped.agg([np.sum, np.mean, np.std]).rename( + columns={"sum": "foo", "mean": "bar", "std": "baz"} + ) + ) .. note:: @@ -598,7 +611,7 @@ For a grouped ``DataFrame``, you can rename in a similar manner: .. 
ipython:: python :okexcept: - grouped['C'].agg(['sum', 'sum']) + grouped["C"].agg(["sum", "sum"]) Pandas *does* allow you to provide multiple lambdas. In this case, pandas @@ -607,8 +620,7 @@ For a grouped ``DataFrame``, you can rename in a similar manner: .. ipython:: python - grouped['C'].agg([lambda x: x.max() - x.min(), - lambda x: x.median() - x.mean()]) + grouped["C"].agg([lambda x: x.max() - x.min(), lambda x: x.median() - x.mean()]) @@ -631,15 +643,19 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", .. ipython:: python - animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'], - 'height': [9.1, 6.0, 9.5, 34.0], - 'weight': [7.9, 7.5, 9.9, 198.0]}) + animals = pd.DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + "weight": [7.9, 7.5, 9.9, 198.0], + } + ) animals animals.groupby("kind").agg( - min_height=pd.NamedAgg(column='height', aggfunc='min'), - max_height=pd.NamedAgg(column='height', aggfunc='max'), - average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean), + min_height=pd.NamedAgg(column="height", aggfunc="min"), + max_height=pd.NamedAgg(column="height", aggfunc="max"), + average_weight=pd.NamedAgg(column="weight", aggfunc=np.mean), ) @@ -648,9 +664,9 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", .. ipython:: python animals.groupby("kind").agg( - min_height=('height', 'min'), - max_height=('height', 'max'), - average_weight=('weight', np.mean), + min_height=("height", "min"), + max_height=("height", "max"), + average_weight=("weight", np.mean), ) @@ -659,9 +675,11 @@ and unpack the keyword arguments .. ipython:: python - animals.groupby("kind").agg(**{ - 'total weight': pd.NamedAgg(column='weight', aggfunc=sum), - }) + animals.groupby("kind").agg( + **{ + "total weight": pd.NamedAgg(column="weight", aggfunc=sum), + } + ) Additional keyword arguments are not passed through to the aggregation functions. Only pairs of ``(column, aggfunc)`` should be passed as ``**kwargs``. If your aggregation functions @@ -680,8 +698,8 @@ no column selection, so the values are just the functions. .. ipython:: python animals.groupby("kind").height.agg( - min_height='min', - max_height='max', + min_height="min", + max_height="max", ) Applying different functions to DataFrame columns @@ -692,8 +710,7 @@ columns of a DataFrame: .. ipython:: python - grouped.agg({'C': np.sum, - 'D': lambda x: np.std(x, ddof=1)}) + grouped.agg({"C": np.sum, "D": lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid it must be either implemented on GroupBy or available via :ref:`dispatching @@ -701,7 +718,7 @@ must be either implemented on GroupBy or available via :ref:`dispatching .. ipython:: python - grouped.agg({'C': 'sum', 'D': 'std'}) + grouped.agg({"C": "sum", "D": "std"}) .. _groupby.aggregate.cython: @@ -713,8 +730,8 @@ optimized Cython implementations: .. ipython:: python - df.groupby('A').sum() - df.groupby(['A', 'B']).mean() + df.groupby("A").sum() + df.groupby(["A", "B"]).mean() Of course ``sum`` and ``mean`` are implemented on pandas objects, so the above code would work even without the special versions via dispatching (see below). @@ -743,15 +760,14 @@ For example, suppose we wished to standardize the data within each group: .. 
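
Before the full example, a tiny sketch (with toy data) of the key property of
``transform``, namely that the result is indexed like the input:

.. code-block:: python

    import pandas as pd

    s = pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "a", "b", "b"])
    z = s.groupby(level=0).transform(lambda x: (x - x.mean()) / x.std())
    # transform always returns an object indexed like the original input
    print(z.index.equals(s.index))  # True

.. 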
ipython:: python - index = pd.date_range('10/1/1999', periods=1100) + index = pd.date_range("10/1/1999", periods=1100) ts = pd.Series(np.random.normal(0.5, 2, 1100), index) ts = ts.rolling(window=100, min_periods=100).mean().dropna() ts.head() ts.tail() - transformed = (ts.groupby(lambda x: x.year) - .transform(lambda x: (x - x.mean()) / x.std())) + transformed = ts.groupby(lambda x: x.year).transform(lambda x: (x - x.mean()) / x.std()) We would expect the result to now have mean 0 and standard deviation 1 within each group, which we can easily check: @@ -772,7 +788,7 @@ We can also visually compare the original and transformed data sets. .. ipython:: python - compare = pd.DataFrame({'Original': ts, 'Transformed': transformed}) + compare = pd.DataFrame({"Original": ts, "Transformed": transformed}) @savefig groupby_transform_plot.png compare.plot() @@ -788,8 +804,8 @@ Alternatively, the built-in methods could be used to produce the same outputs. .. ipython:: python - max = ts.groupby(lambda x: x.year).transform('max') - min = ts.groupby(lambda x: x.year).transform('min') + max = ts.groupby(lambda x: x.year).transform("max") + min = ts.groupby(lambda x: x.year).transform("min") max - min @@ -798,7 +814,7 @@ Another common data transform is to replace missing data with the group mean. .. ipython:: python :suppress: - cols = ['A', 'B', 'C'] + cols = ["A", "B", "C"] values = np.random.randn(1000, 3) values[np.random.randint(0, 1000, 100), 0] = np.nan values[np.random.randint(0, 1000, 50), 1] = np.nan @@ -809,7 +825,7 @@ Another common data transform is to replace missing data with the group mean. data_df - countries = np.array(['US', 'UK', 'GR', 'JP']) + countries = np.array(["US", "UK", "GR", "JP"]) key = countries[np.random.randint(0, 4, 1000)] grouped = data_df.groupby(key) @@ -859,11 +875,10 @@ the column B based on the groups of column A. .. ipython:: python - df_re = pd.DataFrame({'A': [1] * 10 + [5] * 10, - 'B': np.arange(20)}) + df_re = pd.DataFrame({"A": [1] * 10 + [5] * 10, "B": np.arange(20)}) df_re - df_re.groupby('A').rolling(4).B.mean() + df_re.groupby("A").rolling(4).B.mean() The ``expanding()`` method will accumulate a given operation @@ -872,7 +887,7 @@ group. .. ipython:: python - df_re.groupby('A').expanding().sum() + df_re.groupby("A").expanding().sum() Suppose you want to use the ``resample()`` method to get a daily @@ -881,13 +896,16 @@ missing values with the ``ffill()`` method. .. ipython:: python - df_re = pd.DataFrame({'date': pd.date_range(start='2016-01-01', periods=4, - freq='W'), - 'group': [1, 1, 2, 2], - 'val': [5, 6, 7, 8]}).set_index('date') + df_re = pd.DataFrame( + { + "date": pd.date_range(start="2016-01-01", periods=4, freq="W"), + "group": [1, 1, 2, 2], + "val": [5, 6, 7, 8], + } + ).set_index("date") df_re - df_re.groupby('group').resample('1D').ffill() + df_re.groupby("group").resample("1D").ffill() .. _groupby.filter: @@ -911,8 +929,8 @@ with only a couple members. .. ipython:: python - dff = pd.DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')}) - dff.groupby('B').filter(lambda x: len(x) > 2) + dff = pd.DataFrame({"A": np.arange(8), "B": list("aabbbbcc")}) + dff.groupby("B").filter(lambda x: len(x) > 2) Alternatively, instead of dropping the offending groups, we can return a like-indexed objects where the groups that do not pass the filter are filled @@ -920,14 +938,14 @@ with NaNs. .. 
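
A compact, runnable check of the filtration semantics just described, using a
toy frame:

.. code-block:: python

    import numpy as np
    import pandas as pd

    dff = pd.DataFrame({"A": np.arange(6), "B": list("aabbbb")})
    # filter keeps or drops whole groups; only group "b" has more than two rows
    kept = dff.groupby("B").filter(lambda x: len(x) > 2)
    print(kept["B"].unique().tolist())  # ['b']

.. 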
ipython:: python - dff.groupby('B').filter(lambda x: len(x) > 2, dropna=False) + dff.groupby("B").filter(lambda x: len(x) > 2, dropna=False) For DataFrames with multiple columns, filters should explicitly specify a column as the filter criterion. .. ipython:: python - dff['C'] = np.arange(8) - dff.groupby('B').filter(lambda x: len(x['C']) > 2) + dff["C"] = np.arange(8) + dff.groupby("B").filter(lambda x: len(x["C"]) > 2) .. note:: @@ -939,7 +957,7 @@ For DataFrames with multiple columns, filters should explicitly specify a column .. ipython:: python - dff.groupby('B').head(2) + dff.groupby("B").head(2) .. _groupby.dispatch: @@ -953,7 +971,7 @@ functions: .. ipython:: python - grouped = df.groupby('A') + grouped = df.groupby("A") grouped.agg(lambda x: x.std()) But, it's rather verbose and can be untidy if you need to pass additional @@ -973,12 +991,14 @@ next). This enables some operations to be carried out rather succinctly: .. ipython:: python - tsdf = pd.DataFrame(np.random.randn(1000, 3), - index=pd.date_range('1/1/2000', periods=1000), - columns=['A', 'B', 'C']) + tsdf = pd.DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2000", periods=1000), + columns=["A", "B", "C"], + ) tsdf.iloc[::2] = np.nan grouped = tsdf.groupby(lambda x: x.year) - grouped.fillna(method='pad') + grouped.fillna(method="pad") In this example, we chopped the collection of time series into yearly chunks then independently called :ref:`fillna ` on the @@ -989,7 +1009,7 @@ The ``nlargest`` and ``nsmallest`` methods work on ``Series`` style groupbys: .. ipython:: python s = pd.Series([9, 8, 7, 5, 19, 1, 4.2, 3.3]) - g = pd.Series(list('abababab')) + g = pd.Series(list("abababab")) gb = s.groupby(g) gb.nlargest(3) gb.nsmallest(3) @@ -1008,10 +1028,10 @@ for both ``aggregate`` and ``transform`` in many standard use cases. However, .. ipython:: python df - grouped = df.groupby('A') + grouped = df.groupby("A") # could also just call .describe() - grouped['C'].apply(lambda x: x.describe()) + grouped["C"].apply(lambda x: x.describe()) The dimension of the returned result can also change: @@ -1032,7 +1052,8 @@ that is itself a series, and possibly upcast the result to a DataFrame: .. ipython:: python def f(x): - return pd.Series([x, x ** 2], index=['x', 'x^2']) + return pd.Series([x, x ** 2], index=["x", "x^2"]) + s = pd.Series(np.random.rand(5)) s @@ -1133,7 +1154,7 @@ will be (silently) dropped. Thus, this does not pose any problems: .. ipython:: python - df.groupby('A').std() + df.groupby("A").std() Note that ``df.groupby('A').colname.std().`` is more efficient than ``df.groupby('A').std().colname``, so if the result of an aggregation function @@ -1151,23 +1172,29 @@ is only interesting over one column (here ``colname``), it may be filtered .. ipython:: python from decimal import Decimal + df_dec = pd.DataFrame( - {'id': [1, 2, 1, 2], - 'int_column': [1, 2, 3, 4], - 'dec_column': [Decimal('0.50'), Decimal('0.15'), - Decimal('0.25'), Decimal('0.40')] - } + { + "id": [1, 2, 1, 2], + "int_column": [1, 2, 3, 4], + "dec_column": [ + Decimal("0.50"), + Decimal("0.15"), + Decimal("0.25"), + Decimal("0.40"), + ], + } ) # Decimal columns can be sum'd explicitly by themselves... 
- df_dec.groupby(['id'])[['dec_column']].sum() + df_dec.groupby(["id"])[["dec_column"]].sum() # ...but cannot be combined with standard data types or they will be excluded - df_dec.groupby(['id'])[['int_column', 'dec_column']].sum() + df_dec.groupby(["id"])[["int_column", "dec_column"]].sum() # Use .agg function to aggregate over standard and "nuisance" data types # at the same time - df_dec.groupby(['id']).agg({'int_column': 'sum', 'dec_column': 'sum'}) + df_dec.groupby(["id"]).agg({"int_column": "sum", "dec_column": "sum"}) .. _groupby.observed: @@ -1182,25 +1209,27 @@ Show all values: .. ipython:: python - pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], - categories=['a', 'b']), - observed=False).count() + pd.Series([1, 1, 1]).groupby( + pd.Categorical(["a", "a", "a"], categories=["a", "b"]), observed=False + ).count() Show only the observed values: .. ipython:: python - pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], - categories=['a', 'b']), - observed=True).count() + pd.Series([1, 1, 1]).groupby( + pd.Categorical(["a", "a", "a"], categories=["a", "b"]), observed=True + ).count() The returned dtype of the grouped will *always* include *all* of the categories that were grouped. .. ipython:: python - s = pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], - categories=['a', 'b']), - observed=False).count() + s = ( + pd.Series([1, 1, 1]) + .groupby(pd.Categorical(["a", "a", "a"], categories=["a", "b"]), observed=False) + .count() + ) s.index.dtype .. _groupby.missing: @@ -1224,7 +1253,7 @@ can be used as group keys. If so, the order of the levels will be preserved: data = pd.Series(np.random.randn(100)) - factor = pd.qcut(data, [0, .25, .5, .75, 1.]) + factor = pd.qcut(data, [0, 0.25, 0.5, 0.75, 1.0]) data.groupby(factor).mean() @@ -1240,19 +1269,23 @@ use the ``pd.Grouper`` to provide this local control. import datetime - df = pd.DataFrame({'Branch': 'A A A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), - 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], - 'Date': [ - datetime.datetime(2013, 1, 1, 13, 0), - datetime.datetime(2013, 1, 1, 13, 5), - datetime.datetime(2013, 10, 1, 20, 0), - datetime.datetime(2013, 10, 2, 10, 0), - datetime.datetime(2013, 10, 1, 20, 0), - datetime.datetime(2013, 10, 2, 10, 0), - datetime.datetime(2013, 12, 2, 12, 0), - datetime.datetime(2013, 12, 2, 14, 0)] - }) + df = pd.DataFrame( + { + "Branch": "A A A A A A A B".split(), + "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], + "Date": [ + datetime.datetime(2013, 1, 1, 13, 0), + datetime.datetime(2013, 1, 1, 13, 5), + datetime.datetime(2013, 10, 1, 20, 0), + datetime.datetime(2013, 10, 2, 10, 0), + datetime.datetime(2013, 10, 1, 20, 0), + datetime.datetime(2013, 10, 2, 10, 0), + datetime.datetime(2013, 12, 2, 12, 0), + datetime.datetime(2013, 12, 2, 14, 0), + ], + } + ) df @@ -1260,18 +1293,18 @@ Groupby a specific column with the desired frequency. This is like resampling. .. ipython:: python - df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer']).sum() + df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum() You have an ambiguous specification in that you have a named index and a column that could be potential groupers. .. 
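
As a side note, a frequency-based ``Grouper`` over the index behaves like
``resample`` with the same rule; a minimal sketch with illustrative data:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame(
        {"val": [1, 2, 3]}, index=pd.date_range("2013-01-01", periods=3, freq="MS")
    )
    # a frequency-based Grouper on the index matches resample on the same rule
    left = df.groupby(pd.Grouper(freq="2MS")).sum()
    right = df.resample("2MS").sum()
    print(left.equals(right))  # True

.. 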
ipython:: python - df = df.set_index('Date') - df['Date'] = df.index + pd.offsets.MonthEnd(2) - df.groupby([pd.Grouper(freq='6M', key='Date'), 'Buyer']).sum() + df = df.set_index("Date") + df["Date"] = df.index + pd.offsets.MonthEnd(2) + df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"]).sum() - df.groupby([pd.Grouper(freq='6M', level='Date'), 'Buyer']).sum() + df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"]).sum() Taking the first rows of each group @@ -1281,10 +1314,10 @@ Just like for a DataFrame or Series you can call head and tail on a groupby: .. ipython:: python - df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) df - g = df.groupby('A') + g = df.groupby("A") g.head(1) g.tail(1) @@ -1302,8 +1335,8 @@ will return a single row (or no row) per group if you pass an int for n: .. ipython:: python - df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') + df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") g.nth(0) g.nth(-1) @@ -1314,21 +1347,21 @@ If you want to select the nth not-null item, use the ``dropna`` kwarg. For a Dat .. ipython:: python # nth(0) is the same as g.first() - g.nth(0, dropna='any') + g.nth(0, dropna="any") g.first() # nth(-1) is the same as g.last() - g.nth(-1, dropna='any') # NaNs denote group exhausted when using dropna + g.nth(-1, dropna="any") # NaNs denote group exhausted when using dropna g.last() - g.B.nth(0, dropna='all') + g.B.nth(0, dropna="all") As with other methods, passing ``as_index=False``, will achieve a filtration, which returns the grouped row. .. ipython:: python - df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A', as_index=False) + df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A", as_index=False) g.nth(0) g.nth(-1) @@ -1337,8 +1370,8 @@ You can also select multiple rows from each group by specifying multiple nth val .. ipython:: python - business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B') - df = pd.DataFrame(1, index=business_dates, columns=['a', 'b']) + business_dates = pd.date_range(start="4/1/2014", end="6/30/2014", freq="B") + df = pd.DataFrame(1, index=business_dates, columns=["a", "b"]) # get the first, 4th, and last date index for each month df.groupby([df.index.year, df.index.month]).nth([0, 3, -1]) @@ -1350,12 +1383,12 @@ To see the order in which each row appears within its group, use the .. ipython:: python - dfg = pd.DataFrame(list('aaabba'), columns=['A']) + dfg = pd.DataFrame(list("aaabba"), columns=["A"]) dfg - dfg.groupby('A').cumcount() + dfg.groupby("A").cumcount() - dfg.groupby('A').cumcount(ascending=False) + dfg.groupby("A").cumcount(ascending=False) .. _groupby.ngroup: @@ -1374,12 +1407,12 @@ order they are first observed. .. ipython:: python - dfg = pd.DataFrame(list('aaabba'), columns=['A']) + dfg = pd.DataFrame(list("aaabba"), columns=["A"]) dfg - dfg.groupby('A').ngroup() + dfg.groupby("A").ngroup() - dfg.groupby('A').ngroup(ascending=False) + dfg.groupby("A").ngroup(ascending=False) Plotting ~~~~~~~~ @@ -1392,8 +1425,8 @@ the values in column 1 where the group is "B" are 3 higher on average. 
np.random.seed(1234) df = pd.DataFrame(np.random.randn(50, 2)) - df['g'] = np.random.choice(['A', 'B'], size=50) - df.loc[df['g'] == 'B', 1] += 3 + df["g"] = np.random.choice(["A", "B"], size=50) + df.loc[df["g"] == "B", 1] += 3 We can easily visualize this with a boxplot: @@ -1401,7 +1434,7 @@ We can easily visualize this with a boxplot: :okwarning: @savefig groupby_boxplot.png - df.groupby('g').boxplot() + df.groupby("g").boxplot() The result of calling ``boxplot`` is a dictionary whose keys are the values of our grouping column ``g`` ("A" and "B"). The values of the resulting dictionary @@ -1436,20 +1469,26 @@ code more readable. First we set the data: .. ipython:: python n = 1000 - df = pd.DataFrame({'Store': np.random.choice(['Store_1', 'Store_2'], n), - 'Product': np.random.choice(['Product_1', - 'Product_2'], n), - 'Revenue': (np.random.random(n) * 50 + 10).round(2), - 'Quantity': np.random.randint(1, 10, size=n)}) + df = pd.DataFrame( + { + "Store": np.random.choice(["Store_1", "Store_2"], n), + "Product": np.random.choice(["Product_1", "Product_2"], n), + "Revenue": (np.random.random(n) * 50 + 10).round(2), + "Quantity": np.random.randint(1, 10, size=n), + } + ) df.head(2) Now, to find prices per store/product, we can simply do: .. ipython:: python - (df.groupby(['Store', 'Product']) - .pipe(lambda grp: grp.Revenue.sum() / grp.Quantity.sum()) - .unstack().round(2)) + ( + df.groupby(["Store", "Product"]) + .pipe(lambda grp: grp.Revenue.sum() / grp.Quantity.sum()) + .unstack() + .round(2) + ) Piping can also be expressive when you want to deliver a grouped object to some arbitrary function, for example: @@ -1459,7 +1498,8 @@ arbitrary function, for example: def mean(groupby): return groupby.mean() - df.groupby(['Store', 'Product']).pipe(mean) + + df.groupby(["Store", "Product"]).pipe(mean) where ``mean`` takes a GroupBy object and finds the mean of the Revenue and Quantity columns respectively for each Store-Product combination. The ``mean`` function can @@ -1476,8 +1516,7 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on .. ipython:: python - df = pd.DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], - 'c': [1, 0, 0], 'd': [2, 3, 4]}) + df = pd.DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "c": [1, 0, 0], "d": [2, 3, 4]}) df df.groupby(df.sum(), axis=1).sum() @@ -1536,16 +1575,22 @@ column index name will be used as the name of the inserted column: .. ipython:: python - df = pd.DataFrame({'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], - 'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1], - 'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], - 'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]}) + df = pd.DataFrame( + { + "a": [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], + "b": [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1], + "c": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], + "d": [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1], + } + ) + def compute_metrics(x): - result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()} - return pd.Series(result, name='metrics') + result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()} + return pd.Series(result, name="metrics") + - result = df.groupby('a').apply(compute_metrics) + result = df.groupby("a").apply(compute_metrics) result diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index e483cebf71614..184894bbafe28 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3310,10 +3310,10 @@ applications (CTRL-V on many operating systems). Here we illustrate writing a .. code-block:: python - >>> df = pd.DataFrame({'A': [1, 2, 3], - ... 
'B': [4, 5, 6], - ... 'C': ['p', 'q', 'r']}, - ... index=['x', 'y', 'z']) + >>> df = pd.DataFrame( + ... {"A": [1, 2, 3], "B": [4, 5, 6], "C": ["p", "q", "r"]}, index=["x", "y", "z"] + ... ) + >>> df A B C x 1 4 p @@ -3607,8 +3607,8 @@ This format is specified by default when using ``put`` or ``to_hdf`` or by ``for .. code-block:: python - >>> pd.DataFrame(np.random.randn(10, 2)).to_hdf('test_fixed.h5', 'df') - >>> pd.read_hdf('test_fixed.h5', 'df', where='index>5') + >>> pd.DataFrame(np.random.randn(10, 2)).to_hdf("test_fixed.h5", "df") + >>> pd.read_hdf("test_fixed.h5", "df", where="index>5") TypeError: cannot pass a where specification when reading a fixed format. this store must be selected in its entirety diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 9294897686d46..3c97cc7da6edb 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -38,12 +38,15 @@ arise and we wish to also consider that "missing" or "not available" or "NA". .. ipython:: python - df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'], - columns=['one', 'two', 'three']) - df['four'] = 'bar' - df['five'] = df['one'] > 0 + df = pd.DataFrame( + np.random.randn(5, 3), + index=["a", "c", "e", "f", "h"], + columns=["one", "two", "three"], + ) + df["four"] = "bar" + df["five"] = df["one"] > 0 df - df2 = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']) + df2 = df.reindex(["a", "b", "c", "d", "e", "f", "g", "h"]) df2 To make detecting missing values easier (and across different array dtypes), @@ -53,9 +56,9 @@ Series and DataFrame objects: .. ipython:: python - df2['one'] - pd.isna(df2['one']) - df2['four'].notna() + df2["one"] + pd.isna(df2["one"]) + df2["four"].notna() df2.isna() .. warning:: @@ -65,14 +68,14 @@ Series and DataFrame objects: .. ipython:: python - None == None # noqa: E711 + None == None # noqa: E711 np.nan == np.nan So as compared to above, a scalar equality comparison versus a ``None/np.nan`` doesn't provide useful information. .. ipython:: python - df2['one'] == np.nan + df2["one"] == np.nan Integer dtypes and missing data ------------------------------- @@ -101,9 +104,9 @@ pandas objects provide compatibility between ``NaT`` and ``NaN``. .. ipython:: python df2 = df.copy() - df2['timestamp'] = pd.Timestamp('20120101') + df2["timestamp"] = pd.Timestamp("20120101") df2 - df2.loc[['a', 'c', 'h'], ['one', 'timestamp']] = np.nan + df2.loc[["a", "c", "h"], ["one", "timestamp"]] = np.nan df2 df2.dtypes.value_counts() @@ -146,9 +149,9 @@ objects. .. ipython:: python :suppress: - df = df2.loc[:, ['one', 'two', 'three']] - a = df2.loc[df2.index[:5], ['one', 'two']].fillna(method='pad') - b = df2.loc[df2.index[:5], ['one', 'two', 'three']] + df = df2.loc[:, ["one", "two", "three"]] + a = df2.loc[df2.index[:5], ["one", "two"]].fillna(method="pad") + b = df2.loc[df2.index[:5], ["one", "two", "three"]] .. ipython:: python @@ -168,7 +171,7 @@ account for missing data. For example: .. ipython:: python df - df['one'].sum() + df["one"].sum() df.mean(1) df.cumsum() df.cumsum(skipna=False) @@ -210,7 +213,7 @@ with R, for example: .. ipython:: python df - df.groupby('one').mean() + df.groupby("one").mean() See the groupby section :ref:`here ` for more information. @@ -234,7 +237,7 @@ of ways, which we illustrate: df2 df2.fillna(0) - df2['one'].fillna('missing') + df2["one"].fillna("missing") **Fill gaps forward or backward** @@ -244,7 +247,7 @@ can propagate non-NA values forward or backward: .. 
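
A minimal sketch of the two fill directions on a toy series:

.. code-block:: python

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, np.nan, np.nan, 4.0])
    # "pad" (alias "ffill") copies the last valid value forward,
    # while "bfill" pulls the next valid value backward
    print(s.fillna(method="pad").tolist())    # [1.0, 1.0, 1.0, 4.0]
    print(s.fillna(method="bfill").tolist())  # [1.0, 4.0, 4.0, 4.0]

.. 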
ipython:: python df - df.fillna(method='pad') + df.fillna(method="pad") .. _missing_data.fillna.limit: @@ -261,7 +264,7 @@ we can use the ``limit`` keyword: .. ipython:: python df - df.fillna(method='pad', limit=1) + df.fillna(method="pad", limit=1) To remind you, these are the available filling methods: @@ -289,21 +292,21 @@ use case of this is to fill a DataFrame with the mean of that column. .. ipython:: python - dff = pd.DataFrame(np.random.randn(10, 3), columns=list('ABC')) + dff = pd.DataFrame(np.random.randn(10, 3), columns=list("ABC")) dff.iloc[3:5, 0] = np.nan dff.iloc[4:6, 1] = np.nan dff.iloc[5:8, 2] = np.nan dff dff.fillna(dff.mean()) - dff.fillna(dff.mean()['B':'C']) + dff.fillna(dff.mean()["B":"C"]) Same result as above, but is aligning the 'fill' value which is a Series in this case. .. ipython:: python - dff.where(pd.notna(dff), dff.mean(), axis='columns') + dff.where(pd.notna(dff), dff.mean(), axis="columns") .. _missing_data.dropna: @@ -317,15 +320,15 @@ data. To do this, use :meth:`~DataFrame.dropna`: .. ipython:: python :suppress: - df['two'] = df['two'].fillna(0) - df['three'] = df['three'].fillna(0) + df["two"] = df["two"].fillna(0) + df["three"] = df["three"].fillna(0) .. ipython:: python df df.dropna(axis=0) df.dropna(axis=1) - df['one'].dropna() + df["one"].dropna() An equivalent :meth:`~Series.dropna` is available for Series. DataFrame.dropna has considerably more options than Series.dropna, which can be @@ -343,7 +346,7 @@ that, by default, performs linear interpolation at missing data points. :suppress: np.random.seed(123456) - idx = pd.date_range('1/1/2000', periods=100, freq='BM') + idx = pd.date_range("1/1/2000", periods=100, freq="BM") ts = pd.Series(np.random.randn(100), index=idx) ts[1:5] = np.nan ts[20:30] = np.nan @@ -376,28 +379,29 @@ Index aware interpolation is available via the ``method`` keyword: ts2 ts2.interpolate() - ts2.interpolate(method='time') + ts2.interpolate(method="time") For a floating-point index, use ``method='values'``: .. ipython:: python :suppress: - idx = [0., 1., 10.] - ser = pd.Series([0., np.nan, 10.], idx) + idx = [0.0, 1.0, 10.0] + ser = pd.Series([0.0, np.nan, 10.0], idx) .. ipython:: python ser ser.interpolate() - ser.interpolate(method='values') + ser.interpolate(method="values") You can also interpolate with a DataFrame: .. ipython:: python - df = pd.DataFrame({'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8], - 'B': [.25, np.nan, np.nan, 4, 12.2, 14.4]}) + df = pd.DataFrame( + {"A": [1, 2.1, np.nan, 4.7, 5.6, 6.8], "B": [0.25, np.nan, np.nan, 4, 12.2, 14.4]} + ) df df.interpolate() @@ -418,20 +422,20 @@ The appropriate interpolation method will depend on the type of data you are wor .. ipython:: python - df.interpolate(method='barycentric') + df.interpolate(method="barycentric") - df.interpolate(method='pchip') + df.interpolate(method="pchip") - df.interpolate(method='akima') + df.interpolate(method="akima") When interpolating via a polynomial or spline approximation, you must also specify the degree or order of the approximation: .. 
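
A small runnable example (this assumes SciPy is installed, which all of the
``order``-based methods require):

.. code-block:: python

    import numpy as np
    import pandas as pd

    s = pd.Series([0.0, np.nan, np.nan, 9.0])
    # order=1 reduces "polynomial" to linear interpolation; higher orders
    # need at least order + 1 known points
    print(s.interpolate(method="polynomial", order=1).tolist())
    # [0.0, 3.0, 6.0, 9.0]

.. 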
ipython:: python - df.interpolate(method='spline', order=2) + df.interpolate(method="spline", order=2) - df.interpolate(method='polynomial', order=2) + df.interpolate(method="polynomial", order=2) Compare several methods: @@ -439,10 +443,10 @@ Compare several methods: np.random.seed(2) - ser = pd.Series(np.arange(1, 10.1, .25) ** 2 + np.random.randn(37)) + ser = pd.Series(np.arange(1, 10.1, 0.25) ** 2 + np.random.randn(37)) missing = np.array([4, 13, 14, 15, 16, 17, 18, 20, 29]) ser[missing] = np.nan - methods = ['linear', 'quadratic', 'cubic'] + methods = ["linear", "quadratic", "cubic"] df = pd.DataFrame({m: ser.interpolate(method=m) for m in methods}) @savefig compare_interpolations.png @@ -460,7 +464,7 @@ at the new values. # interpolate at new_index new_index = ser.index | pd.Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]) - interp_s = ser.reindex(new_index).interpolate(method='pchip') + interp_s = ser.reindex(new_index).interpolate(method="pchip") interp_s[49:51] .. _scipy: https://www.scipy.org @@ -478,8 +482,7 @@ filled since the last valid observation: .. ipython:: python - ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, - np.nan, 13, np.nan, np.nan]) + ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13, np.nan, np.nan]) ser # fill all consecutive values in a forward direction @@ -494,13 +497,13 @@ By default, ``NaN`` values are filled in a ``forward`` direction. Use .. ipython:: python # fill one consecutive value backwards - ser.interpolate(limit=1, limit_direction='backward') + ser.interpolate(limit=1, limit_direction="backward") # fill one consecutive value in both directions - ser.interpolate(limit=1, limit_direction='both') + ser.interpolate(limit=1, limit_direction="both") # fill all consecutive values in both directions - ser.interpolate(limit_direction='both') + ser.interpolate(limit_direction="both") By default, ``NaN`` values are filled whether they are inside (surrounded by) existing valid values, or outside existing valid values. The ``limit_area`` @@ -509,13 +512,13 @@ parameter restricts filling to either inside or outside values. .. ipython:: python # fill one consecutive inside value in both directions - ser.interpolate(limit_direction='both', limit_area='inside', limit=1) + ser.interpolate(limit_direction="both", limit_area="inside", limit=1) # fill all consecutive outside values backward - ser.interpolate(limit_direction='backward', limit_area='outside') + ser.interpolate(limit_direction="backward", limit_area="outside") # fill all consecutive outside values in both directions - ser.interpolate(limit_direction='both', limit_area='outside') + ser.interpolate(limit_direction="both", limit_area="outside") .. _missing_data.replace: @@ -531,7 +534,7 @@ value: .. ipython:: python - ser = pd.Series([0., 1., 2., 3., 4.]) + ser = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0]) ser.replace(0, 5) @@ -551,16 +554,16 @@ For a DataFrame, you can specify individual values by column: .. ipython:: python - df = pd.DataFrame({'a': [0, 1, 2, 3, 4], 'b': [5, 6, 7, 8, 9]}) + df = pd.DataFrame({"a": [0, 1, 2, 3, 4], "b": [5, 6, 7, 8, 9]}) - df.replace({'a': 0, 'b': 5}, 100) + df.replace({"a": 0, "b": 5}, 100) Instead of replacing with specified values, you can treat all given values as missing and interpolate over them: .. ipython:: python - ser.replace([1, 2, 3], method='pad') + ser.replace([1, 2, 3], method="pad") .. _missing_data.replace_expression: @@ -581,67 +584,67 @@ Replace the '.' with ``NaN`` (str -> str): .. 
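
For reference, a compact sketch of the scalar, list, and dict forms side by
side on a toy series:

.. code-block:: python

    import pandas as pd

    ser = pd.Series([0.0, 1.0, 2.0])
    print(ser.replace(0.0, 5.0).tolist())                  # [5.0, 1.0, 2.0]
    print(ser.replace([1.0, 2.0], [10.0, 20.0]).tolist())  # [0.0, 10.0, 20.0]
    print(ser.replace({0.0: 5.0, 1.0: 6.0}).tolist())      # [5.0, 6.0, 2.0]

.. 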
ipython:: python - d = {'a': list(range(4)), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']} + d = {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} df = pd.DataFrame(d) - df.replace('.', np.nan) + df.replace(".", np.nan) Now do it with a regular expression that removes surrounding whitespace (regex -> regex): .. ipython:: python - df.replace(r'\s*\.\s*', np.nan, regex=True) + df.replace(r"\s*\.\s*", np.nan, regex=True) Replace a few different values (list -> list): .. ipython:: python - df.replace(['a', '.'], ['b', np.nan]) + df.replace(["a", "."], ["b", np.nan]) list of regex -> list of regex: .. ipython:: python - df.replace([r'\.', r'(a)'], ['dot', r'\1stuff'], regex=True) + df.replace([r"\.", r"(a)"], ["dot", r"\1stuff"], regex=True) Only search in column ``'b'`` (dict -> dict): .. ipython:: python - df.replace({'b': '.'}, {'b': np.nan}) + df.replace({"b": "."}, {"b": np.nan}) Same as the previous example, but use a regular expression for searching instead (dict of regex -> dict): .. ipython:: python - df.replace({'b': r'\s*\.\s*'}, {'b': np.nan}, regex=True) + df.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, regex=True) You can pass nested dictionaries of regular expressions that use ``regex=True``: .. ipython:: python - df.replace({'b': {'b': r''}}, regex=True) + df.replace({"b": {"b": r""}}, regex=True) Alternatively, you can pass the nested dictionary like so: .. ipython:: python - df.replace(regex={'b': {r'\s*\.\s*': np.nan}}) + df.replace(regex={"b": {r"\s*\.\s*": np.nan}}) You can also use the group of a regular expression match when replacing (dict of regex -> dict of regex), this works for lists as well. .. ipython:: python - df.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, regex=True) + df.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, regex=True) You can pass a list of regular expressions, of which those that match will be replaced with a scalar (list of regex -> regex). .. ipython:: python - df.replace([r'\s*\.\s*', r'a|b'], np.nan, regex=True) + df.replace([r"\s*\.\s*", r"a|b"], np.nan, regex=True) All of the regular expression examples can also be passed with the ``to_replace`` argument as the ``regex`` argument. In this case the ``value`` @@ -650,7 +653,7 @@ dictionary. The previous example, in this case, would then be: .. ipython:: python - df.replace(regex=[r'\s*\.\s*', r'a|b'], value=np.nan) + df.replace(regex=[r"\s*\.\s*", r"a|b"], value=np.nan) This can be convenient if you do not want to pass ``regex=True`` every time you want to use a regular expression. @@ -676,7 +679,7 @@ Replacing more than one value is possible by passing a list. .. ipython:: python df00 = df.iloc[0, 0] - df.replace([1.5, df00], [np.nan, 'a']) + df.replace([1.5, df00], [np.nan, "a"]) df[1].dtype You can also operate on the DataFrame in place: @@ -932,7 +935,7 @@ the first 10 columns. .. ipython:: python - bb = pd.read_csv('data/baseball.csv', index_col='id') + bb = pd.read_csv("data/baseball.csv", index_col="id") bb[bb.columns[:10]].dtypes .. ipython:: python diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index 206d8dd0f4739..f36f27269a996 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -72,7 +72,7 @@ Option 1 loads in all the data and then filters to what we need. .. 
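
For contrast with Option 1 below, the column selection can instead be pushed
into the reader itself, so that unneeded columns are never materialized; this
sketch assumes the same ``timeseries_wide.parquet`` file generated earlier:

.. code-block:: python

    import pandas as pd

    columns = ["id_0", "name_0", "x_0", "y_0"]
    # only the requested columns are read from disk
    df = pd.read_parquet("timeseries_wide.parquet", columns=columns)

.. 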
ipython:: python - columns = ['id_0', 'name_0', 'x_0', 'y_0'] + columns = ["id_0", "name_0", "x_0", "y_0"] pd.read_parquet("timeseries_wide.parquet")[columns] @@ -123,7 +123,7 @@ space-efficient integers to know which specific name is used in each row. .. ipython:: python ts2 = ts.copy() - ts2['name'] = ts2['name'].astype('category') + ts2["name"] = ts2["name"].astype("category") ts2.memory_usage(deep=True) We can go a bit further and downcast the numeric columns to their smallest types @@ -131,8 +131,8 @@ using :func:`pandas.to_numeric`. .. ipython:: python - ts2['id'] = pd.to_numeric(ts2['id'], downcast='unsigned') - ts2[['x', 'y']] = ts2[['x', 'y']].apply(pd.to_numeric, downcast='float') + ts2["id"] = pd.to_numeric(ts2["id"], downcast="unsigned") + ts2[["x", "y"]] = ts2[["x", "y"]].apply(pd.to_numeric, downcast="float") ts2.dtypes .. ipython:: python @@ -141,8 +141,7 @@ using :func:`pandas.to_numeric`. .. ipython:: python - reduction = (ts2.memory_usage(deep=True).sum() - / ts.memory_usage(deep=True).sum()) + reduction = ts2.memory_usage(deep=True).sum() / ts.memory_usage(deep=True).sum() print(f"{reduction:0.2f}") In all, we've reduced the in-memory footprint of this dataset to 1/5 of its @@ -174,13 +173,13 @@ files. Each file in the directory represents a different year of the entire data import pathlib N = 12 - starts = [f'20{i:>02d}-01-01' for i in range(N)] - ends = [f'20{i:>02d}-12-13' for i in range(N)] + starts = [f"20{i:>02d}-01-01" for i in range(N)] + ends = [f"20{i:>02d}-12-13" for i in range(N)] pathlib.Path("data/timeseries").mkdir(exist_ok=True) for i, (start, end) in enumerate(zip(starts, ends)): - ts = _make_timeseries(start=start, end=end, freq='1T', seed=i) + ts = _make_timeseries(start=start, end=end, freq="1T", seed=i) ts.to_parquet(f"data/timeseries/ts-{i:0>2d}.parquet") @@ -215,7 +214,7 @@ work for arbitrary-sized datasets. # Only one dataframe is in memory at a time... df = pd.read_parquet(path) # ... plus a small Series ``counts``, which is updated. - counts = counts.add(df['name'].value_counts(), fill_value=0) + counts = counts.add(df["name"].value_counts(), fill_value=0) counts.astype(int) Some readers, like :meth:`pandas.read_csv`, offer parameters to control the @@ -278,8 +277,8 @@ Rather than executing immediately, doing operations build up a **task graph**. .. ipython:: python ddf - ddf['name'] - ddf['name'].value_counts() + ddf["name"] + ddf["name"].value_counts() Each of these calls is instant because the result isn't being computed yet. We're just building up a list of computation to do when someone needs the @@ -291,7 +290,7 @@ To get the actual result you can call ``.compute()``. .. ipython:: python - %time ddf['name'].value_counts().compute() + %time ddf["name"].value_counts().compute() At that point, you get back the same thing you'd get with pandas, in this case a concrete pandas Series with the count of each ``name``. @@ -324,7 +323,7 @@ a familiar groupby aggregation. .. ipython:: python - %time ddf.groupby('name')[['x', 'y']].mean().compute().head() + %time ddf.groupby("name")[["x", "y"]].mean().compute().head() The grouping and aggregation is done out-of-core and in parallel. @@ -336,8 +335,8 @@ we need to supply the divisions manually. .. 
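
A short check of whether divisions are known (this assumes Dask is installed
and the ``data/timeseries`` directory from above exists):

.. code-block:: python

    import dask.dataframe as dd

    ddf = dd.read_parquet("data/timeseries/")
    # if divisions are not known, .loc slices cannot prune partitions
    print(ddf.known_divisions)

.. 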
ipython:: python N = 12 - starts = [f'20{i:>02d}-01-01' for i in range(N)] - ends = [f'20{i:>02d}-12-13' for i in range(N)] + starts = [f"20{i:>02d}-01-01" for i in range(N)] + ends = [f"20{i:>02d}-12-13" for i in range(N)] divisions = tuple(pd.to_datetime(starts)) + (pd.Timestamp(ends[-1]),) ddf.divisions = divisions @@ -347,7 +346,7 @@ Now we can do things like fast random access with ``.loc``. .. ipython:: python - ddf.loc['2002-01-01 12:01':'2002-01-01 12:05'].compute() + ddf.loc["2002-01-01 12:01":"2002-01-01 12:05"].compute() Dask knows to just look in the 3rd partition for selecting values in 2002. It doesn't need to look at any other data. @@ -362,7 +361,7 @@ out of memory. At that point it's just a regular pandas object. :okwarning: @savefig dask_resample.png - ddf[['x', 'y']].resample("1D").mean().cumsum().compute().plot() + ddf[["x", "y"]].resample("1D").mean().cumsum().compute().plot() These Dask examples have all be done using multiple processes on a single machine. Dask can be `deployed on a cluster diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 61902b4a41b7c..11ec90085d9bf 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -19,42 +19,43 @@ Parsing time series information from various sources and formats import datetime - dti = pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01'), - datetime.datetime(2018, 1, 1)]) + dti = pd.to_datetime( + ["1/1/2018", np.datetime64("2018-01-01"), datetime.datetime(2018, 1, 1)] + ) dti Generate sequences of fixed-frequency dates and time spans .. ipython:: python - dti = pd.date_range('2018-01-01', periods=3, freq='H') + dti = pd.date_range("2018-01-01", periods=3, freq="H") dti Manipulating and converting date times with timezone information .. ipython:: python - dti = dti.tz_localize('UTC') + dti = dti.tz_localize("UTC") dti - dti.tz_convert('US/Pacific') + dti.tz_convert("US/Pacific") Resampling or converting a time series to a particular frequency .. ipython:: python - idx = pd.date_range('2018-01-01', periods=5, freq='H') + idx = pd.date_range("2018-01-01", periods=5, freq="H") ts = pd.Series(range(len(idx)), index=idx) ts - ts.resample('2H').mean() + ts.resample("2H").mean() Performing date and time arithmetic with absolute or relative time increments .. ipython:: python - friday = pd.Timestamp('2018-01-05') + friday = pd.Timestamp("2018-01-05") friday.day_name() # Add 1 day - saturday = friday + pd.Timedelta('1 day') + saturday = friday + pd.Timedelta("1 day") saturday.day_name() # Add 1 business day (Friday --> Monday) monday = friday + pd.offsets.BDay() @@ -90,13 +91,13 @@ so manipulations can be performed with respect to the time element. .. ipython:: python - pd.Series(range(3), index=pd.date_range('2000', freq='D', periods=3)) + pd.Series(range(3), index=pd.date_range("2000", freq="D", periods=3)) However, :class:`Series` and :class:`DataFrame` can directly also support the time component as data itself. .. ipython:: python - pd.Series(pd.date_range('2000', freq='D', periods=3)) + pd.Series(pd.date_range("2000", freq="D", periods=3)) :class:`Series` and :class:`DataFrame` have extended data type support and functionality for ``datetime``, ``timedelta`` and ``Period`` data when passed into those constructors. ``DateOffset`` @@ -104,9 +105,9 @@ data however will be stored as ``object`` data. .. 
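
A compact illustration of the resulting dtypes (the data here is illustrative):

.. code-block:: python

    import pandas as pd

    s_dt = pd.Series(pd.date_range("2000", freq="D", periods=3))
    s_per = pd.Series(pd.period_range("2000", freq="M", periods=3))
    s_off = pd.Series([pd.DateOffset(1), pd.DateOffset(2)])
    # datetimes and periods get dedicated dtypes; offsets fall back to object
    print(s_dt.dtype, s_per.dtype, s_off.dtype)  # datetime64[ns] period[M] object

.. 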
ipython:: python - pd.Series(pd.period_range('1/1/2011', freq='M', periods=3)) + pd.Series(pd.period_range("1/1/2011", freq="M", periods=3)) pd.Series([pd.DateOffset(1), pd.DateOffset(2)]) - pd.Series(pd.date_range('1/1/2011', freq='M', periods=3)) + pd.Series(pd.date_range("1/1/2011", freq="M", periods=3)) Lastly, pandas represents null date times, time deltas, and time spans as ``NaT`` which is useful for representing missing or null date like values and behaves similar @@ -132,7 +133,7 @@ time. .. ipython:: python pd.Timestamp(datetime.datetime(2012, 5, 1)) - pd.Timestamp('2012-05-01') + pd.Timestamp("2012-05-01") pd.Timestamp(2012, 5, 1) However, in many cases it is more natural to associate things like change @@ -143,9 +144,9 @@ For example: .. ipython:: python - pd.Period('2011-01') + pd.Period("2011-01") - pd.Period('2012-05', freq='D') + pd.Period("2012-05", freq="D") :class:`Timestamp` and :class:`Period` can serve as an index. Lists of ``Timestamp`` and ``Period`` are automatically coerced to :class:`DatetimeIndex` @@ -153,9 +154,11 @@ and :class:`PeriodIndex` respectively. .. ipython:: python - dates = [pd.Timestamp('2012-05-01'), - pd.Timestamp('2012-05-02'), - pd.Timestamp('2012-05-03')] + dates = [ + pd.Timestamp("2012-05-01"), + pd.Timestamp("2012-05-02"), + pd.Timestamp("2012-05-03"), + ] ts = pd.Series(np.random.randn(3), dates) type(ts.index) @@ -163,7 +166,7 @@ and :class:`PeriodIndex` respectively. ts - periods = [pd.Period('2012-01'), pd.Period('2012-02'), pd.Period('2012-03')] + periods = [pd.Period("2012-01"), pd.Period("2012-02"), pd.Period("2012-03")] ts = pd.Series(np.random.randn(3), periods) @@ -193,18 +196,18 @@ is converted to a ``DatetimeIndex``: .. ipython:: python - pd.to_datetime(pd.Series(['Jul 31, 2009', '2010-01-10', None])) + pd.to_datetime(pd.Series(["Jul 31, 2009", "2010-01-10", None])) - pd.to_datetime(['2005/11/23', '2010.12.31']) + pd.to_datetime(["2005/11/23", "2010.12.31"]) If you use dates which start with the day first (i.e. European style), you can pass the ``dayfirst`` flag: .. ipython:: python - pd.to_datetime(['04-01-2012 10:00'], dayfirst=True) + pd.to_datetime(["04-01-2012 10:00"], dayfirst=True) - pd.to_datetime(['14-01-2012', '01-14-2012'], dayfirst=True) + pd.to_datetime(["14-01-2012", "01-14-2012"], dayfirst=True) .. warning:: @@ -218,22 +221,22 @@ options like ``dayfirst`` or ``format``, so use ``to_datetime`` if these are req .. ipython:: python - pd.to_datetime('2010/11/12') + pd.to_datetime("2010/11/12") - pd.Timestamp('2010/11/12') + pd.Timestamp("2010/11/12") You can also use the ``DatetimeIndex`` constructor directly: .. ipython:: python - pd.DatetimeIndex(['2018-01-01', '2018-01-03', '2018-01-05']) + pd.DatetimeIndex(["2018-01-01", "2018-01-03", "2018-01-05"]) The string 'infer' can be passed in order to set the frequency of the index as the inferred frequency upon creation: .. ipython:: python - pd.DatetimeIndex(['2018-01-01', '2018-01-03', '2018-01-05'], freq='infer') + pd.DatetimeIndex(["2018-01-01", "2018-01-03", "2018-01-05"], freq="infer") .. _timeseries.converting.format: @@ -245,9 +248,9 @@ This could also potentially speed up the conversion considerably. .. 
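
A minimal runnable example of an explicit ``format``:

.. code-block:: python

    import pandas as pd

    # an explicit format bypasses per-element format inference
    print(pd.to_datetime("12-11-2010 00:00", format="%d-%m-%Y %H:%M"))
    # Timestamp('2010-11-12 00:00:00')

.. 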
ipython:: python - pd.to_datetime('2010/11/12', format='%Y/%m/%d') + pd.to_datetime("2010/11/12", format="%Y/%m/%d") - pd.to_datetime('12-11-2010 00:00', format='%d-%m-%Y %H:%M') + pd.to_datetime("12-11-2010 00:00", format="%d-%m-%Y %H:%M") For more information on the choices available when specifying the ``format`` option, see the Python `datetime documentation`_. @@ -261,10 +264,9 @@ You can also pass a ``DataFrame`` of integer or string columns to assemble into .. ipython:: python - df = pd.DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5], - 'hour': [2, 3]}) + df = pd.DataFrame( + {"year": [2015, 2016], "month": [2, 3], "day": [4, 5], "hour": [2, 3]} + ) pd.to_datetime(df) @@ -272,7 +274,7 @@ You can pass only the columns that you need to assemble. .. ipython:: python - pd.to_datetime(df[['year', 'month', 'day']]) + pd.to_datetime(df[["year", "month", "day"]]) ``pd.to_datetime`` looks for standard designations of the datetime component in the column names, including: @@ -293,13 +295,13 @@ Pass ``errors='ignore'`` to return the original input when unparsable: .. ipython:: python - pd.to_datetime(['2009/07/31', 'asd'], errors='ignore') + pd.to_datetime(["2009/07/31", "asd"], errors="ignore") Pass ``errors='coerce'`` to convert unparsable data to ``NaT`` (not a time): .. ipython:: python - pd.to_datetime(['2009/07/31', 'asd'], errors='coerce') + pd.to_datetime(["2009/07/31", "asd"], errors="coerce") .. _timeseries.converting.epoch: @@ -315,11 +317,12 @@ which can be specified. These are computed from the starting point specified by .. ipython:: python - pd.to_datetime([1349720105, 1349806505, 1349892905, - 1349979305, 1350065705], unit='s') + pd.to_datetime([1349720105, 1349806505, 1349892905, 1349979305, 1350065705], unit="s") - pd.to_datetime([1349720105100, 1349720105200, 1349720105300, - 1349720105400, 1349720105500], unit='ms') + pd.to_datetime( + [1349720105100, 1349720105200, 1349720105300, 1349720105400, 1349720105500], + unit="ms", + ) .. note:: @@ -336,8 +339,8 @@ as timezone-naive timestamps and then localize to the appropriate timezone: .. ipython:: python - pd.Timestamp(1262347200000000000).tz_localize('US/Pacific') - pd.DatetimeIndex([1262347200000000000]).tz_localize('US/Pacific') + pd.Timestamp(1262347200000000000).tz_localize("US/Pacific") + pd.DatetimeIndex([1262347200000000000]).tz_localize("US/Pacific") .. note:: @@ -353,8 +356,8 @@ as timezone-naive timestamps and then localize to the appropriate timezone: .. ipython:: python - pd.to_datetime([1490195805.433, 1490195805.433502912], unit='s') - pd.to_datetime(1490195805433502912, unit='ns') + pd.to_datetime([1490195805.433, 1490195805.433502912], unit="s") + pd.to_datetime(1490195805433502912, unit="ns") .. seealso:: @@ -369,7 +372,7 @@ To invert the operation from above, namely, to convert from a ``Timestamp`` to a .. ipython:: python - stamps = pd.date_range('2012-10-08 18:15:05', periods=4, freq='D') + stamps = pd.date_range("2012-10-08 18:15:05", periods=4, freq="D") stamps We subtract the epoch (midnight at January 1, 1970 UTC) and then floor divide by the @@ -377,7 +380,7 @@ We subtract the epoch (midnight at January 1, 1970 UTC) and then floor divide by .. ipython:: python - (stamps - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s') + (stamps - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s") .. _timeseries.origin: @@ -389,14 +392,14 @@ of a ``DatetimeIndex``. For example, to use 1960-01-01 as the starting date: .. 
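
For the reverse direction, a compact sketch of the epoch round trip using the
subtraction idiom shown above:

.. code-block:: python

    import pandas as pd

    stamps = pd.to_datetime([1349720105, 1349806505], unit="s")
    # the inverse of unit="s": subtract the epoch, then floor-divide by 1 second
    secs = (stamps - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
    print(secs.tolist())  # [1349720105, 1349806505]

.. 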
ipython:: python - pd.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01')) + pd.to_datetime([1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01")) The default is set at ``origin='unix'``, which defaults to ``1970-01-01 00:00:00``. Commonly called 'unix epoch' or POSIX time. .. ipython:: python - pd.to_datetime([1, 2, 3], unit='D') + pd.to_datetime([1, 2, 3], unit="D") .. _timeseries.daterange: @@ -408,9 +411,11 @@ To generate an index with timestamps, you can use either the ``DatetimeIndex`` o .. ipython:: python - dates = [datetime.datetime(2012, 5, 1), - datetime.datetime(2012, 5, 2), - datetime.datetime(2012, 5, 3)] + dates = [ + datetime.datetime(2012, 5, 1), + datetime.datetime(2012, 5, 2), + datetime.datetime(2012, 5, 3), + ] # Note the frequency information index = pd.DatetimeIndex(dates) @@ -442,9 +447,9 @@ variety of :ref:`frequency aliases `: .. ipython:: python - pd.date_range(start, periods=1000, freq='M') + pd.date_range(start, periods=1000, freq="M") - pd.bdate_range(start, periods=250, freq='BQS') + pd.bdate_range(start, periods=250, freq="BQS") ``date_range`` and ``bdate_range`` make it easy to generate a range of dates using various combinations of parameters like ``start``, ``end``, ``periods``, @@ -453,9 +458,9 @@ of those specified will not be generated: .. ipython:: python - pd.date_range(start, end, freq='BM') + pd.date_range(start, end, freq="BM") - pd.date_range(start, end, freq='W') + pd.date_range(start, end, freq="W") pd.bdate_range(end=end, periods=20) @@ -467,9 +472,9 @@ resulting ``DatetimeIndex``: .. ipython:: python - pd.date_range('2018-01-01', '2018-01-05', periods=5) + pd.date_range("2018-01-01", "2018-01-05", periods=5) - pd.date_range('2018-01-01', '2018-01-05', periods=10) + pd.date_range("2018-01-01", "2018-01-05", periods=10) .. _timeseries.custom-freq-ranges: @@ -482,13 +487,13 @@ used if a custom frequency string is passed. .. ipython:: python - weekmask = 'Mon Wed Fri' + weekmask = "Mon Wed Fri" holidays = [datetime.datetime(2011, 1, 5), datetime.datetime(2011, 3, 14)] - pd.bdate_range(start, end, freq='C', weekmask=weekmask, holidays=holidays) + pd.bdate_range(start, end, freq="C", weekmask=weekmask, holidays=holidays) - pd.bdate_range(start, end, freq='CBMS', weekmask=weekmask) + pd.bdate_range(start, end, freq="CBMS", weekmask=weekmask) .. seealso:: @@ -545,7 +550,7 @@ intelligent functionality like selection, slicing, etc. .. ipython:: python - rng = pd.date_range(start, end, freq='BM') + rng = pd.date_range(start, end, freq="BM") ts = pd.Series(np.random.randn(len(rng)), index=rng) ts.index ts[:5].index @@ -560,20 +565,20 @@ Dates and strings that parse to timestamps can be passed as indexing parameters: .. ipython:: python - ts['1/31/2011'] + ts["1/31/2011"] ts[datetime.datetime(2011, 12, 25):] - ts['10/31/2011':'12/31/2011'] + ts["10/31/2011":"12/31/2011"] To provide convenience for accessing longer time series, you can also pass in the year or year and month as strings: .. ipython:: python - ts['2011'] + ts["2011"] - ts['2011-6'] + ts["2011-6"] This type of slicing will work on a ``DataFrame`` with a ``DatetimeIndex`` as well. Since the partial string selection is a form of label slicing, the endpoints **will be** included. This @@ -586,10 +591,13 @@ would include matching times on an included date: .. 
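
Before the larger example, a tiny sketch of day-level partial string selection
on a toy series:

.. code-block:: python

    import numpy as np
    import pandas as pd

    ts = pd.Series(
        np.arange(4), index=pd.date_range("2013-01-01", periods=4, freq="12H")
    )
    # a day-level string selects every timestamp on that date
    print(ts["2013-01-01"])  # the rows stamped 00:00 and 12:00 on Jan 1

.. 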
ipython:: python :okwarning: - dft = pd.DataFrame(np.random.randn(100000, 1), columns=['A'], - index=pd.date_range('20130101', periods=100000, freq='T')) + dft = pd.DataFrame( + np.random.randn(100000, 1), + columns=["A"], + index=pd.date_range("20130101", periods=100000, freq="T"), + ) dft - dft['2013'] + dft["2013"] This starts on the very first time in the month, and includes the last date and time for the month: @@ -597,43 +605,45 @@ time for the month: .. ipython:: python :okwarning: - dft['2013-1':'2013-2'] + dft["2013-1":"2013-2"] This specifies a stop time **that includes all of the times on the last day**: .. ipython:: python :okwarning: - dft['2013-1':'2013-2-28'] + dft["2013-1":"2013-2-28"] This specifies an **exact** stop time (and is not the same as the above): .. ipython:: python :okwarning: - dft['2013-1':'2013-2-28 00:00:00'] + dft["2013-1":"2013-2-28 00:00:00"] We are stopping on the included end-point as it is part of the index: .. ipython:: python :okwarning: - dft['2013-1-15':'2013-1-15 12:30:00'] + dft["2013-1-15":"2013-1-15 12:30:00"] ``DatetimeIndex`` partial string indexing also works on a ``DataFrame`` with a ``MultiIndex``: .. ipython:: python - dft2 = pd.DataFrame(np.random.randn(20, 1), - columns=['A'], - index=pd.MultiIndex.from_product( - [pd.date_range('20130101', periods=10, freq='12H'), - ['a', 'b']])) + dft2 = pd.DataFrame( + np.random.randn(20, 1), + columns=["A"], + index=pd.MultiIndex.from_product( + [pd.date_range("20130101", periods=10, freq="12H"), ["a", "b"]] + ), + ) dft2 - dft2.loc['2013-01-05'] + dft2.loc["2013-01-05"] idx = pd.IndexSlice dft2 = dft2.swaplevel(0, 1).sort_index() - dft2.loc[idx[:, '2013-01-05'], :] + dft2.loc[idx[:, "2013-01-05"], :] .. versionadded:: 0.25.0 @@ -642,9 +652,9 @@ Slicing with string indexing also honors UTC offset. .. ipython:: python :okwarning: - df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific')) + df = pd.DataFrame([0], index=pd.DatetimeIndex(["2019-01-01"], tz="US/Pacific")) df - df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00'] + df["2019-01-01 12:00:00+04:00":"2019-01-01 13:00:00+04:00"] .. _timeseries.slice_vs_exact_match: @@ -657,45 +667,48 @@ Consider a ``Series`` object with a minute resolution index: .. ipython:: python - series_minute = pd.Series([1, 2, 3], - pd.DatetimeIndex(['2011-12-31 23:59:00', - '2012-01-01 00:00:00', - '2012-01-01 00:02:00'])) + series_minute = pd.Series( + [1, 2, 3], + pd.DatetimeIndex( + ["2011-12-31 23:59:00", "2012-01-01 00:00:00", "2012-01-01 00:02:00"] + ), + ) series_minute.index.resolution A timestamp string less accurate than a minute gives a ``Series`` object. .. ipython:: python - series_minute['2011-12-31 23'] + series_minute["2011-12-31 23"] A timestamp string with minute resolution (or more accurate), gives a scalar instead, i.e. it is not casted to a slice. .. ipython:: python - series_minute['2011-12-31 23:59'] - series_minute['2011-12-31 23:59:00'] + series_minute["2011-12-31 23:59"] + series_minute["2011-12-31 23:59:00"] If index resolution is second, then the minute-accurate timestamp gives a ``Series``. .. 
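
A compact check of the slice-versus-scalar rule just described (toy data):

.. code-block:: python

    import pandas as pd

    s = pd.Series(
        [1, 2], pd.DatetimeIndex(["2011-12-31 23:59:00", "2012-01-01 00:00:00"])
    )
    # a string as precise as the index resolution is an exact (scalar) lookup,
    # while a coarser string is a partial-string slice returning a Series
    print(type(s["2011-12-31 23:59"]))  # a scalar
    print(type(s["2011-12-31 23"]))     # a Series

.. 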
ipython:: python - series_second = pd.Series([1, 2, 3], - pd.DatetimeIndex(['2011-12-31 23:59:59', - '2012-01-01 00:00:00', - '2012-01-01 00:00:01'])) + series_second = pd.Series( + [1, 2, 3], + pd.DatetimeIndex( + ["2011-12-31 23:59:59", "2012-01-01 00:00:00", "2012-01-01 00:00:01"] + ), + ) series_second.index.resolution - series_second['2011-12-31 23:59'] + series_second["2011-12-31 23:59"] If the timestamp string is treated as a slice, it can be used to index ``DataFrame`` with ``[]`` as well. .. ipython:: python :okwarning: - dft_minute = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, - index=series_minute.index) - dft_minute['2011-12-31 23'] + dft_minute = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=series_minute.index) + dft_minute["2011-12-31 23"] .. warning:: @@ -706,16 +719,17 @@ If the timestamp string is treated as a slice, it can be used to index ``DataFra .. ipython:: python - dft_minute.loc['2011-12-31 23:59'] + dft_minute.loc["2011-12-31 23:59"] Note also that ``DatetimeIndex`` resolution cannot be less precise than day. .. ipython:: python - series_monthly = pd.Series([1, 2, 3], - pd.DatetimeIndex(['2011-12', '2012-01', '2012-02'])) + series_monthly = pd.Series( + [1, 2, 3], pd.DatetimeIndex(["2011-12", "2012-01", "2012-02"]) + ) series_monthly.index.resolution - series_monthly['2011-12'] # returns Series + series_monthly["2011-12"] # returns Series Exact indexing @@ -727,14 +741,15 @@ These ``Timestamp`` and ``datetime`` objects have exact ``hours, minutes,`` and .. ipython:: python - dft[datetime.datetime(2013, 1, 1):datetime.datetime(2013, 2, 28)] + dft[datetime.datetime(2013, 1, 1): datetime.datetime(2013, 2, 28)] With no defaults. .. ipython:: python - dft[datetime.datetime(2013, 1, 1, 10, 12, 0): - datetime.datetime(2013, 2, 28, 10, 12, 0)] + dft[ + datetime.datetime(2013, 1, 1, 10, 12, 0): datetime.datetime(2013, 2, 28, 10, 12, 0) + ] Truncating & fancy indexing @@ -747,11 +762,11 @@ partially matching dates: .. ipython:: python - rng2 = pd.date_range('2011-01-01', '2012-01-01', freq='W') + rng2 = pd.date_range("2011-01-01", "2012-01-01", freq="W") ts2 = pd.Series(np.random.randn(len(rng2)), index=rng2) - ts2.truncate(before='2011-11', after='2011-12') - ts2['2011-11':'2011-12'] + ts2.truncate(before="2011-11", after="2011-12") + ts2["2011-11":"2011-12"] Even complicated fancy indexing that breaks the ``DatetimeIndex`` frequency regularity will result in a ``DatetimeIndex``, although frequency is lost: @@ -807,7 +822,7 @@ You may obtain the year, week and day components of the ISO year from the ISO 86 .. ipython:: python - idx = pd.date_range(start='2019-12-29', freq='D', periods=4) + idx = pd.date_range(start="2019-12-29", freq="D", periods=4) idx.isocalendar() idx.to_series().dt.isocalendar() @@ -837,12 +852,12 @@ arithmetic operator (``+``) or the ``apply`` method can be used to perform the s .. ipython:: python # This particular day contains a day light savings time transition - ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') + ts = pd.Timestamp("2016-10-30 00:00:00", tz="Europe/Helsinki") # Respects absolute time ts + pd.Timedelta(days=1) # Respects calendar time ts + pd.DateOffset(days=1) - friday = pd.Timestamp('2018-01-05') + friday = pd.Timestamp("2018-01-05") friday.day_name() # Add 2 business days (Friday --> Tuesday) two_business_days = 2 * pd.offsets.BDay() @@ -900,10 +915,10 @@ business offsets operate on the weekdays. .. 
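ipython:: python

     # a minimal sketch (``sat`` is an illustrative name): rolling a
     # Saturday to the nearest business day in either direction
     sat = pd.Timestamp("2018-01-06")
     pd.offsets.BDay().rollforward(sat)  # the following Monday
     pd.offsets.BDay().rollback(sat)  # the preceding Friday

.. 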
ipython:: python - ts = pd.Timestamp('2018-01-06 00:00:00') + ts = pd.Timestamp("2018-01-06 00:00:00") ts.day_name() # BusinessHour's valid offset dates are Monday through Friday - offset = pd.offsets.BusinessHour(start='09:00') + offset = pd.offsets.BusinessHour(start="09:00") # Bring the date to the closest offset date (Monday) offset.rollforward(ts) # Date is brought to the closest offset date first and then the hour is added @@ -916,12 +931,12 @@ in the operation). .. ipython:: python - ts = pd.Timestamp('2014-01-01 09:00') + ts = pd.Timestamp("2014-01-01 09:00") day = pd.offsets.Day() day.apply(ts) day.apply(ts).normalize() - ts = pd.Timestamp('2014-01-01 22:00') + ts = pd.Timestamp("2014-01-01 22:00") hour = pd.offsets.Hour() hour.apply(ts) hour.apply(ts).normalize() @@ -974,7 +989,7 @@ apply the offset to each element. .. ipython:: python - rng = pd.date_range('2012-01-01', '2012-01-03') + rng = pd.date_range("2012-01-01", "2012-01-03") s = pd.Series(rng) rng rng + pd.DateOffset(months=2) @@ -989,7 +1004,7 @@ used exactly like a ``Timedelta`` - see the .. ipython:: python s - pd.offsets.Day(2) - td = s - pd.Series(pd.date_range('2011-12-29', '2011-12-31')) + td = s - pd.Series(pd.date_range("2011-12-29", "2011-12-31")) td td + pd.offsets.Minute(15) @@ -1016,16 +1031,13 @@ As an interesting example, let's look at Egypt where a Friday-Saturday weekend i .. ipython:: python - weekmask_egypt = 'Sun Mon Tue Wed Thu' + weekmask_egypt = "Sun Mon Tue Wed Thu" # They also observe International Workers' Day so let's # add that for a couple of years - holidays = ['2012-05-01', - datetime.datetime(2013, 5, 1), - np.datetime64('2014-05-01')] - bday_egypt = pd.offsets.CustomBusinessDay(holidays=holidays, - weekmask=weekmask_egypt) + holidays = ["2012-05-01", datetime.datetime(2013, 5, 1), np.datetime64("2014-05-01")] + bday_egypt = pd.offsets.CustomBusinessDay(holidays=holidays, weekmask=weekmask_egypt) dt = datetime.datetime(2013, 4, 30) dt + 2 * bday_egypt @@ -1035,8 +1047,7 @@ Let's map to the weekday names: dts = pd.date_range(dt, periods=5, freq=bday_egypt) - pd.Series(dts.weekday, dts).map( - pd.Series('Mon Tue Wed Thu Fri Sat Sun'.split())) + pd.Series(dts.weekday, dts).map(pd.Series("Mon Tue Wed Thu Fri Sat Sun".split())) Holiday calendars can be used to provide the list of holidays. See the :ref:`holiday calendar` section for more information. @@ -1058,15 +1069,14 @@ in the usual way. .. ipython:: python - bmth_us = pd.offsets.CustomBusinessMonthBegin( - calendar=USFederalHolidayCalendar()) + bmth_us = pd.offsets.CustomBusinessMonthBegin(calendar=USFederalHolidayCalendar()) # Skip new years dt = datetime.datetime(2013, 12, 17) dt + bmth_us # Define date index with custom offset - pd.date_range(start='20100101', end='20120101', freq=bmth_us) + pd.date_range(start="20100101", end="20120101", freq=bmth_us) .. note:: @@ -1097,23 +1107,23 @@ hours are added to the next business day. 
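     # the default BusinessHour covers 09:00 to 17:00 on weekdays, so any
     # addition that passes 17:00 rolls over into the next business day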
bh

     # 2014-08-01 is Friday
-    pd.Timestamp('2014-08-01 10:00').weekday()
-    pd.Timestamp('2014-08-01 10:00') + bh
+    pd.Timestamp("2014-08-01 10:00").weekday()
+    pd.Timestamp("2014-08-01 10:00") + bh

     # Below example is the same as: pd.Timestamp('2014-08-01 09:00') + bh
-    pd.Timestamp('2014-08-01 08:00') + bh
+    pd.Timestamp("2014-08-01 08:00") + bh

     # If the result is on the end time, move to the next business day
-    pd.Timestamp('2014-08-01 16:00') + bh
+    pd.Timestamp("2014-08-01 16:00") + bh

     # The remainder is added to the next business day
-    pd.Timestamp('2014-08-01 16:30') + bh
+    pd.Timestamp("2014-08-01 16:30") + bh

     # Adding 2 business hours
-    pd.Timestamp('2014-08-01 10:00') + pd.offsets.BusinessHour(2)
+    pd.Timestamp("2014-08-01 10:00") + pd.offsets.BusinessHour(2)

     # Subtracting 3 business hours
-    pd.Timestamp('2014-08-01 10:00') + pd.offsets.BusinessHour(-3)
+    pd.Timestamp("2014-08-01 10:00") + pd.offsets.BusinessHour(-3)

 You can also specify ``start`` and ``end`` times by keywords. The argument must
 be a ``str`` with an ``hour:minute`` representation or a ``datetime.time``
 instance. Specifying seconds, microseconds or nanoseconds in the business hour
 results in ``ValueError``.

 .. ipython:: python

-    bh = pd.offsets.BusinessHour(start='11:00', end=datetime.time(20, 0))
+    bh = pd.offsets.BusinessHour(start="11:00", end=datetime.time(20, 0))
     bh

-    pd.Timestamp('2014-08-01 13:00') + bh
-    pd.Timestamp('2014-08-01 09:00') + bh
-    pd.Timestamp('2014-08-01 18:00') + bh
+    pd.Timestamp("2014-08-01 13:00") + bh
+    pd.Timestamp("2014-08-01 09:00") + bh
+    pd.Timestamp("2014-08-01 18:00") + bh

 Passing a ``start`` time later than ``end`` represents a business hour that
 spans midnight. In this case, the business hours extend past midnight and
 overlap into the next day. Valid business hours are distinguished by whether
 they start from a valid ``BusinessDay``.

 .. ipython:: python

-    bh = pd.offsets.BusinessHour(start='17:00', end='09:00')
+    bh = pd.offsets.BusinessHour(start="17:00", end="09:00")
     bh

-    pd.Timestamp('2014-08-01 17:00') + bh
-    pd.Timestamp('2014-08-01 23:00') + bh
+    pd.Timestamp("2014-08-01 17:00") + bh
+    pd.Timestamp("2014-08-01 23:00") + bh

     # Although 2014-08-02 is Saturday,
     # it is valid because it starts from 08-01 (Friday).
-    pd.Timestamp('2014-08-02 04:00') + bh
+    pd.Timestamp("2014-08-02 04:00") + bh

     # Although 2014-08-04 is Monday,
     # it is out of business hours because it starts from 08-03 (Sunday).
-    pd.Timestamp('2014-08-04 04:00') + bh
+    pd.Timestamp("2014-08-04 04:00") + bh

 Applying ``BusinessHour.rollforward`` and ``rollback`` to times outside business
 hours results in the next business hour start or the previous day's end. Unlike
 other offsets, ``BusinessHour.rollforward`` may, by definition, output a
 different result from ``apply``.

 This is because one day's business hour end is equal to the next day's business
 hour start. For example, under the default business hours (9:00 - 17:00), there
 is no gap (0 minutes) between ``2014-08-01 17:00`` and ``2014-08-04 09:00``.

 .. ipython:: python

     # This adjusts a Timestamp to the business hour edge
-    pd.offsets.BusinessHour().rollback(pd.Timestamp('2014-08-02 15:00'))
-    pd.offsets.BusinessHour().rollforward(pd.Timestamp('2014-08-02 15:00'))
+    pd.offsets.BusinessHour().rollback(pd.Timestamp("2014-08-02 15:00"))
+    pd.offsets.BusinessHour().rollforward(pd.Timestamp("2014-08-02 15:00"))

     # It is the same as BusinessHour().apply(pd.Timestamp('2014-08-01 17:00')).
     # And it is the same as BusinessHour().apply(pd.Timestamp('2014-08-04 09:00'))
-    pd.offsets.BusinessHour().apply(pd.Timestamp('2014-08-02 15:00'))
+    pd.offsets.BusinessHour().apply(pd.Timestamp("2014-08-02 15:00"))

     # BusinessDay results (for reference)
-    pd.offsets.BusinessHour().rollforward(pd.Timestamp('2014-08-02'))
+    pd.offsets.BusinessHour().rollforward(pd.Timestamp("2014-08-02"))

     # It is the same as BusinessDay().apply(pd.Timestamp('2014-08-01'))
     # The result is the same as rollforward because BusinessDay never overlaps.
-    pd.offsets.BusinessHour().apply(pd.Timestamp('2014-08-02'))
+    pd.offsets.BusinessHour().apply(pd.Timestamp("2014-08-02"))

 ``BusinessHour`` regards Saturday and Sunday as holidays. To use arbitrary
 holidays, you can use the ``CustomBusinessHour`` offset, as explained in the
 following subsection.

 .. _timeseries.custombusinesshour:

as ``BusinessHour`` except that it skips specified custom holidays.

 .. ipython:: python

     from pandas.tseries.holiday import USFederalHolidayCalendar
+
     bhour_us = pd.offsets.CustomBusinessHour(calendar=USFederalHolidayCalendar())
     # Friday before MLK Day
     dt = datetime.datetime(2014, 1, 17, 15)
     dt + bhour_us
     # Tuesday after MLK Day (Monday is skipped because it's a holiday)
     dt + bhour_us * 2

 You can use keyword arguments supported by either ``BusinessHour`` or
 ``CustomBusinessDay``.

 .. ipython:: python

-    bhour_mon = pd.offsets.CustomBusinessHour(start='10:00',
-                                              weekmask='Tue Wed Thu Fri')
+    bhour_mon = pd.offsets.CustomBusinessHour(start="10:00", weekmask="Tue Wed Thu Fri")

     # Monday is skipped because it's a holiday, business hour starts from 10:00
     dt + bhour_mon * 2

most functions:

 .. ipython:: python

-    pd.date_range(start, periods=5, freq='B')
+    pd.date_range(start, periods=5, freq="B")

     pd.date_range(start, periods=5, freq=pd.offsets.BDay())

 You can combine together day and intraday offsets:

 .. ipython:: python

-    pd.date_range(start, periods=10, freq='2h20min')
+    pd.date_range(start, periods=10, freq="2h20min")

-    pd.date_range(start, periods=10, freq='1D10U')
+    pd.date_range(start, periods=10, freq="1D10U")

 Anchored offsets
 ~~~~~~~~~~~~~~~~

anchor point, and moved ``|n|-1`` additional steps forwards or backwards.

 .. ipython:: python

-    pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=1)
-    pd.Timestamp('2014-01-02') + pd.offsets.MonthEnd(n=1)
+    pd.Timestamp("2014-01-02") + pd.offsets.MonthBegin(n=1)
+    pd.Timestamp("2014-01-02") + pd.offsets.MonthEnd(n=1)

-    pd.Timestamp('2014-01-02') - pd.offsets.MonthBegin(n=1)
-    pd.Timestamp('2014-01-02') - pd.offsets.MonthEnd(n=1)
+    pd.Timestamp("2014-01-02") - pd.offsets.MonthBegin(n=1)
+    pd.Timestamp("2014-01-02") - pd.offsets.MonthEnd(n=1)

-    pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=4)
-    pd.Timestamp('2014-01-02') - pd.offsets.MonthBegin(n=4)
+    pd.Timestamp("2014-01-02") + pd.offsets.MonthBegin(n=4)
+    pd.Timestamp("2014-01-02") - pd.offsets.MonthBegin(n=4)

 If the given date *is* on an anchor point, it is moved ``|n|`` points forwards
 or backwards.

 .. ipython:: python

-    pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=1)
-    pd.Timestamp('2014-01-31') + pd.offsets.MonthEnd(n=1)
+    pd.Timestamp("2014-01-01") + pd.offsets.MonthBegin(n=1)
+    pd.Timestamp("2014-01-31") + pd.offsets.MonthEnd(n=1)

-    pd.Timestamp('2014-01-01') - pd.offsets.MonthBegin(n=1)
-    pd.Timestamp('2014-01-31') - pd.offsets.MonthEnd(n=1)
+    pd.Timestamp("2014-01-01") - pd.offsets.MonthBegin(n=1)
+    pd.Timestamp("2014-01-31") - pd.offsets.MonthEnd(n=1)

-    pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=4)
-    pd.Timestamp('2014-01-31') - pd.offsets.MonthBegin(n=4)
+    pd.Timestamp("2014-01-01") + pd.offsets.MonthBegin(n=4)
+    pd.Timestamp("2014-01-31") - pd.offsets.MonthBegin(n=4)

 For the case when ``n=0``, the date is not moved if on an anchor point, otherwise
 it is rolled forward to the next anchor point.

 .. ipython:: python

-    pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=0)
-    pd.Timestamp('2014-01-02') + pd.offsets.MonthEnd(n=0)
+    pd.Timestamp("2014-01-02") + pd.offsets.MonthBegin(n=0)
+    pd.Timestamp("2014-01-02") + pd.offsets.MonthEnd(n=0)

-    pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=0)
-    pd.Timestamp('2014-01-31') + pd.offsets.MonthEnd(n=0)
+    pd.Timestamp("2014-01-01") + pd.offsets.MonthBegin(n=0)
+    pd.Timestamp("2014-01-31") + pd.offsets.MonthEnd(n=0)

 .. _timeseries.holiday:

An example of how holidays and holiday calendars are defined:

 .. ipython:: python

-    from pandas.tseries.holiday import Holiday, USMemorialDay,\
-        AbstractHolidayCalendar, nearest_workday, MO
+    from pandas.tseries.holiday import (
+        Holiday,
+        USMemorialDay,
+        AbstractHolidayCalendar,
+        nearest_workday,
+        MO,
+    )
+
+
     class ExampleCalendar(AbstractHolidayCalendar):
         rules = [
             USMemorialDay,
-            Holiday('July 4th', month=7, day=4, observance=nearest_workday),
-            Holiday('Columbus Day', month=10, day=1,
-                    offset=pd.DateOffset(weekday=MO(2)))]
+            Holiday("July 4th", month=7, day=4, observance=nearest_workday),
+            Holiday("Columbus Day", month=10, day=1, offset=pd.DateOffset(weekday=MO(2))),
+        ]
+
+
     cal = ExampleCalendar()
     cal.holidays(datetime.datetime(2012, 1, 1), datetime.datetime(2012, 12, 31))

or ``Timestamp`` objects.

 .. ipython:: python

-    pd.date_range(start='7/1/2012', end='7/10/2012',
-                  freq=pd.offsets.CDay(calendar=cal)).to_pydatetime()
+    pd.date_range(
+        start="7/1/2012", end="7/10/2012", freq=pd.offsets.CDay(calendar=cal)
+    ).to_pydatetime()
     offset = pd.offsets.CustomBusinessDay(calendar=cal)
     datetime.datetime(2012, 5, 25) + offset
     datetime.datetime(2012, 7, 3) + offset

or calendars with additional rules.

 .. ipython:: python

-    from pandas.tseries.holiday import get_calendar, HolidayCalendarFactory,\
-        USLaborDay
-    cal = get_calendar('ExampleCalendar')
+    from pandas.tseries.holiday import get_calendar, HolidayCalendarFactory, USLaborDay
+
+    cal = get_calendar("ExampleCalendar")
     cal.rules
-    new_cal = HolidayCalendarFactory('NewExampleCalendar', cal, USLaborDay)
+    new_cal = HolidayCalendarFactory("NewExampleCalendar", cal, USLaborDay)
     new_cal.rules

 .. _timeseries.advanced_datetime:

rather than changing the alignment of the data and the index:

 .. ipython:: python

-    ts.shift(5, freq='D')
+    ts.shift(5, freq="D")
     ts.shift(5, freq=pd.offsets.BDay())
-    ts.shift(5, freq='BM')
+    ts.shift(5, freq="BM")

 Note that when ``freq`` is specified, the leading entry is no longer NaN
 because the data is not being realigned.

calls ``reindex``.

 .. 
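ipython:: python

     # a minimal sketch (``sketch`` is an illustrative name): asfreq("D")
     # fills in the missing calendar days with NaN, much like a reindex
     sketch = pd.Series([1.0, 2.0], index=pd.to_datetime(["2020-01-01", "2020-01-07"]))
     sketch.asfreq("D")

.. 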
ipython:: python - dr = pd.date_range('1/1/2010', periods=3, freq=3 * pd.offsets.BDay()) + dr = pd.date_range("1/1/2010", periods=3, freq=3 * pd.offsets.BDay()) ts = pd.Series(np.random.randn(3), index=dr) ts ts.asfreq(pd.offsets.BDay()) @@ -1511,7 +1530,7 @@ method for any gaps that may appear after the frequency conversion. .. ipython:: python - ts.asfreq(pd.offsets.BDay(), method='pad') + ts.asfreq(pd.offsets.BDay(), method="pad") Filling forward / backward ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1552,11 +1571,11 @@ Basics .. ipython:: python - rng = pd.date_range('1/1/2012', periods=100, freq='S') + rng = pd.date_range("1/1/2012", periods=100, freq="S") ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) - ts.resample('5Min').sum() + ts.resample("5Min").sum() The ``resample`` function is very flexible and allows you to specify many different parameters to control the frequency conversion and resampling @@ -1568,11 +1587,11 @@ a method of the returned object, including ``sum``, ``mean``, ``std``, ``sem``, .. ipython:: python - ts.resample('5Min').mean() + ts.resample("5Min").mean() - ts.resample('5Min').ohlc() + ts.resample("5Min").ohlc() - ts.resample('5Min').max() + ts.resample("5Min").max() For downsampling, ``closed`` can be set to 'left' or 'right' to specify which @@ -1580,9 +1599,9 @@ end of the interval is closed: .. ipython:: python - ts.resample('5Min', closed='right').mean() + ts.resample("5Min", closed="right").mean() - ts.resample('5Min', closed='left').mean() + ts.resample("5Min", closed="left").mean() Parameters like ``label`` are used to manipulate the resulting labels. ``label`` specifies whether the result is labeled with the beginning or @@ -1590,9 +1609,9 @@ the end of the interval. .. ipython:: python - ts.resample('5Min').mean() # by default label='left' + ts.resample("5Min").mean() # by default label='left' - ts.resample('5Min', label='left').mean() + ts.resample("5Min", label="left").mean() .. warning:: @@ -1606,12 +1625,12 @@ the end of the interval. .. ipython:: python - s = pd.date_range('2000-01-01', '2000-01-05').to_series() + s = pd.date_range("2000-01-01", "2000-01-05").to_series() s.iloc[2] = pd.NaT s.dt.day_name() # default: label='left', closed='left' - s.resample('B').last().dt.day_name() + s.resample("B").last().dt.day_name() Notice how the value for Sunday got pulled back to the previous Friday. To get the behavior where the value for Sunday is pushed to Monday, use @@ -1619,7 +1638,7 @@ the end of the interval. .. ipython:: python - s.resample('B', label='right', closed='right').last().dt.day_name() + s.resample("B", label="right", closed="right").last().dt.day_name() The ``axis`` parameter can be set to 0 or 1 and allows you to resample the specified axis for a ``DataFrame``. @@ -1642,11 +1661,11 @@ For upsampling, you can specify a way to upsample and the ``limit`` parameter to # from secondly to every 250 milliseconds - ts[:2].resample('250L').asfreq() + ts[:2].resample("250L").asfreq() - ts[:2].resample('250L').ffill() + ts[:2].resample("250L").ffill() - ts[:2].resample('250L').ffill(limit=2) + ts[:2].resample("250L").ffill(limit=2) Sparse resampling ~~~~~~~~~~~~~~~~~ @@ -1662,14 +1681,14 @@ resample only the groups that are not all ``NaN``. .. ipython:: python - rng = pd.date_range('2014-1-1', periods=100, freq='D') + pd.Timedelta('1s') + rng = pd.date_range("2014-1-1", periods=100, freq="D") + pd.Timedelta("1s") ts = pd.Series(range(100), index=rng) If we want to resample to the full range of the series: .. 
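ipython:: python

     # resampling the full range builds a 3-minute bin between the first and
     # last points of ``ts`` defined above, i.e. tens of thousands of mostly
     # empty groups for ~100 days of data
     len(ts.resample("3T").sum())

.. 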
ipython:: python - ts.resample('3T').sum() + ts.resample("3T").sum() We can instead only resample those groups where we have points as follows: @@ -1678,12 +1697,14 @@ We can instead only resample those groups where we have points as follows: from functools import partial from pandas.tseries.frequencies import to_offset + def round(t, freq): # round a Timestamp to a specified freq freq = to_offset(freq) return pd.Timestamp((t.value // freq.delta.value) * freq.delta.value) - ts.groupby(partial(round, freq='3T')).sum() + + ts.groupby(partial(round, freq="3T")).sum() .. _timeseries.aggregate: @@ -1697,25 +1718,27 @@ Resampling a ``DataFrame``, the default will be to act on all columns with the s .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 3), - index=pd.date_range('1/1/2012', freq='S', periods=1000), - columns=['A', 'B', 'C']) - r = df.resample('3T') + df = pd.DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2012", freq="S", periods=1000), + columns=["A", "B", "C"], + ) + r = df.resample("3T") r.mean() We can select a specific column or columns using standard getitem. .. ipython:: python - r['A'].mean() + r["A"].mean() - r[['A', 'B']].mean() + r[["A", "B"]].mean() You can pass a list or dict of functions to do aggregation with, outputting a ``DataFrame``: .. ipython:: python - r['A'].agg([np.sum, np.mean, np.std]) + r["A"].agg([np.sum, np.mean, np.std]) On a resampled ``DataFrame``, you can pass a list of functions to apply to each column, which produces an aggregated result with a hierarchical index: @@ -1730,21 +1753,20 @@ columns of a ``DataFrame``: .. ipython:: python :okexcept: - r.agg({'A': np.sum, - 'B': lambda x: np.std(x, ddof=1)}) + r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid it must be implemented on the resampled object: .. ipython:: python - r.agg({'A': 'sum', 'B': 'std'}) + r.agg({"A": "sum", "B": "std"}) Furthermore, you can also specify multiple aggregation functions for each column separately. .. ipython:: python - r.agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) + r.agg({"A": ["sum", "std"], "B": ["mean", "std"]}) If a ``DataFrame`` does not have a datetimelike index, but instead you want @@ -1753,14 +1775,15 @@ to resample based on datetimelike column in the frame, it can passed to the .. ipython:: python - df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5), - 'a': np.arange(5)}, - index=pd.MultiIndex.from_arrays([ - [1, 2, 3, 4, 5], - pd.date_range('2015-01-01', freq='W', periods=5)], - names=['v', 'd'])) + df = pd.DataFrame( + {"date": pd.date_range("2015-01-01", freq="W", periods=5), "a": np.arange(5)}, + index=pd.MultiIndex.from_arrays( + [[1, 2, 3, 4, 5], pd.date_range("2015-01-01", freq="W", periods=5)], + names=["v", "d"], + ), + ) df - df.resample('M', on='date').sum() + df.resample("M", on="date").sum() Similarly, if you instead want to resample by a datetimelike level of ``MultiIndex``, its name or location can be passed to the @@ -1768,7 +1791,7 @@ level of ``MultiIndex``, its name or location can be passed to the .. ipython:: python - df.resample('M', level='d').sum() + df.resample("M", level="d").sum() .. 
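ipython:: python

     # the level-based resample above should be equivalent to grouping the
     # same ``df`` with a pd.Grouper on that datetimelike index level
     df.groupby(pd.Grouper(freq="M", level="d")).sum()

.. 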
_timeseries.iterating-label: @@ -1782,14 +1805,18 @@ natural and functions similarly to :py:func:`itertools.groupby`: small = pd.Series( range(6), - index=pd.to_datetime(['2017-01-01T00:00:00', - '2017-01-01T00:30:00', - '2017-01-01T00:31:00', - '2017-01-01T01:00:00', - '2017-01-01T03:00:00', - '2017-01-01T03:05:00']) + index=pd.to_datetime( + [ + "2017-01-01T00:00:00", + "2017-01-01T00:30:00", + "2017-01-01T00:31:00", + "2017-01-01T01:00:00", + "2017-01-01T03:00:00", + "2017-01-01T03:05:00", + ] + ), ) - resampled = small.resample('H') + resampled = small.resample("H") for name, group in resampled: print("Group: ", name) @@ -1811,9 +1838,9 @@ For example: .. ipython:: python - start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' - middle = '2000-10-02 00:00:00' - rng = pd.date_range(start, end, freq='7min') + start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" + middle = "2000-10-02 00:00:00" + rng = pd.date_range(start, end, freq="7min") ts = pd.Series(np.arange(len(rng)) * 3, index=rng) ts @@ -1821,32 +1848,32 @@ Here we can see that, when using ``origin`` with its default value (``'start_day .. ipython:: python - ts.resample('17min', origin='start_day').sum() - ts[middle:end].resample('17min', origin='start_day').sum() + ts.resample("17min", origin="start_day").sum() + ts[middle:end].resample("17min", origin="start_day").sum() Here we can see that, when setting ``origin`` to ``'epoch'``, the result after ``'2000-10-02 00:00:00'`` are identical depending on the start of time series: .. ipython:: python - ts.resample('17min', origin='epoch').sum() - ts[middle:end].resample('17min', origin='epoch').sum() + ts.resample("17min", origin="epoch").sum() + ts[middle:end].resample("17min", origin="epoch").sum() If needed you can use a custom timestamp for ``origin``: .. ipython:: python - ts.resample('17min', origin='2001-01-01').sum() - ts[middle:end].resample('17min', origin=pd.Timestamp('2001-01-01')).sum() + ts.resample("17min", origin="2001-01-01").sum() + ts[middle:end].resample("17min", origin=pd.Timestamp("2001-01-01")).sum() If needed you can just adjust the bins with an ``offset`` Timedelta that would be added to the default ``origin``. Those two examples are equivalent for this time series: .. ipython:: python - ts.resample('17min', origin='start').sum() - ts.resample('17min', offset='23h30min').sum() + ts.resample("17min", origin="start").sum() + ts.resample("17min", offset="23h30min").sum() Note the use of ``'start'`` for ``origin`` on the last example. In that case, ``origin`` will be set to the first value of the timeseries. @@ -1869,37 +1896,37 @@ Because ``freq`` represents a span of ``Period``, it cannot be negative like "-3 .. ipython:: python - pd.Period('2012', freq='A-DEC') + pd.Period("2012", freq="A-DEC") - pd.Period('2012-1-1', freq='D') + pd.Period("2012-1-1", freq="D") - pd.Period('2012-1-1 19:00', freq='H') + pd.Period("2012-1-1 19:00", freq="H") - pd.Period('2012-1-1 19:00', freq='5H') + pd.Period("2012-1-1 19:00", freq="5H") Adding and subtracting integers from periods shifts the period by its own frequency. Arithmetic is not allowed between ``Period`` with different ``freq`` (span). .. 
ipython:: python - p = pd.Period('2012', freq='A-DEC') + p = pd.Period("2012", freq="A-DEC") p + 1 p - 3 - p = pd.Period('2012-01', freq='2M') + p = pd.Period("2012-01", freq="2M") p + 2 p - 1 @okexcept - p == pd.Period('2012-01', freq='3M') + p == pd.Period("2012-01", freq="3M") If ``Period`` freq is daily or higher (``D``, ``H``, ``T``, ``S``, ``L``, ``U``, ``N``), ``offsets`` and ``timedelta``-like can be added if the result can have the same freq. Otherwise, ``ValueError`` will be raised. .. ipython:: python - p = pd.Period('2014-07-01 09:00', freq='H') + p = pd.Period("2014-07-01 09:00", freq="H") p + pd.offsets.Hour(2) p + datetime.timedelta(minutes=120) - p + np.timedelta64(7200, 's') + p + np.timedelta64(7200, "s") .. code-block:: ipython @@ -1912,7 +1939,7 @@ If ``Period`` has other frequencies, only the same ``offsets`` can be added. Oth .. ipython:: python - p = pd.Period('2014-07', freq='M') + p = pd.Period("2014-07", freq="M") p + pd.offsets.MonthEnd(3) .. code-block:: ipython @@ -1927,7 +1954,7 @@ return the number of frequency units between them: .. ipython:: python - pd.Period('2012', freq='A-DEC') - pd.Period('2002', freq='A-DEC') + pd.Period("2012", freq="A-DEC") - pd.Period("2002", freq="A-DEC") PeriodIndex and period_range ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1936,21 +1963,21 @@ which can be constructed using the ``period_range`` convenience function: .. ipython:: python - prng = pd.period_range('1/1/2011', '1/1/2012', freq='M') + prng = pd.period_range("1/1/2011", "1/1/2012", freq="M") prng The ``PeriodIndex`` constructor can also be used directly: .. ipython:: python - pd.PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M') + pd.PeriodIndex(["2011-1", "2011-2", "2011-3"], freq="M") Passing multiplied frequency outputs a sequence of ``Period`` which has multiplied span. .. ipython:: python - pd.period_range(start='2014-01', freq='3M', periods=4) + pd.period_range(start="2014-01", freq="3M", periods=4) If ``start`` or ``end`` are ``Period`` objects, they will be used as anchor endpoints for a ``PeriodIndex`` with frequency matching that of the @@ -1958,8 +1985,9 @@ endpoints for a ``PeriodIndex`` with frequency matching that of the .. ipython:: python - pd.period_range(start=pd.Period('2017Q1', freq='Q'), - end=pd.Period('2017Q2', freq='Q'), freq='M') + pd.period_range( + start=pd.Period("2017Q1", freq="Q"), end=pd.Period("2017Q2", freq="Q"), freq="M" + ) Just like ``DatetimeIndex``, a ``PeriodIndex`` can also be used to index pandas objects: @@ -1973,11 +2001,11 @@ objects: .. ipython:: python - idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H') + idx = pd.period_range("2014-07-01 09:00", periods=5, freq="H") idx idx + pd.offsets.Hour(2) - idx = pd.period_range('2014-07', periods=5, freq='M') + idx = pd.period_range("2014-07", periods=5, freq="M") idx idx + pd.offsets.MonthEnd(3) @@ -1996,7 +2024,7 @@ The ``period`` dtype holds the ``freq`` attribute and is represented with .. ipython:: python - pi = pd.period_range('2016-01-01', periods=3, freq='M') + pi = pd.period_range("2016-01-01", periods=3, freq="M") pi pi.dtype @@ -2007,15 +2035,15 @@ The ``period`` dtype can be used in ``.astype(...)``. It allows one to change th .. 
ipython:: python # change monthly freq to daily freq - pi.astype('period[D]') + pi.astype("period[D]") # convert to DatetimeIndex - pi.astype('datetime64[ns]') + pi.astype("datetime64[ns]") # convert to PeriodIndex - dti = pd.date_range('2011-01-01', freq='M', periods=3) + dti = pd.date_range("2011-01-01", freq="M", periods=3) dti - dti.astype('period[M]') + dti.astype("period[M]") PeriodIndex partial string indexing @@ -2029,32 +2057,32 @@ You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodI .. ipython:: python - ps['2011-01'] + ps["2011-01"] ps[datetime.datetime(2011, 12, 25):] - ps['10/31/2011':'12/31/2011'] + ps["10/31/2011":"12/31/2011"] Passing a string representing a lower frequency than ``PeriodIndex`` returns partial sliced data. .. ipython:: python :okwarning: - ps['2011'] + ps["2011"] - dfp = pd.DataFrame(np.random.randn(600, 1), - columns=['A'], - index=pd.period_range('2013-01-01 9:00', - periods=600, - freq='T')) + dfp = pd.DataFrame( + np.random.randn(600, 1), + columns=["A"], + index=pd.period_range("2013-01-01 9:00", periods=600, freq="T"), + ) dfp - dfp['2013-01-01 10H'] + dfp["2013-01-01 10H"] As with ``DatetimeIndex``, the endpoints will be included in the result. The example below slices data starting from 10:00 to 11:59. .. ipython:: python - dfp['2013-01-01 10H':'2013-01-01 11H'] + dfp["2013-01-01 10H":"2013-01-01 11H"] Frequency conversion and resampling with PeriodIndex @@ -2064,7 +2092,7 @@ method. Let's start with the fiscal year 2011, ending in December: .. ipython:: python - p = pd.Period('2011', freq='A-DEC') + p = pd.Period("2011", freq="A-DEC") p We can convert it to a monthly frequency. Using the ``how`` parameter, we can @@ -2072,16 +2100,16 @@ specify whether to return the starting or ending month: .. ipython:: python - p.asfreq('M', how='start') + p.asfreq("M", how="start") - p.asfreq('M', how='end') + p.asfreq("M", how="end") The shorthands 's' and 'e' are provided for convenience: .. ipython:: python - p.asfreq('M', 's') - p.asfreq('M', 'e') + p.asfreq("M", "s") + p.asfreq("M", "e") Converting to a "super-period" (e.g., annual frequency is a super-period of quarterly frequency) automatically returns the super-period that includes the @@ -2089,9 +2117,9 @@ input period: .. ipython:: python - p = pd.Period('2011-12', freq='M') + p = pd.Period("2011-12", freq="M") - p.asfreq('A-NOV') + p.asfreq("A-NOV") Note that since we converted to an annual frequency that ends the year in November, the monthly period of December 2011 is actually in the 2012 A-NOV @@ -2110,21 +2138,21 @@ frequencies ``Q-JAN`` through ``Q-DEC``. .. ipython:: python - p = pd.Period('2012Q1', freq='Q-DEC') + p = pd.Period("2012Q1", freq="Q-DEC") - p.asfreq('D', 's') + p.asfreq("D", "s") - p.asfreq('D', 'e') + p.asfreq("D", "e") ``Q-MAR`` defines fiscal year end in March: .. ipython:: python - p = pd.Period('2011Q4', freq='Q-MAR') + p = pd.Period("2011Q4", freq="Q-MAR") - p.asfreq('D', 's') + p.asfreq("D", "s") - p.asfreq('D', 'e') + p.asfreq("D", "e") .. _timeseries.interchange: @@ -2136,7 +2164,7 @@ and vice-versa using ``to_timestamp``: .. ipython:: python - rng = pd.date_range('1/1/2012', periods=5, freq='M') + rng = pd.date_range("1/1/2012", periods=5, freq="M") ts = pd.Series(np.random.randn(len(rng)), index=rng) @@ -2153,7 +2181,7 @@ end of the period: .. ipython:: python - ps.to_timestamp('D', how='s') + ps.to_timestamp("D", how="s") Converting between period and timestamp enables some convenient arithmetic functions to be used. 
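
 .. ipython:: python

     # a minimal round-trip sketch (``monthly`` is an illustrative name):
     # timestamps -> periods -> timestamps anchored at the period end
     monthly = pd.Series(range(3), index=pd.date_range("2020-01-31", periods=3, freq="M"))
     monthly.to_period("M").to_timestamp(how="end")
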
In the following example, we convert a quarterly @@ -2162,11 +2190,11 @@ the quarter end: .. ipython:: python - prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV') + prng = pd.period_range("1990Q1", "2000Q4", freq="Q-NOV") ts = pd.Series(np.random.randn(len(prng)), prng) - ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9 + ts.index = (prng.asfreq("M", "e") + 1).asfreq("H", "s") + 9 ts.head() @@ -2180,7 +2208,7 @@ then you can use a ``PeriodIndex`` and/or ``Series`` of ``Periods`` to do comput .. ipython:: python - span = pd.period_range('1215-01-01', '1381-01-01', freq='D') + span = pd.period_range("1215-01-01", "1381-01-01", freq="D") span To convert from an ``int64`` based YYYYMMDD representation. @@ -2190,9 +2218,10 @@ To convert from an ``int64`` based YYYYMMDD representation. s = pd.Series([20121231, 20141130, 99991231]) s + def conv(x): - return pd.Period(year=x // 10000, month=x // 100 % 100, - day=x % 100, freq='D') + return pd.Period(year=x // 10000, month=x // 100 % 100, day=x % 100, freq="D") + s.apply(conv) s.apply(conv)[2] @@ -2221,7 +2250,7 @@ By default, pandas objects are time zone unaware: .. ipython:: python - rng = pd.date_range('3/6/2012 00:00', periods=15, freq='D') + rng = pd.date_range("3/6/2012 00:00", periods=15, freq="D") rng.tz is None To localize these dates to a time zone (assign a particular time zone to a naive date), @@ -2241,18 +2270,16 @@ To return ``dateutil`` time zone objects, append ``dateutil/`` before the string import dateutil # pytz - rng_pytz = pd.date_range('3/6/2012 00:00', periods=3, freq='D', - tz='Europe/London') + rng_pytz = pd.date_range("3/6/2012 00:00", periods=3, freq="D", tz="Europe/London") rng_pytz.tz # dateutil - rng_dateutil = pd.date_range('3/6/2012 00:00', periods=3, freq='D') - rng_dateutil = rng_dateutil.tz_localize('dateutil/Europe/London') + rng_dateutil = pd.date_range("3/6/2012 00:00", periods=3, freq="D") + rng_dateutil = rng_dateutil.tz_localize("dateutil/Europe/London") rng_dateutil.tz # dateutil - utc special case - rng_utc = pd.date_range('3/6/2012 00:00', periods=3, freq='D', - tz=dateutil.tz.tzutc()) + rng_utc = pd.date_range("3/6/2012 00:00", periods=3, freq="D", tz=dateutil.tz.tzutc()) rng_utc.tz .. versionadded:: 0.25.0 @@ -2260,8 +2287,7 @@ To return ``dateutil`` time zone objects, append ``dateutil/`` before the string .. ipython:: python # datetime.timezone - rng_utc = pd.date_range('3/6/2012 00:00', periods=3, freq='D', - tz=datetime.timezone.utc) + rng_utc = pd.date_range("3/6/2012 00:00", periods=3, freq="D", tz=datetime.timezone.utc) rng_utc.tz Note that the ``UTC`` time zone is a special case in ``dateutil`` and should be constructed explicitly @@ -2273,15 +2299,14 @@ zones objects explicitly first. import pytz # pytz - tz_pytz = pytz.timezone('Europe/London') - rng_pytz = pd.date_range('3/6/2012 00:00', periods=3, freq='D') + tz_pytz = pytz.timezone("Europe/London") + rng_pytz = pd.date_range("3/6/2012 00:00", periods=3, freq="D") rng_pytz = rng_pytz.tz_localize(tz_pytz) rng_pytz.tz == tz_pytz # dateutil - tz_dateutil = dateutil.tz.gettz('Europe/London') - rng_dateutil = pd.date_range('3/6/2012 00:00', periods=3, freq='D', - tz=tz_dateutil) + tz_dateutil = dateutil.tz.gettz("Europe/London") + rng_dateutil = pd.date_range("3/6/2012 00:00", periods=3, freq="D", tz=tz_dateutil) rng_dateutil.tz == tz_dateutil To convert a time zone aware pandas object from one time zone to another, @@ -2289,7 +2314,7 @@ you can use the ``tz_convert`` method. .. 
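ipython:: python

     # a quick sketch: tz_localize attaches a zone to a naive stamp, after
     # which tz_convert translates that same instant into another zone
     stamp = pd.Timestamp("2020-03-01 12:00").tz_localize("UTC")
     stamp.tz_convert("US/Eastern")

.. 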
ipython:: python - rng_pytz.tz_convert('US/Eastern') + rng_pytz.tz_convert("US/Eastern") .. note:: @@ -2301,9 +2326,9 @@ you can use the ``tz_convert`` method. .. ipython:: python - dti = pd.date_range('2019-01-01', periods=3, freq='D', tz='US/Pacific') + dti = pd.date_range("2019-01-01", periods=3, freq="D", tz="US/Pacific") dti.tz - ts = pd.Timestamp('2019-01-01', tz='US/Pacific') + ts = pd.Timestamp("2019-01-01", tz="US/Pacific") ts.tz .. warning:: @@ -2344,11 +2369,11 @@ you can use the ``tz_convert`` method. .. ipython:: python - d_2037 = '2037-03-31T010101' - d_2038 = '2038-03-31T010101' - DST = 'Europe/London' - assert pd.Timestamp(d_2037, tz=DST) != pd.Timestamp(d_2037, tz='GMT') - assert pd.Timestamp(d_2038, tz=DST) == pd.Timestamp(d_2038, tz='GMT') + d_2037 = "2037-03-31T010101" + d_2038 = "2038-03-31T010101" + DST = "Europe/London" + assert pd.Timestamp(d_2037, tz=DST) != pd.Timestamp(d_2037, tz="GMT") + assert pd.Timestamp(d_2038, tz=DST) == pd.Timestamp(d_2038, tz="GMT") Under the hood, all timestamps are stored in UTC. Values from a time zone aware :class:`DatetimeIndex` or :class:`Timestamp` will have their fields (day, hour, minute, etc.) @@ -2357,8 +2382,8 @@ still considered to be equal even if they are in different time zones: .. ipython:: python - rng_eastern = rng_utc.tz_convert('US/Eastern') - rng_berlin = rng_utc.tz_convert('Europe/Berlin') + rng_eastern = rng_utc.tz_convert("US/Eastern") + rng_berlin = rng_utc.tz_convert("Europe/Berlin") rng_eastern[2] rng_berlin[2] @@ -2369,9 +2394,9 @@ Operations between :class:`Series` in different time zones will yield UTC .. ipython:: python - ts_utc = pd.Series(range(3), pd.date_range('20130101', periods=3, tz='UTC')) - eastern = ts_utc.tz_convert('US/Eastern') - berlin = ts_utc.tz_convert('Europe/Berlin') + ts_utc = pd.Series(range(3), pd.date_range("20130101", periods=3, tz="UTC")) + eastern = ts_utc.tz_convert("US/Eastern") + berlin = ts_utc.tz_convert("Europe/Berlin") result = eastern + berlin result result.index @@ -2382,14 +2407,13 @@ To remove time zone information, use ``tz_localize(None)`` or ``tz_convert(None) .. ipython:: python - didx = pd.date_range(start='2014-08-01 09:00', freq='H', - periods=3, tz='US/Eastern') + didx = pd.date_range(start="2014-08-01 09:00", freq="H", periods=3, tz="US/Eastern") didx didx.tz_localize(None) didx.tz_convert(None) # tz_convert(None) is identical to tz_convert('UTC').tz_localize(None) - didx.tz_convert('UTC').tz_localize(None) + didx.tz_convert("UTC").tz_localize(None) .. _timeseries.fold: @@ -2415,10 +2439,12 @@ control over how they are handled. .. ipython:: python - pd.Timestamp(datetime.datetime(2019, 10, 27, 1, 30, 0, 0), - tz='dateutil/Europe/London', fold=0) - pd.Timestamp(year=2019, month=10, day=27, hour=1, minute=30, - tz='dateutil/Europe/London', fold=1) + pd.Timestamp( + datetime.datetime(2019, 10, 27, 1, 30, 0, 0), tz="dateutil/Europe/London", fold=0 + ) + pd.Timestamp( + year=2019, month=10, day=27, hour=1, minute=30, tz="dateutil/Europe/London", fold=1 + ) .. _timeseries.timezone_ambiguous: @@ -2436,8 +2462,9 @@ twice within one day ("clocks fall back"). The following options are available: .. 
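ipython:: python

     # a minimal sketch: 01:30 on 2011-11-06 occurs twice in US/Eastern, so
     # the ambiguous flag must pick the DST (True) or non-DST (False) instant
     pd.Timestamp("2011-11-06 01:30").tz_localize("US/Eastern", ambiguous=True)
     pd.Timestamp("2011-11-06 01:30").tz_localize("US/Eastern", ambiguous=False)

.. 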
ipython:: python - rng_hourly = pd.DatetimeIndex(['11/06/2011 00:00', '11/06/2011 01:00', - '11/06/2011 01:00', '11/06/2011 02:00']) + rng_hourly = pd.DatetimeIndex( + ["11/06/2011 00:00", "11/06/2011 01:00", "11/06/2011 01:00", "11/06/2011 02:00"] + ) This will fail as there are ambiguous times (``'11/06/2011 01:00'``) @@ -2450,9 +2477,9 @@ Handle these ambiguous times by specifying the following. .. ipython:: python - rng_hourly.tz_localize('US/Eastern', ambiguous='infer') - rng_hourly.tz_localize('US/Eastern', ambiguous='NaT') - rng_hourly.tz_localize('US/Eastern', ambiguous=[True, True, False, False]) + rng_hourly.tz_localize("US/Eastern", ambiguous="infer") + rng_hourly.tz_localize("US/Eastern", ambiguous="NaT") + rng_hourly.tz_localize("US/Eastern", ambiguous=[True, True, False, False]) .. _timeseries.timezone_nonexistent: @@ -2471,7 +2498,7 @@ can be controlled by the ``nonexistent`` argument. The following options are ava .. ipython:: python - dti = pd.date_range(start='2015-03-29 02:30:00', periods=3, freq='H') + dti = pd.date_range(start="2015-03-29 02:30:00", periods=3, freq="H") # 2:30 is a nonexistent time Localization of nonexistent times will raise an error by default. @@ -2486,10 +2513,10 @@ Transform nonexistent times to ``NaT`` or shift the times. .. ipython:: python dti - dti.tz_localize('Europe/Warsaw', nonexistent='shift_forward') - dti.tz_localize('Europe/Warsaw', nonexistent='shift_backward') - dti.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta(1, unit='H')) - dti.tz_localize('Europe/Warsaw', nonexistent='NaT') + dti.tz_localize("Europe/Warsaw", nonexistent="shift_forward") + dti.tz_localize("Europe/Warsaw", nonexistent="shift_backward") + dti.tz_localize("Europe/Warsaw", nonexistent=pd.Timedelta(1, unit="H")) + dti.tz_localize("Europe/Warsaw", nonexistent="NaT") .. _timeseries.timezone_series: @@ -2502,7 +2529,7 @@ represented with a dtype of ``datetime64[ns]``. .. ipython:: python - s_naive = pd.Series(pd.date_range('20130101', periods=3)) + s_naive = pd.Series(pd.date_range("20130101", periods=3)) s_naive A :class:`Series` with a time zone **aware** values is @@ -2510,7 +2537,7 @@ represented with a dtype of ``datetime64[ns, tz]`` where ``tz`` is the time zone .. ipython:: python - s_aware = pd.Series(pd.date_range('20130101', periods=3, tz='US/Eastern')) + s_aware = pd.Series(pd.date_range("20130101", periods=3, tz="US/Eastern")) s_aware Both of these :class:`Series` time zone information @@ -2520,7 +2547,7 @@ For example, to localize and convert a naive stamp to time zone aware. .. ipython:: python - s_naive.dt.tz_localize('UTC').dt.tz_convert('US/Eastern') + s_naive.dt.tz_localize("UTC").dt.tz_convert("US/Eastern") Time zone information can also be manipulated using the ``astype`` method. This method can localize and convert time zone naive timestamps or @@ -2529,13 +2556,13 @@ convert time zone aware timestamps. .. ipython:: python # localize and convert a naive time zone - s_naive.astype('datetime64[ns, US/Eastern]') + s_naive.astype("datetime64[ns, US/Eastern]") # make an aware tz naive - s_aware.astype('datetime64[ns]') + s_aware.astype("datetime64[ns]") # convert to a new time zone - s_aware.astype('datetime64[ns, CET]') + s_aware.astype("datetime64[ns, CET]") .. note:: @@ -2561,4 +2588,4 @@ convert time zone aware timestamps. .. 
ipython:: python

-    s_aware.to_numpy(dtype='datetime64[ns]')
+    s_aware.to_numpy(dtype="datetime64[ns]")

From de32fc72ce5afac1db6ee50764ae1243e40b6318 Mon Sep 17 00:00:00 2001
From: Prayag Savsani
Date: Sat, 3 Oct 2020 20:05:02 +0530
Subject: [PATCH 06/38] DOC: use black to fix code style in doc
 pandas-dev#36777 (#36824)

---
 doc/source/development/extending.rst |  7 +++---
 doc/source/user_guide/duplicates.rst | 35 ++++++++++++----------------
 doc/source/user_guide/gotchas.rst    | 28 +++++++++++++---------
 3 files changed, 36 insertions(+), 34 deletions(-)

diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index 46960140d3a8c..77fe930cf21e3 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -50,8 +50,9 @@ decorate a class, providing the name of attribute to add. The class's

 Now users can access your methods using the ``geo`` namespace:

-    >>> ds = pd.DataFrame({'longitude': np.linspace(0, 10),
-    ...                    'latitude': np.linspace(0, 20)})
+    >>> ds = pd.DataFrame(
+    ...     {"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)}
+    ... )
     >>> ds.geo.center
     (5.0, 10.0)
     >>> ds.geo.plot()
@@ -499,4 +500,4 @@ registers the default "matplotlib" backend as follows.

 More information on how to implement a third-party plotting backend can be found at
-https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1.
\ No newline at end of file
+https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1.
diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst
index b65822fab2b23..2993ca7799510 100644
--- a/doc/source/user_guide/duplicates.rst
+++ b/doc/source/user_guide/duplicates.rst
@@ -29,8 +29,8 @@ duplicates present. The output can't be determined, and so pandas raises.
 .. ipython:: python
    :okexcept:

-    s1 = pd.Series([0, 1, 2], index=['a', 'b', 'b'])
-    s1.reindex(['a', 'b', 'c'])
+    s1 = pd.Series([0, 1, 2], index=["a", "b", "b"])
+    s1.reindex(["a", "b", "c"])

 Other methods, like indexing, can give very surprising results. Typically
 indexing with a scalar will *reduce dimensionality*. Slicing a ``DataFrame``
@@ -39,30 +39,30 @@ return a scalar. But with duplicates, this isn't the case.

 .. ipython:: python

-    df1 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'A', 'B'])
+    df1 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=["A", "A", "B"])
     df1

 We have duplicates in the columns. If we slice ``'B'``, we get back a ``Series``

 .. ipython:: python

-    df1['B']  # a series
+    df1["B"]  # a series

 But slicing ``'A'`` returns a ``DataFrame``

 .. ipython:: python

-    df1['A']  # a DataFrame
+    df1["A"]  # a DataFrame

 This applies to row labels as well

 .. ipython:: python

-    df2 = pd.DataFrame({"A": [0, 1, 2]}, index=['a', 'a', 'b'])
+    df2 = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "a", "b"])
     df2
-    df2.loc['b', 'A']  # a scalar
-    df2.loc['a', 'A']  # a Series
+    df2.loc["b", "A"]  # a scalar
+    df2.loc["a", "A"]  # a Series

 Duplicate Label Detection
 ~~~~~~~~~~~~~~~~~~~~~~~~~

will be raised.

 .. ipython:: python
    :okexcept:

-    pd.Series(
-        [0, 1, 2],
-        index=['a', 'b', 'b']
-    ).set_flags(allows_duplicate_labels=False)
+    pd.Series([0, 1, 2], index=["a", "b", "b"]).set_flags(allows_duplicate_labels=False)

 This applies to both row and column labels for a :class:`DataFrame`

 .. 
ipython:: python :okexcept: - pd.DataFrame( - [[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"], - ).set_flags(allows_duplicate_labels=False) + pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"],).set_flags( + allows_duplicate_labels=False + ) This attribute can be checked or set with :attr:`~DataFrame.flags.allows_duplicate_labels`, which indicates whether that object can have duplicate labels. .. ipython:: python - df = ( - pd.DataFrame({"A": [0, 1, 2, 3]}, - index=['x', 'y', 'X', 'Y']) - .set_flags(allows_duplicate_labels=False) + df = pd.DataFrame({"A": [0, 1, 2, 3]}, index=["x", "y", "X", "Y"]).set_flags( + allows_duplicate_labels=False ) df df.flags.allows_duplicate_labels @@ -198,7 +193,7 @@ operations. .. ipython:: python :okexcept: - s1 = pd.Series(0, index=['a', 'b']).set_flags(allows_duplicate_labels=False) + s1 = pd.Series(0, index=["a", "b"]).set_flags(allows_duplicate_labels=False) s1 s1.head().rename({"a": "b"}) diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index a96c70405d859..07c856c96426d 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -21,12 +21,19 @@ when calling :meth:`~DataFrame.info`: .. ipython:: python - dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', - 'complex128', 'object', 'bool'] + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] n = 5000 data = {t: np.random.randint(100, size=n).astype(t) for t in dtypes} df = pd.DataFrame(data) - df['categorical'] = df['object'].astype('category') + df["categorical"] = df["object"].astype("category") df.info() @@ -40,7 +47,7 @@ as it can be expensive to do this deeper introspection. .. ipython:: python - df.info(memory_usage='deep') + df.info(memory_usage="deep") By default the display option is set to ``True`` but can be explicitly overridden by passing the ``memory_usage`` argument when invoking ``df.info()``. @@ -155,7 +162,7 @@ index, not membership among the values. .. ipython:: python - s = pd.Series(range(5), index=list('abcde')) + s = pd.Series(range(5), index=list("abcde")) 2 in s 'b' in s @@ -206,11 +213,11 @@ arrays. For example: .. ipython:: python - s = pd.Series([1, 2, 3, 4, 5], index=list('abcde')) + s = pd.Series([1, 2, 3, 4, 5], index=list("abcde")) s s.dtype - s2 = s.reindex(['a', 'b', 'c', 'f', 'u']) + s2 = s.reindex(["a", "b", "c", "f", "u"]) s2 s2.dtype @@ -227,12 +234,11 @@ the nullable-integer extension dtypes provided by pandas .. ipython:: python - s_int = pd.Series([1, 2, 3, 4, 5], index=list('abcde'), - dtype=pd.Int64Dtype()) + s_int = pd.Series([1, 2, 3, 4, 5], index=list("abcde"), dtype=pd.Int64Dtype()) s_int s_int.dtype - s2_int = s_int.reindex(['a', 'b', 'c', 'f', 'u']) + s2_int = s_int.reindex(["a", "b", "c", "f", "u"]) s2_int s2_int.dtype @@ -334,7 +340,7 @@ constructors using something similar to the following: .. 
ipython:: python - x = np.array(list(range(10)), '>i4') # big endian + x = np.array(list(range(10)), ">i4") # big endian newx = x.byteswap().newbyteorder() # force native byteorder s = pd.Series(newx) From f41ce46a5e2dc1feb860b07c03cab0bfaedb674b Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Sat, 3 Oct 2020 16:58:46 +0200 Subject: [PATCH 07/38] CI: silence codecov for unrelated lines (#36600) --- codecov.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/codecov.yml b/codecov.yml index 1644bf315e0ac..6dd1e33a7a671 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,7 +1,7 @@ codecov: branch: master -comment: off +comment: false coverage: status: @@ -11,3 +11,6 @@ coverage: patch: default: target: '50' + +github_checks: + annotations: false From c5c0aadfca67731ad6a6e9d170bac29661eb2fed Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Sat, 3 Oct 2020 20:09:21 +0200 Subject: [PATCH 08/38] DOC: reformat doc groupby.rst (#36831) --- doc/source/user_guide/groupby.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 9696f14f03b56..ec64442319a84 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -677,7 +677,7 @@ and unpack the keyword arguments animals.groupby("kind").agg( **{ - "total weight": pd.NamedAgg(column="weight", aggfunc=sum), + "total weight": pd.NamedAgg(column="weight", aggfunc=sum) } ) From 007138950089b11e6723b2a53a6a08ac7d9cab7f Mon Sep 17 00:00:00 2001 From: Alex Thorne Date: Sat, 3 Oct 2020 19:17:07 +0100 Subject: [PATCH 09/38] TST: GH23452 test reorder_categories() on categorical index (#36648) --- pandas/tests/indexing/test_categorical.py | 28 +++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 9f3ee81fac2eb..fae229aecc3d4 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -807,3 +807,31 @@ def test_loc_with_non_string_categories(self, idx_values, ordered): result.loc[sl, "A"] = ["qux", "qux2"] expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "categories", + [ + pytest.param(["a", "b", "c"], id="str"), + pytest.param( + [pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(2, 3)], + id="pd.Interval", + ), + ], + ) + def test_reorder_index_with_categories(self, categories): + # GH23452 + df = DataFrame( + {"foo": range(len(categories))}, + index=CategoricalIndex( + data=categories, categories=categories, ordered=True + ), + ) + df.index = df.index.reorder_categories(df.index.categories[::-1]) + result = df.sort_index() + expected = DataFrame( + {"foo": reversed(range(len(categories)))}, + index=CategoricalIndex( + data=categories[::-1], categories=categories[::-1], ordered=True + ), + ) + tm.assert_frame_equal(result, expected) From b9be7ba965246e6553f2c9f1fc2c6f6d1a08f636 Mon Sep 17 00:00:00 2001 From: lrjball <50599110+lrjball@users.noreply.github.com> Date: Sun, 4 Oct 2020 04:27:50 +0100 Subject: [PATCH 10/38] Typo fix (#36844) Noticed a minor typo when using the docs --- doc/source/user_guide/style.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 77a1fef28f373..12dd72f761408 
100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -793,7 +793,7 @@ "source": [ "The next option you have are \"table styles\".\n", "These are styles that apply to the table as a whole, but don't look at the data.\n", - "Certain sytlings, including pseudo-selectors like `:hover` can only be used this way." + "Certain stylings, including pseudo-selectors like `:hover` can only be used this way." ] }, { From b55d51ae667460b6bc92b2695037a8d4964cfb6d Mon Sep 17 00:00:00 2001 From: Maxim Ivanov <41443370+ivanovmg@users.noreply.github.com> Date: Sun, 4 Oct 2020 11:39:42 +0700 Subject: [PATCH 11/38] CLN: private funcs in concat.py (#36726) * REF: extract func _select_upcast_cls_from_dtype * REF: extract function _get_upcast_classes * CLN: rename g -> common_dtype * TYP: type extracted functions * DOC: add docstrings to extracted methods * TYP: cast instead of ignoring mypy error * CLN: import SparseDtype only for type checking --- pandas/core/internals/concat.py | 107 ++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 45 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index f5d0c921e1006..7ad058cfeb83c 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,6 +1,6 @@ from collections import defaultdict import copy -from typing import Dict, List +from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, cast import numpy as np @@ -28,6 +28,9 @@ from pandas.core.internals.blocks import make_block from pandas.core.internals.managers import BlockManager +if TYPE_CHECKING: + from pandas.core.arrays.sparse.dtype import SparseDtype + def concatenate_block_managers( mgrs_indexers, axes, concat_axis: int, copy: bool @@ -344,7 +347,7 @@ def _concatenate_join_units(join_units, concat_axis, copy): return concat_values -def _get_empty_dtype_and_na(join_units): +def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, Any]: """ Return dtype and N/A values to use when concatenating specified units. @@ -374,45 +377,8 @@ def _get_empty_dtype_and_na(join_units): else: dtypes[i] = unit.dtype - upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) - null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) - for dtype, unit in zip(dtypes, join_units): - if dtype is None: - continue - - if is_categorical_dtype(dtype): - upcast_cls = "category" - elif is_datetime64tz_dtype(dtype): - upcast_cls = "datetimetz" - - elif is_extension_array_dtype(dtype): - upcast_cls = "extension" - - elif issubclass(dtype.type, np.bool_): - upcast_cls = "bool" - elif issubclass(dtype.type, np.object_): - upcast_cls = "object" - elif is_datetime64_dtype(dtype): - upcast_cls = "datetime" - elif is_timedelta64_dtype(dtype): - upcast_cls = "timedelta" - elif is_sparse(dtype): - upcast_cls = dtype.subtype.name - elif is_float_dtype(dtype) or is_numeric_dtype(dtype): - upcast_cls = dtype.name - else: - upcast_cls = "float" + upcast_classes = _get_upcast_classes(join_units, dtypes) - # Null blocks should not influence upcast class selection, unless there - # are only null blocks, when same upcasting rules must be applied to - # null upcast classes. - if unit.is_na: - null_upcast_classes[upcast_cls].append(dtype) - else: - upcast_classes[upcast_cls].append(dtype) - - if not upcast_classes: - upcast_classes = null_upcast_classes # TODO: de-duplicate with maybe_promote? 
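     # ``upcast_classes`` maps coarse dtype-category names (e.g. "bool",
     # "datetimetz", "object") to the concrete dtypes seen across the join
     # units; the branches below pick the result dtype and NA value per class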
# create the result if "extension" in upcast_classes: @@ -441,23 +407,74 @@ def _get_empty_dtype_and_na(join_units): return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns") else: # pragma try: - g = np.find_common_type(upcast_classes, []) + common_dtype = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray return np.dtype(np.object_), np.nan else: - if is_float_dtype(g): - return g, g.type(np.nan) - elif is_numeric_dtype(g): + if is_float_dtype(common_dtype): + return common_dtype, common_dtype.type(np.nan) + elif is_numeric_dtype(common_dtype): if has_none_blocks: return np.dtype(np.float64), np.nan else: - return g, None + return common_dtype, None msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg) +def _get_upcast_classes( + join_units: Sequence[JoinUnit], + dtypes: Sequence[DtypeObj], +) -> Dict[str, List[DtypeObj]]: + """Create mapping between upcast class names and lists of dtypes.""" + upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) + null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) + for dtype, unit in zip(dtypes, join_units): + if dtype is None: + continue + + upcast_cls = _select_upcast_cls_from_dtype(dtype) + # Null blocks should not influence upcast class selection, unless there + # are only null blocks, when same upcasting rules must be applied to + # null upcast classes. + if unit.is_na: + null_upcast_classes[upcast_cls].append(dtype) + else: + upcast_classes[upcast_cls].append(dtype) + + if not upcast_classes: + upcast_classes = null_upcast_classes + + return upcast_classes + + +def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str: + """Select upcast class name based on dtype.""" + if is_categorical_dtype(dtype): + return "category" + elif is_datetime64tz_dtype(dtype): + return "datetimetz" + elif is_extension_array_dtype(dtype): + return "extension" + elif issubclass(dtype.type, np.bool_): + return "bool" + elif issubclass(dtype.type, np.object_): + return "object" + elif is_datetime64_dtype(dtype): + return "datetime" + elif is_timedelta64_dtype(dtype): + return "timedelta" + elif is_sparse(dtype): + dtype = cast("SparseDtype", dtype) + return dtype.subtype.name + elif is_float_dtype(dtype) or is_numeric_dtype(dtype): + return dtype.name + else: + return "float" + + def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool: """ Check if the join units consist of blocks of uniform type that can From 74b7b419aa057dde650aa506c4e06d643e8c7f94 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 3 Oct 2020 21:47:46 -0700 Subject: [PATCH 12/38] ENH: match stdlib behavior for datetimelike comparisons (#36647) * ENH: match stdlib behavior for datetimelike comparisons * update test Co-authored-by: Jeff Reback --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_libs/lib.pyx | 7 -- pandas/_libs/tslibs/timestamps.pxd | 2 +- pandas/_libs/tslibs/timestamps.pyx | 26 ++++-- pandas/core/arrays/datetimelike.py | 13 ++- pandas/tests/arithmetic/test_datetime64.py | 83 ++++++++++++------- pandas/tests/reductions/test_reductions.py | 10 +-- .../scalar/timestamp/test_comparisons.py | 27 +++--- pandas/tests/series/indexing/test_datetime.py | 2 +- 9 files changed, 104 insertions(+), 67 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index aad21dfb19a74..ff16efc17617f 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -308,6 +308,7 @@ Datetimelike - Bug in :meth:`DatetimeIndex.searchsorted`, 
From 74b7b419aa057dde650aa506c4e06d643e8c7f94 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Sat, 3 Oct 2020 21:47:46 -0700
Subject: [PATCH 12/38] ENH: match stdlib behavior for datetimelike
 comparisons (#36647)

* ENH: match stdlib behavior for datetimelike comparisons

* update test

Co-authored-by: Jeff Reback
---
 doc/source/whatsnew/v1.2.0.rst                |  1 +
 pandas/_libs/lib.pyx                          |  7 --
 pandas/_libs/tslibs/timestamps.pxd            |  2 +-
 pandas/_libs/tslibs/timestamps.pyx            | 26 ++++--
 pandas/core/arrays/datetimelike.py            | 13 ++-
 pandas/tests/arithmetic/test_datetime64.py    | 83 ++++++++++++-------
 pandas/tests/reductions/test_reductions.py    | 10 +--
 .../scalar/timestamp/test_comparisons.py      | 27 +++---
 pandas/tests/series/indexing/test_datetime.py |  2 +-
 9 files changed, 104 insertions(+), 67 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index aad21dfb19a74..ff16efc17617f 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -308,6 +308,7 @@ Datetimelike
 - Bug in :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64``, ``timedelta64`` or ``Period`` dtype placement of ``NaT`` values being inconsistent with ``NumPy`` (:issue:`36176`, :issue:`36254`)
 - Inconsistency in :class:`DatetimeArray`, :class:`TimedeltaArray`, and :class:`PeriodArray` setitem casting arrays of strings to datetimelike scalars but not scalar strings (:issue:`36261`)
 - Bug in :class:`DatetimeIndex.shift` incorrectly raising when shifting empty indexes (:issue:`14811`)
+- :class:`Timestamp` and :class:`DatetimeIndex` comparisons between timezone-aware and timezone-naive objects now follow the standard library ``datetime`` behavior, returning ``True``/``False`` for ``!=``/``==`` and raising for inequality comparisons (:issue:`28507`)
 - Bug in :meth:`DatetimeIndex.equals` and :meth:`TimedeltaIndex.equals` incorrectly considering ``int64`` indexes as equal (:issue:`36744`)
 
 Timedelta

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 61a9634b00211..922dcd7e74aa0 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -584,13 +584,6 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool:
             elif not (PyObject_RichCompareBool(x, y, Py_EQ) or
                       (x is None or is_nan(x)) and (y is None or is_nan(y))):
                 return False
-        except TypeError as err:
-            # Avoid raising TypeError on tzawareness mismatch
-            # TODO: This try/except can be removed if/when Timestamp
-            # comparisons are changed to match datetime, see GH#28507
-            if "tz-naive and tz-aware" in str(err):
-                return False
-            raise
         except ValueError:
             # Avoid raising ValueError when comparing Numpy arrays to other types
             if cnp.PyArray_IsAnyScalar(x) != cnp.PyArray_IsAnyScalar(y):

diff --git a/pandas/_libs/tslibs/timestamps.pxd b/pandas/_libs/tslibs/timestamps.pxd
index 307b6dfc90715..6fb7b1ea8f520 100644
--- a/pandas/_libs/tslibs/timestamps.pxd
+++ b/pandas/_libs/tslibs/timestamps.pxd
@@ -19,8 +19,8 @@ cdef class _Timestamp(ABCTimestamp):
     cdef bint _get_start_end_field(self, str field)
     cdef _get_date_name_field(self, str field, object locale)
     cdef int64_t _maybe_convert_value_to_local(self)
+    cdef bint _can_compare(self, datetime other)
     cpdef to_datetime64(self)
-    cdef _assert_tzawareness_compat(_Timestamp self, datetime other)
     cpdef datetime to_pydatetime(_Timestamp self, bint warn=*)
     cdef bint _compare_outside_nanorange(_Timestamp self, datetime other,
                                          int op) except -1

diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index 78f7b2150f720..a8f6c60bcb300 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -260,6 +260,10 @@ cdef class _Timestamp(ABCTimestamp):
             if other.dtype.kind == "M":
                 if self.tz is None:
                     return PyObject_RichCompare(self.asm8, other, op)
+                elif op == Py_NE:
+                    return np.ones(other.shape, dtype=np.bool_)
+                elif op == Py_EQ:
+                    return np.zeros(other.shape, dtype=np.bool_)
                 raise TypeError(
                     "Cannot compare tz-naive and tz-aware timestamps"
                 )
@@ -278,7 +282,12 @@ cdef class _Timestamp(ABCTimestamp):
             else:
                 return NotImplemented
 
-        self._assert_tzawareness_compat(ots)
+        if not self._can_compare(ots):
+            if op == Py_NE or op == Py_EQ:
+                return NotImplemented
+            raise TypeError(
+                "Cannot compare tz-naive and tz-aware timestamps"
+            )
         return cmp_scalar(self.value, ots.value, op)
 
     cdef bint _compare_outside_nanorange(_Timestamp self, datetime other,
@@ -286,16 +295,15 @@ cdef class _Timestamp(ABCTimestamp):
         cdef:
            datetime dtval = self.to_pydatetime()
 
-        self._assert_tzawareness_compat(other)
+        if not self._can_compare(other):
+            return NotImplemented
+
         return PyObject_RichCompareBool(dtval, other, op)
 
-    cdef _assert_tzawareness_compat(_Timestamp self, datetime other):
-        if self.tzinfo is None:
-            if other.tzinfo is not None:
-                raise TypeError('Cannot compare tz-naive and tz-aware '
-                                'timestamps')
-        elif other.tzinfo is None:
-            raise TypeError('Cannot compare tz-naive and tz-aware timestamps')
+    cdef bint _can_compare(self, datetime other):
+        if self.tzinfo is not None:
+            return other.tzinfo is not None
+        return other.tzinfo is None
 
     def __add__(self, other):
         cdef:

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 89e0f76b14875..36fd487e92327 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -672,7 +672,11 @@ def _validate_comparison_value(self, other, opname: str):
         if isinstance(other, self._recognized_scalars) or other is NaT:
             other = self._scalar_type(other)  # type: ignore[call-arg]
-            self._check_compatible_with(other)
+            try:
+                self._check_compatible_with(other)
+            except TypeError as err:
+                # e.g. tzawareness mismatch
+                raise InvalidComparison(other) from err
 
         elif not is_list_like(other):
             raise InvalidComparison(other)
@@ -683,8 +687,13 @@ def _validate_comparison_value(self, other, opname: str):
         else:
             try:
                 other = self._validate_listlike(other, opname, allow_object=True)
+                self._check_compatible_with(other)
             except TypeError as err:
-                raise InvalidComparison(other) from err
+                if is_object_dtype(getattr(other, "dtype", None)):
+                    # We will have to operate element-wise
+                    pass
+                else:
+                    raise InvalidComparison(other) from err
 
         return other

diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py
index 46be296759088..e9dc83d106651 100644
--- a/pandas/tests/arithmetic/test_datetime64.py
+++ b/pandas/tests/arithmetic/test_datetime64.py
@@ -558,26 +558,30 @@ def test_comparison_tzawareness_compat(self, op, box_with_array):
         dr = tm.box_expected(dr, box)
         dz = tm.box_expected(dz, box)
 
-        msg = "Cannot compare tz-naive and tz-aware"
-        with pytest.raises(TypeError, match=msg):
-            op(dr, dz)
-
         if box is pd.DataFrame:
             tolist = lambda x: x.astype(object).values.tolist()[0]
         else:
             tolist = list
 
-        with pytest.raises(TypeError, match=msg):
-            op(dr, tolist(dz))
-        with pytest.raises(TypeError, match=msg):
-            op(dr, np.array(tolist(dz), dtype=object))
-        with pytest.raises(TypeError, match=msg):
-            op(dz, dr)
+        if op not in [operator.eq, operator.ne]:
+            msg = (
+                r"Invalid comparison between dtype=datetime64\[ns.*\] "
+                "and (Timestamp|DatetimeArray|list|ndarray)"
+            )
+            with pytest.raises(TypeError, match=msg):
+                op(dr, dz)
 
-        with pytest.raises(TypeError, match=msg):
-            op(dz, tolist(dr))
-        with pytest.raises(TypeError, match=msg):
-            op(dz, np.array(tolist(dr), dtype=object))
+            with pytest.raises(TypeError, match=msg):
+                op(dr, tolist(dz))
+            with pytest.raises(TypeError, match=msg):
+                op(dr, np.array(tolist(dz), dtype=object))
+            with pytest.raises(TypeError, match=msg):
+                op(dz, dr)
+
+            with pytest.raises(TypeError, match=msg):
+                op(dz, tolist(dr))
+            with pytest.raises(TypeError, match=msg):
+                op(dz, np.array(tolist(dr), dtype=object))
 
         # The aware==aware and naive==naive comparisons should *not* raise
         assert np.all(dr == dr)
@@ -609,17 +613,20 @@ def test_comparison_tzawareness_compat_scalars(self, op, box_with_array):
         ts_tz = pd.Timestamp("2000-03-14 01:59", tz="Europe/Amsterdam")
 
         assert np.all(dr > ts)
-        msg = "Cannot compare tz-naive and tz-aware"
-        with pytest.raises(TypeError, match=msg):
-            op(dr, ts_tz)
+        msg = r"Invalid comparison between dtype=datetime64\[ns.*\] and Timestamp"
+        if op not in [operator.eq, operator.ne]:
+            with pytest.raises(TypeError, match=msg):
+                op(dr, ts_tz)
 
         assert np.all(dz > ts_tz)
-        with pytest.raises(TypeError, match=msg):
-            op(dz, ts)
+        if op not in [operator.eq, operator.ne]:
+            with pytest.raises(TypeError, match=msg):
+                op(dz, ts)
 
-        # GH#12601: Check comparison against Timestamps and DatetimeIndex
-        with pytest.raises(TypeError, match=msg):
-            op(ts, dz)
+        if op not in [operator.eq, operator.ne]:
+            # GH#12601: Check comparison against Timestamps and DatetimeIndex
+            with pytest.raises(TypeError, match=msg):
+                op(ts, dz)
 
     @pytest.mark.parametrize(
         "op",
@@ -637,15 +644,31 @@ def test_comparison_tzawareness_compat_scalars(self, op, box_with_array):
     def test_scalar_comparison_tzawareness(
         self, op, other, tz_aware_fixture, box_with_array
     ):
+        box = box_with_array
         tz = tz_aware_fixture
         dti = pd.date_range("2016-01-01", periods=2, tz=tz)
+        xbox = box if box not in [pd.Index, pd.array] else np.ndarray
 
         dtarr = tm.box_expected(dti, box_with_array)
-        msg = "Cannot compare tz-naive and tz-aware"
-        with pytest.raises(TypeError, match=msg):
-            op(dtarr, other)
-        with pytest.raises(TypeError, match=msg):
-            op(other, dtarr)
+        if op in [operator.eq, operator.ne]:
+            exbool = op is operator.ne
+            expected = np.array([exbool, exbool], dtype=bool)
+            expected = tm.box_expected(expected, xbox)
+
+            result = op(dtarr, other)
+            tm.assert_equal(result, expected)
+
+            result = op(other, dtarr)
+            tm.assert_equal(result, expected)
+        else:
+            msg = (
+                r"Invalid comparison between dtype=datetime64\[ns, .*\] "
+                f"and {type(other).__name__}"
+            )
+            with pytest.raises(TypeError, match=msg):
+                op(dtarr, other)
+            with pytest.raises(TypeError, match=msg):
+                op(other, dtarr)
 
     @pytest.mark.parametrize(
         "op",
@@ -745,10 +768,8 @@ def test_dti_cmp_object_dtype(self):
         tm.assert_numpy_array_equal(result, expected)
 
         other = dti.tz_localize(None)
-        msg = "Cannot compare tz-naive and tz-aware"
-        with pytest.raises(TypeError, match=msg):
-            # tzawareness failure
-            dti != other
+        result = dti != other
+        tm.assert_numpy_array_equal(result, expected)
 
         other = np.array(list(dti[:5]) + [Timedelta(days=1)] * 5)
         result = dti == other

diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index db7cd54d23a2b..fe97925c2bb74 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -56,13 +56,13 @@ def test_ops(self, opname, obj):
                 expected = getattr(obj.values, opname)()
         else:
             expected = pd.Period(ordinal=getattr(obj.asi8, opname)(), freq=obj.freq)
-        try:
-            assert result == expected
-        except TypeError:
-            # comparing tz-aware series with np.array results in
-            # TypeError
+
+        if getattr(obj, "tz", None) is not None:
+            # We need to de-localize before comparing to the numpy-produced result
             expected = expected.astype("M8[ns]").astype("int64")
             assert result.value == expected
+        else:
+            assert result == expected
 
     @pytest.mark.parametrize("opname", ["max", "min"])
     @pytest.mark.parametrize(

diff --git a/pandas/tests/scalar/timestamp/test_comparisons.py b/pandas/tests/scalar/timestamp/test_comparisons.py
index 71693a9ca61ce..3d1f71def5836 100644
--- a/pandas/tests/scalar/timestamp/test_comparisons.py
+++ b/pandas/tests/scalar/timestamp/test_comparisons.py
@@ -56,9 +56,18 @@ def test_comparison_dt64_ndarray_tzaware(self, reverse, all_compare_operators):
         if reverse:
             left, right = arr, ts
 
-        msg = "Cannot compare tz-naive and tz-aware timestamps"
-        with pytest.raises(TypeError, match=msg):
-            op(left, right)
+        if op is operator.eq:
+            expected = np.array([False, False], dtype=bool)
+            result = op(left, right)
+            tm.assert_numpy_array_equal(result, expected)
+        elif op is operator.ne:
+            expected = np.array([True, True], dtype=bool)
+            result = op(left, right)
+            tm.assert_numpy_array_equal(result, expected)
+        else:
+            msg = "Cannot compare tz-naive and tz-aware timestamps"
+            with pytest.raises(TypeError, match=msg):
+                op(left, right)
 
     def test_comparison_object_array(self):
         # GH#15183
@@ -139,10 +148,8 @@ def test_cant_compare_tz_naive_w_aware(self, utc_fixture):
         b = Timestamp("3/12/2012", tz=utc_fixture)
 
         msg = "Cannot compare tz-naive and tz-aware timestamps"
-        with pytest.raises(TypeError, match=msg):
-            a == b
-        with pytest.raises(TypeError, match=msg):
-            a != b
+        assert not a == b
+        assert a != b
         with pytest.raises(TypeError, match=msg):
             a < b
         with pytest.raises(TypeError, match=msg):
@@ -152,10 +159,8 @@ def test_cant_compare_tz_naive_w_aware(self, utc_fixture):
         with pytest.raises(TypeError, match=msg):
             a >= b
 
-        with pytest.raises(TypeError, match=msg):
-            b == a
-        with pytest.raises(TypeError, match=msg):
-            b != a
+        assert not b == a
+        assert b != a
         with pytest.raises(TypeError, match=msg):
             b < a
         with pytest.raises(TypeError, match=msg):

diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py
index b7fbed2b325b3..0389099a195d0 100644
--- a/pandas/tests/series/indexing/test_datetime.py
+++ b/pandas/tests/series/indexing/test_datetime.py
@@ -258,7 +258,7 @@ def test_getitem_setitem_datetimeindex():
     lb = datetime(1990, 1, 1, 4)
     rb = datetime(1990, 1, 1, 7)
-    msg = "Cannot compare tz-naive and tz-aware datetime-like objects"
+    msg = r"Invalid comparison between dtype=datetime64\[ns, US/Eastern\] and datetime"
     with pytest.raises(TypeError, match=msg):
         # tznaive vs tzaware comparison is invalid
         # see GH#18376, GH#18162
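The behavioral change in PATCH 12 is easiest to see with scalars. A minimal sketch, assuming a pandas build with this change applied (1.2.0 or later): equality between tz-aware and tz-naive timestamps now returns ``False``/``True`` the way ``datetime.datetime`` does, while ordering comparisons still raise ``TypeError``.

.. code-block:: python

    import datetime
    import pandas as pd

    naive = pd.Timestamp("2020-01-01")
    aware = pd.Timestamp("2020-01-01", tz="UTC")

    # Equality and inequality no longer raise:
    naive == aware  # False
    naive != aware  # True

    # Ordering comparisons still raise, as in the stdlib:
    try:
        naive < aware
    except TypeError as err:
        print(err)  # Cannot compare tz-naive and tz-aware timestamps

    # The stdlib datetime behaves the same way:
    datetime.datetime(2020, 1, 1) == datetime.datetime(
        2020, 1, 1, tzinfo=datetime.timezone.utc
    )  # False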
From 9d2399f83d5bc9ad1ce980efb5a12aa1b1d63a69 Mon Sep 17 00:00:00 2001
From: Meghana Varanasi
Date: Sun, 4 Oct 2020 17:15:42 +0530
Subject: [PATCH 13/38] doc/source/ecosystem.rst (#36856)

---
 doc/source/ecosystem.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index ed6ce7e9759b6..4086f64817568 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -170,7 +170,9 @@ invoked with the following command
 
 .. code:: python
 
-    import dtale; dtale.show(df)
+    import dtale
+
+    dtale.show(df)
 
 D-Tale integrates seamlessly with jupyter notebooks, python terminals, kaggle
 & Google Colab. Here are some demos of the `grid `__

From f6d3dfc8f65c089604bb99ba287e64f277beb8ee Mon Sep 17 00:00:00 2001
From: beanan
Date: Mon, 5 Oct 2020 01:39:23 +0800
Subject: [PATCH 14/38] DOC: black enhancingperf.rst and 10min.rst code style
 (#36849)

---
 doc/source/user_guide/10min.rst         | 101 ++++++++---------
 doc/source/user_guide/enhancingperf.rst | 141 ++++++++++++------------
 setup.cfg                               |   3 +-
 3 files changed, 124 insertions(+), 121 deletions(-)

diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst
index 673f8689736f1..8270b2ee49bd8 100644
--- a/doc/source/user_guide/10min.rst
+++ b/doc/source/user_guide/10min.rst
@@ -34,9 +34,9 @@ and labeled columns:
 
 .. ipython:: python
 
-    dates = pd.date_range('20130101', periods=6)
+    dates = pd.date_range("20130101", periods=6)
     dates
-    df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
+    df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
     df
 
 Creating a :class:`DataFrame` by passing a dict of objects that can be converted to series-like.
@@ -156,7 +156,7 @@ Sorting by values:
 
 .. ipython:: python
 
-    df.sort_values(by='B')
+    df.sort_values(by="B")
 
 Selection
 ---------
@@ -178,14 +178,14 @@ equivalent to ``df.A``:
 
 .. ipython:: python
 
-    df['A']
+    df["A"]
 
 Selecting via ``[]``, which slices the rows.
 
 .. ipython:: python
 
     df[0:3]
-    df['20130102':'20130104']
+    df["20130102":"20130104"]
 
 Selection by label
 ~~~~~~~~~~~~~~~~~~
@@ -202,31 +202,31 @@ Selecting on a multi-axis by label:
 
 .. ipython:: python
 
-    df.loc[:, ['A', 'B']]
+    df.loc[:, ["A", "B"]]
 
 Showing label slicing, both endpoints are *included*:
 
 .. ipython:: python
 
-    df.loc['20130102':'20130104', ['A', 'B']]
+    df.loc["20130102":"20130104", ["A", "B"]]
 
 Reduction in the dimensions of the returned object:
 
 .. ipython:: python
 
-    df.loc['20130102', ['A', 'B']]
+    df.loc["20130102", ["A", "B"]]
 
 For getting a scalar value:
 
 .. ipython:: python
 
-    df.loc[dates[0], 'A']
+    df.loc[dates[0], "A"]
 
 For getting fast access to a scalar (equivalent to the prior method):
 
 .. ipython:: python
 
-    df.at[dates[0], 'A']
+    df.at[dates[0], "A"]
 
 Selection by position
 ~~~~~~~~~~~~~~~~~~~~~
@@ -282,7 +282,7 @@ Using a single column's values to select data.
 
 .. ipython:: python
 
-    df[df['A'] > 0]
+    df[df["A"] > 0]
 
 Selecting values from a DataFrame where a boolean condition is met.
 
@@ -295,9 +295,9 @@ Using the :func:`~Series.isin` method for filtering:
 .. ipython:: python
 
     df2 = df.copy()
-    df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
+    df2["E"] = ["one", "one", "two", "three", "four", "three"]
     df2
-    df2[df2['E'].isin(['two', 'four'])]
+    df2[df2["E"].isin(["two", "four"])]
 
 Setting
 ~~~~~~~
@@ -307,15 +307,15 @@ by the indexes.
 
 .. ipython:: python
 
-    s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
+    s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6))
     s1
-    df['F'] = s1
+    df["F"] = s1
 
 Setting values by label:
 
 .. ipython:: python
 
-    df.at[dates[0], 'A'] = 0
+    df.at[dates[0], "A"] = 0
 
 Setting values by position:
 
@@ -327,7 +327,7 @@ Setting by assigning with a NumPy array:
 
 .. ipython:: python
 
-    df.loc[:, 'D'] = np.array([5] * len(df))
+    df.loc[:, "D"] = np.array([5] * len(df))
 
 The result of the prior setting operations.
 
@@ -356,15 +356,15 @@ returns a copy of the data.
 
-    df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
-    df1.loc[dates[0]:dates[1], 'E'] = 1
+    df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])
+    df1.loc[dates[0] : dates[1], "E"] = 1
     df1
 
 To drop any rows that have missing data.
 
 .. ipython:: python
 
-    df1.dropna(how='any')
+    df1.dropna(how="any")
 
 Filling missing data.
 
@@ -408,7 +408,7 @@ In addition, pandas automatically broadcasts along the specified dimension.
 
     s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
     s
-    df.sub(s, axis='index')
+    df.sub(s, axis="index")
 
 Apply
@@ -444,7 +444,7 @@ some cases always uses them). See more at :ref:`Vectorized String Methods
 
 .. ipython:: python
 
-    s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
+    s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])
     s.str.lower()
 
 Merge
@@ -486,21 +486,21 @@ SQL style merges. See the :ref:`Database style joining ` section.
 
 .. ipython:: python
 
-    left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
-    right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
+    left = pd.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})
+    right = pd.DataFrame({"key": ["foo", "foo"], "rval": [4, 5]})
     left
     right
-    pd.merge(left, right, on='key')
+    pd.merge(left, right, on="key")
 
 Another example that can be given is:
 
 .. ipython:: python
 
-    left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
-    right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
+    left = pd.DataFrame({"key": ["foo", "bar"], "lval": [1, 2]})
+    right = pd.DataFrame({"key": ["foo", "bar"], "rval": [4, 5]})
     left
     right
-    pd.merge(left, right, on='key')
+    pd.merge(left, right, on="key")
 
 Grouping
 --------
@@ -531,14 +531,14 @@ groups.
 
 .. ipython:: python
 
-    df.groupby('A').sum()
+    df.groupby("A").sum()
 
 Grouping by multiple columns forms a hierarchical index, and again we can
 apply the :meth:`~pandas.core.groupby.GroupBy.sum` function.
 
 .. ipython:: python
 
-    df.groupby(['A', 'B']).sum()
+    df.groupby(["A", "B"]).sum()
 
 Reshaping
 ---------
@@ -559,8 +559,8 @@ Stack
             ]
         )
     )
-    index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
-    df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
+    index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
+    df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"])
     df2 = df[:4]
     df2
 
@@ -603,7 +603,7 @@ We can produce pivot tables from this data very easily:
 
 .. ipython:: python
 
-    pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
+    pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"])
 
 Time series
 -----------
@@ -616,31 +616,31 @@ financial applications. See the :ref:`Time Series section `.
 
 .. ipython:: python
 
-    rng = pd.date_range('1/1/2012', periods=100, freq='S')
+    rng = pd.date_range("1/1/2012", periods=100, freq="S")
     ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
-    ts.resample('5Min').sum()
+    ts.resample("5Min").sum()
 
 Time zone representation:
 
 .. ipython:: python
 
-    rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
+    rng = pd.date_range("3/6/2012 00:00", periods=5, freq="D")
     ts = pd.Series(np.random.randn(len(rng)), rng)
     ts
-    ts_utc = ts.tz_localize('UTC')
+    ts_utc = ts.tz_localize("UTC")
     ts_utc
 
 Converting to another time zone:
 
 .. ipython:: python
 
-    ts_utc.tz_convert('US/Eastern')
+    ts_utc.tz_convert("US/Eastern")
 
 Converting between time span representations:
 
 .. ipython:: python
 
-    rng = pd.date_range('1/1/2012', periods=5, freq='M')
+    rng = pd.date_range("1/1/2012", periods=5, freq="M")
     ts = pd.Series(np.random.randn(len(rng)), index=rng)
     ts
     ps = ts.to_period()
@@ -654,9 +654,9 @@ the quarter end:
 
 .. ipython:: python
 
-    prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
+    prng = pd.period_range("1990Q1", "2000Q4", freq="Q-NOV")
     ts = pd.Series(np.random.randn(len(prng)), prng)
-    ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
+    ts.index = (prng.asfreq("M", "e") + 1).asfreq("H", "s") + 9
     ts.head()
 
 Categoricals
@@ -754,19 +754,20 @@ CSV
 
 .. ipython:: python
 
-    df.to_csv('foo.csv')
+    df.to_csv("foo.csv")
 
 :ref:`Reading from a csv file. `
 
 .. ipython:: python
 
-    pd.read_csv('foo.csv')
+    pd.read_csv("foo.csv")
 
 .. ipython:: python
    :suppress:
 
    import os
-   os.remove('foo.csv')
+
+   os.remove("foo.csv")
 
 HDF5
 ~~~~
@@ -777,18 +778,18 @@ Writing to a HDF5 Store.
 
 .. ipython:: python
 
-    df.to_hdf('foo.h5', 'df')
+    df.to_hdf("foo.h5", "df")
 
 Reading from a HDF5 Store.
 
 .. ipython:: python
 
-    pd.read_hdf('foo.h5', 'df')
+    pd.read_hdf("foo.h5", "df")
 
 .. ipython:: python
    :suppress:
 
-   os.remove('foo.h5')
+   os.remove("foo.h5")
 
 Excel
 ~~~~~
@@ -799,18 +800,18 @@ Writing to an excel file.
 
 .. ipython:: python
 
-    df.to_excel('foo.xlsx', sheet_name='Sheet1')
+    df.to_excel("foo.xlsx", sheet_name="Sheet1")
 
 Reading from an excel file.
 
 .. ipython:: python
 
-    pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
+    pd.read_excel("foo.xlsx", "Sheet1", index_col=None, na_values=["NA"])
 
 .. ipython:: python
    :suppress:
 
-   os.remove('foo.xlsx')
+   os.remove("foo.xlsx")
 
 Gotchas
 -------

diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst
index ce9db0a5279c3..d30554986607d 100644
--- a/doc/source/user_guide/enhancingperf.rst
+++ b/doc/source/user_guide/enhancingperf.rst
@@ -48,10 +48,14 @@ We have a ``DataFrame`` to which we want to apply a function row-wise.
 
 .. ipython:: python
 
-    df = pd.DataFrame({'a': np.random.randn(1000),
-                       'b': np.random.randn(1000),
-                       'N': np.random.randint(100, 1000, (1000)),
-                       'x': 'x'})
+    df = pd.DataFrame(
+        {
+            "a": np.random.randn(1000),
+            "b": np.random.randn(1000),
+            "N": np.random.randint(100, 1000, (1000)),
+            "x": "x",
+        }
+    )
     df
 
 Here's the function in pure Python:
@@ -61,6 +65,7 @@ Here's the function in pure Python:
     def f(x):
         return x * (x - 1)
 
+
     def integrate_f(a, b, N):
         s = 0
         dx = (b - a) / N
@@ -72,7 +77,7 @@ We achieve our result by using ``apply`` (row-wise):
 
 .. code-block:: ipython
 
-   In [7]: %timeit df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1)
+   In [7]: %timeit df.apply(lambda x: integrate_f(x["a"], x["b"], x["N"]), axis=1)
    10 loops, best of 3: 174 ms per loop
 
 But clearly this isn't fast enough for us. Let's take a look and see where the
@@ -81,7 +86,7 @@ four calls) using the `prun ipython magic function
 
-    %timeit pd.eval('(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')
+    %timeit pd.eval("(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)")
 
 :func:`~pandas.eval` also works with unaligned pandas objects:
@@ -560,7 +557,7 @@ Now let's do the same thing but with comparisons:
 
 .. ipython:: python
 
-    %timeit pd.eval('df1 + df2 + df3 + df4 + s')
+    %timeit pd.eval("df1 + df2 + df3 + df4 + s")
 
 .. note::
@@ -587,19 +584,19 @@ evaluate an expression in the "context" of a :class:`~pandas.DataFrame`.
    :suppress:
 
    try:
-      del a
+       del a
    except NameError:
-      pass
+       pass
   try:
-      del b
+       del b
   except NameError:
-      pass
+       pass
 
 .. ipython:: python
 
-    df = pd.DataFrame(np.random.randn(5, 2), columns=['a', 'b'])
-    df.eval('a + b')
+    df = pd.DataFrame(np.random.randn(5, 2), columns=["a", "b"])
+    df.eval("a + b")
 
 Any expression that is a valid :func:`pandas.eval` expression is also a valid
 :meth:`DataFrame.eval` expression, with the added benefit that you don't have to
@@ -617,9 +614,9 @@ on the original ``DataFrame`` or return a copy with the new column.
 
 .. ipython:: python
 
     df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
-    df.eval('c = a + b', inplace=True)
-    df.eval('d = a + b + c', inplace=True)
-    df.eval('a = 1', inplace=True)
+    df.eval("c = a + b", inplace=True)
+    df.eval("d = a + b + c", inplace=True)
+    df.eval("a = 1", inplace=True)
     df
 
 When ``inplace`` is set to ``False``, the default, a copy of the ``DataFrame`` with the
@@ -628,7 +625,7 @@ new or modified columns is returned and the original frame is unchanged.
 
 .. ipython:: python
 
     df
-    df.eval('e = a - c', inplace=False)
+    df.eval("e = a - c", inplace=False)
     df
 
 As a convenience, multiple assignments can be performed by using a
 multi-line string.
 
 .. ipython:: python
 
-    df.eval("""
+    df.eval(
+        """
     c = a + b
     d = a + b + c
-    a = 1""", inplace=False)
+    a = 1""",
+        inplace=False,
+    )
 
 The equivalent in standard Python would be
 
 .. ipython:: python
 
     df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
-    df['c'] = df['a'] + df['b']
-    df['d'] = df['a'] + df['b'] + df['c']
-    df['a'] = 1
+    df["c"] = df["a"] + df["b"]
+    df["d"] = df["a"] + df["b"] + df["c"]
+    df["a"] = 1
     df
 
 The ``query`` method has an ``inplace`` keyword which determines
@@ -657,8 +657,8 @@ whether the query modifies the original frame.
 
 .. ipython:: python
 
     df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
-    df.query('a > 2')
-    df.query('a > 2', inplace=True)
+    df.query("a > 2")
+    df.query("a > 2", inplace=True)
     df
 
 Local variables
~~~~~~~~~~~~~~~
@@ -669,10 +669,10 @@ expression by placing the ``@`` character in front of the name. For example,
 
 .. ipython:: python
 
-    df = pd.DataFrame(np.random.randn(5, 2), columns=list('ab'))
+    df = pd.DataFrame(np.random.randn(5, 2), columns=list("ab"))
     newcol = np.random.randn(len(df))
-    df.eval('b + @newcol')
-    df.query('b < @newcol')
+    df.eval("b + @newcol")
+    df.query("b < @newcol")
 
 If you don't prefix the local variable with ``@``, pandas will raise an
 exception telling you the variable is undefined.
@@ -685,8 +685,8 @@ name in an expression.
 
 .. ipython:: python
 
     a = np.random.randn()
-    df.query('@a < a')
-    df.loc[a < df['a']]  # same as the previous expression
+    df.query("@a < a")
+    df.loc[a < df["a"]]  # same as the previous expression
 
 With :func:`pandas.eval` you cannot use the ``@`` prefix *at all*, because it
 isn't defined in that context. ``pandas`` will let you know this if you try to
 use ``@`` in a top-level call to :func:`pandas.eval`. For example,
 
@@ -696,14 +696,14 @@ use ``@`` in a top-level call to :func:`pandas.eval`. For example,
    :okexcept:
 
     a, b = 1, 2
-    pd.eval('@a + b')
+    pd.eval("@a + b")
 
 In this case, you should simply refer to the variables like you would in
 standard Python.
 
 .. ipython:: python
 
-    pd.eval('a + b')
+    pd.eval("a + b")
 
 
 :func:`pandas.eval` parsers
@@ -723,10 +723,10 @@ semantics.
 
 .. ipython:: python
 
-    expr = '(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)'
-    x = pd.eval(expr, parser='python')
-    expr_no_parens = 'df1 > 0 & df2 > 0 & df3 > 0 & df4 > 0'
-    y = pd.eval(expr_no_parens, parser='pandas')
+    expr = "(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)"
+    x = pd.eval(expr, parser="python")
+    expr_no_parens = "df1 > 0 & df2 > 0 & df3 > 0 & df4 > 0"
+    y = pd.eval(expr_no_parens, parser="pandas")
     np.all(x == y)
 
@@ -735,10 +735,10 @@ well:
 
 .. ipython:: python
 
-    expr = '(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)'
-    x = pd.eval(expr, parser='python')
-    expr_with_ands = 'df1 > 0 and df2 > 0 and df3 > 0 and df4 > 0'
-    y = pd.eval(expr_with_ands, parser='pandas')
+    expr = "(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)"
+    x = pd.eval(expr, parser="python")
+    expr_with_ands = "df1 > 0 and df2 > 0 and df3 > 0 and df4 > 0"
+    y = pd.eval(expr_with_ands, parser="pandas")
     np.all(x == y)
 
@@ -768,7 +768,7 @@ is a bit slower (not by much) than evaluating the same expression in Python
 
 .. ipython:: python
 
-    %timeit pd.eval('df1 + df2 + df3 + df4', engine='python')
+    %timeit pd.eval("df1 + df2 + df3 + df4", engine="python")
 
 
 :func:`pandas.eval` performance
@@ -812,10 +812,11 @@ you have an expression--for example
 
 .. ipython:: python
 
-    df = pd.DataFrame({'strings': np.repeat(list('cba'), 3),
-                       'nums': np.repeat(range(3), 3)})
+    df = pd.DataFrame(
+        {"strings": np.repeat(list("cba"), 3), "nums": np.repeat(range(3), 3)}
+    )
     df
-    df.query('strings == "a" and nums == 1')
+    df.query("strings == 'a' and nums == 1")
 
 the numeric part of the comparison (``nums == 1``) will be evaluated by
 ``numexpr``.

diff --git a/setup.cfg b/setup.cfg
index 73986f692b6cd..8702e903d825b 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -39,7 +39,8 @@ bootstrap =
     import pandas as pd
     np  # avoiding error when importing again numpy or pandas
     pd  # (in some cases we want to do it to show users)
-ignore = E402,  # module level import not at top of file
+ignore = E203,  # space before : (needed for how black formats slicing)
+    E402,  # module level import not at top of file
     W503,  # line break before binary operator
     # Classes/functions in different blocks can generate those errors
     E302,  # expected 2 blank lines, found 0
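The new ``E203`` entry in ``setup.cfg`` is needed because black puts spaces around the slice colon when an operand is a non-trivial expression, and flake8 reports that as whitespace before ``:``. A small illustrative snippet (not taken from the patch) showing the convention black enforces:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"a": range(10)})

    # black keeps simple slices tight...
    df.iloc[1:3]

    # ...but adds spaces around ':' when an operand is an expression,
    # which triggers flake8 E203 ("whitespace before ':'") unless ignored.
    n = 2
    df.iloc[n + 1 : n + 4]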
contents:: Table of contents: :local: -*pandas* follows the `PEP8 `_ +pandas follows the `PEP8 `_ standard and uses `Black `_ and `Flake8 `_ to ensure a consistent code format throughout the project. For details see the diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index d6955c5d4b8d2..17eba825d1c29 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -155,7 +155,7 @@ Using a Docker container Instead of manually setting up a development environment, you can use `Docker `_ to automatically create the environment with just several -commands. Pandas provides a ``DockerFile`` in the root directory to build a Docker image +commands. pandas provides a ``DockerFile`` in the root directory to build a Docker image with a full pandas development environment. **Docker Commands** @@ -190,7 +190,7 @@ Note that you might need to rebuild the C extensions if/when you merge with upst Installing a C compiler ~~~~~~~~~~~~~~~~~~~~~~~ -Pandas uses C extensions (mostly written using Cython) to speed up certain +pandas uses C extensions (mostly written using Cython) to speed up certain operations. To install pandas from source, you need to compile these C extensions, which means you need a C compiler. This process depends on which platform you're using. @@ -1219,7 +1219,7 @@ This test shows off several useful features of Hypothesis, as well as demonstrating a good use-case: checking properties that should hold over a large or complicated domain of inputs. -To keep the Pandas test suite running quickly, parametrized tests are +To keep the pandas test suite running quickly, parametrized tests are preferred if the inputs or logic are simple, with Hypothesis tests reserved for cases with complex logic or where there are too many combinations of options or subtle interactions to test (or think of!) all of them. diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index cd084ab263477..2a21704c27005 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -207,7 +207,7 @@ Only core team members can merge pull requests. We have a few guidelines. 1. You should typically not self-merge your own pull requests. Exceptions include things like small changes to fix CI (e.g. pinning a package version). 2. You should not merge pull requests that have an active discussion, or pull - requests that has any ``-1`` votes from a core maintainer. Pandas operates + requests that has any ``-1`` votes from a core maintainer. pandas operates by consensus. 3. For larger changes, it's good to have a +1 from at least two core team members. diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 4086f64817568..8f04d05cfcb04 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -98,7 +98,7 @@ With Altair, you can spend more time understanding your data and its meaning. Altair's API is simple, friendly and consistent and built on top of the powerful Vega-Lite JSON specification. This elegant simplicity produces beautiful and effective visualizations with a -minimal amount of code. Altair works with Pandas DataFrames. +minimal amount of code. Altair works with pandas DataFrames. `Bokeh `__ @@ -110,7 +110,7 @@ graphics in the style of Protovis/D3, while delivering high-performance interact large data to thin clients. 
`Pandas-Bokeh `__ provides a high level API -for Bokeh that can be loaded as a native Pandas plotting backend via +for Bokeh that can be loaded as a native pandas plotting backend via .. code:: python @@ -187,7 +187,7 @@ IDE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IPython is an interactive command shell and distributed computing -environment. IPython tab completion works with Pandas methods and also +environment. IPython tab completion works with pandas methods and also attributes like DataFrame columns. `Jupyter Notebook / Jupyter Lab `__ @@ -201,7 +201,7 @@ Jupyter notebooks can be converted to a number of open standard output formats Python) through 'Download As' in the web interface and ``jupyter convert`` in a shell. -Pandas DataFrames implement ``_repr_html_``and ``_repr_latex`` methods +pandas DataFrames implement ``_repr_html_``and ``_repr_latex`` methods which are utilized by Jupyter Notebook for displaying (abbreviated) HTML or LaTeX tables. LaTeX output is properly escaped. (Note: HTML tables may or may not be @@ -229,7 +229,7 @@ Its `Variable Explorer `__ allows users to view, manipulate and edit pandas ``Index``, ``Series``, and ``DataFrame`` objects like a "spreadsheet", including copying and modifying values, sorting, displaying a "heatmap", converting data types and more. -Pandas objects can also be renamed, duplicated, new columns added, +pandas objects can also be renamed, duplicated, new columns added, copyed/pasted to/from the clipboard (as TSV), and saved/loaded to/from a file. Spyder can also import data from a variety of plain text and binary files or the clipboard into a new pandas DataFrame via a sophisticated import wizard. @@ -276,13 +276,13 @@ The following data feeds are available: `Quandl/Python `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Quandl API for Python wraps the Quandl REST API to return -Pandas DataFrames with timeseries indexes. +pandas DataFrames with timeseries indexes. `Pydatastream `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PyDatastream is a Python interface to the `Refinitiv Datastream (DWS) `__ -REST API to return indexed Pandas DataFrames with financial data. +REST API to return indexed pandas DataFrames with financial data. This package requires valid credentials for this API (non free). `pandaSDMX `__ @@ -357,7 +357,7 @@ Out-of-core ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Blaze provides a standard API for doing computations with various -in-memory and on-disk backends: NumPy, Pandas, SQLAlchemy, MongoDB, PyTables, +in-memory and on-disk backends: NumPy, pandas, SQLAlchemy, MongoDB, PyTables, PySpark. `Dask `__ @@ -403,7 +403,7 @@ If also displays progress bars. `Ray `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Pandas on Ray is an early stage DataFrame library that wraps Pandas and transparently distributes the data and computation. The user does not need to know how many cores their system has, nor do they need to specify how to distribute the data. In fact, users can continue using their previous Pandas notebooks while experiencing a considerable speedup from Pandas on Ray, even on a single machine. Only a modification of the import statement is needed, as we demonstrate below. Once you’ve changed your import statement, you’re ready to use Pandas on Ray just like you would Pandas. +pandas on Ray is an early stage DataFrame library that wraps pandas and transparently distributes the data and computation. 
The user does not need to know how many cores their system has, nor do they need to specify how to distribute the data. In fact, users can continue using their previous pandas notebooks while experiencing a considerable speedup from pandas on Ray, even on a single machine. Only a modification of the import statement is needed, as we demonstrate below. Once you’ve changed your import statement, you’re ready to use pandas on Ray just like you would pandas. .. code:: python @@ -414,7 +414,7 @@ Pandas on Ray is an early stage DataFrame library that wraps Pandas and transpar `Vaex `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Increasingly, packages are being built on top of pandas to address specific needs in data preparation, analysis and visualization. Vaex is a python library for Out-of-Core DataFrames (similar to Pandas), to visualize and explore big tabular datasets. It can calculate statistics such as mean, sum, count, standard deviation etc, on an N-dimensional grid up to a billion (10\ :sup:`9`) objects/rows per second. Visualization is done using histograms, density plots and 3d volume rendering, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted). +Increasingly, packages are being built on top of pandas to address specific needs in data preparation, analysis and visualization. Vaex is a python library for Out-of-Core DataFrames (similar to pandas), to visualize and explore big tabular datasets. It can calculate statistics such as mean, sum, count, standard deviation etc, on an N-dimensional grid up to a billion (10\ :sup:`9`) objects/rows per second. Visualization is done using histograms, density plots and 3d volume rendering, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted). * vaex.from_pandas * vaex.to_pandas_df @@ -424,7 +424,7 @@ Increasingly, packages are being built on top of pandas to address specific need Extension data types -------------------- -Pandas provides an interface for defining +pandas provides an interface for defining :ref:`extension types ` to extend NumPy's type system. The following libraries implement that interface to provide types not found in NumPy or pandas, which work well with pandas' data containers. diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index 358bb6ad951f0..864081002086b 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -5,11 +5,11 @@ Comparison with R / R libraries ******************************* -Since ``pandas`` aims to provide a lot of the data manipulation and analysis +Since pandas aims to provide a lot of the data manipulation and analysis functionality that people use `R `__ for, this page was started to provide a more detailed look at the `R language `__ and its many third -party libraries as they relate to ``pandas``. In comparisons with R and CRAN +party libraries as they relate to pandas. In comparisons with R and CRAN libraries, we care about the following things: * **Functionality / flexibility**: what can/cannot be done with each tool @@ -21,7 +21,7 @@ libraries, we care about the following things: This page is also here to offer a bit of a translation guide for users of these R packages. 
-For transfer of ``DataFrame`` objects from ``pandas`` to R, one option is to +For transfer of ``DataFrame`` objects from pandas to R, one option is to use HDF5 files, see :ref:`io.external_compatibility` for an example. @@ -118,7 +118,7 @@ or by integer location df <- data.frame(matrix(rnorm(1000), ncol=100)) df[, c(1:10, 25:30, 40, 50:100)] -Selecting multiple columns by name in ``pandas`` is straightforward +Selecting multiple columns by name in pandas is straightforward .. ipython:: python @@ -235,7 +235,7 @@ since the subclass sizes are possibly irregular. Using a data.frame called tapply(baseball$batting.average, baseball.example$team, max) -In ``pandas`` we may use :meth:`~pandas.pivot_table` method to handle this: +In pandas we may use :meth:`~pandas.pivot_table` method to handle this: .. ipython:: python @@ -268,7 +268,7 @@ column's values are less than another column's values: subset(df, a <= b) df[df$a <= df$b,] # note the comma -In ``pandas``, there are a few ways to perform subsetting. You can use +In pandas, there are a few ways to perform subsetting. You can use :meth:`~pandas.DataFrame.query` or pass an expression as if it were an index/slice as well as standard boolean indexing: @@ -295,7 +295,7 @@ An expression using a data.frame called ``df`` in R with the columns ``a`` and with(df, a + b) df$a + df$b # same as the previous expression -In ``pandas`` the equivalent expression, using the +In pandas the equivalent expression, using the :meth:`~pandas.DataFrame.eval` method, would be: .. ipython:: python @@ -347,7 +347,7 @@ summarize ``x`` by ``month``: mean = round(mean(x), 2), sd = round(sd(x), 2)) -In ``pandas`` the equivalent expression, using the +In pandas the equivalent expression, using the :meth:`~pandas.DataFrame.groupby` method, would be: .. ipython:: python diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst index 7b8d9c6be61db..014506cc18327 100644 --- a/doc/source/getting_started/comparison/comparison_with_stata.rst +++ b/doc/source/getting_started/comparison/comparison_with_stata.rst @@ -146,7 +146,7 @@ the pandas command would be: # alternatively, read_table is an alias to read_csv with tab delimiter tips = pd.read_table("tips.csv", header=None) -Pandas can also read Stata data sets in ``.dta`` format with the :func:`read_stata` function. +pandas can also read Stata data sets in ``.dta`` format with the :func:`read_stata` function. .. code-block:: python @@ -172,7 +172,7 @@ Similarly in pandas, the opposite of ``read_csv`` is :meth:`DataFrame.to_csv`. tips.to_csv("tips2.csv") -Pandas can also export to Stata file format with the :meth:`DataFrame.to_stata` method. +pandas can also export to Stata file format with the :meth:`DataFrame.to_stata` method. .. code-block:: python @@ -583,7 +583,7 @@ should be used for comparisons. outer_join[pd.isna(outer_join["value_x"])] outer_join[pd.notna(outer_join["value_x"])] -Pandas also provides a variety of methods to work with missing data -- some of +pandas also provides a variety of methods to work with missing data -- some of which would be challenging to express in Stata. For example, there are methods to drop all rows with any missing values, replacing missing values with a specified value, like the mean, or forward filling from previous rows. See the @@ -674,7 +674,7 @@ Other considerations Disk vs memory ~~~~~~~~~~~~~~ -Pandas and Stata both operate exclusively in memory. 
This means that the size of +pandas and Stata both operate exclusively in memory. This means that the size of data able to be loaded in pandas is limited by your machine's memory. If out of core processing is needed, one possibility is the `dask.dataframe `_ diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index a6341451b1b80..70d145c54e919 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -184,7 +184,7 @@ You can find simple installation instructions for pandas in this document: ``ins Installing from source ~~~~~~~~~~~~~~~~~~~~~~ -See the :ref:`contributing guide ` for complete instructions on building from the git source tree. Further, see :ref:`creating a development environment ` if you wish to create a *pandas* development environment. +See the :ref:`contributing guide ` for complete instructions on building from the git source tree. Further, see :ref:`creating a development environment ` if you wish to create a pandas development environment. Running the test suite ---------------------- @@ -249,7 +249,7 @@ Recommended dependencies Optional dependencies ~~~~~~~~~~~~~~~~~~~~~ -Pandas has many optional dependencies that are only used for specific methods. +pandas has many optional dependencies that are only used for specific methods. For example, :func:`pandas.read_hdf` requires the ``pytables`` package, while :meth:`DataFrame.to_markdown` requires the ``tabulate`` package. If the optional dependency is not installed, pandas will raise an ``ImportError`` when diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst index 57d87d4ec8a91..3043cf25c5312 100644 --- a/doc/source/getting_started/overview.rst +++ b/doc/source/getting_started/overview.rst @@ -6,7 +6,7 @@ Package overview **************** -**pandas** is a `Python `__ package providing fast, +pandas is a `Python `__ package providing fast, flexible, and expressive data structures designed to make working with "relational" or "labeled" data both easy and intuitive. It aims to be the fundamental high-level building block for doing practical, **real-world** data diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 1725c415fa020..5c068d8404cd6 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -16,7 +16,7 @@ For some data types, pandas extends NumPy's type system. String aliases for thes can be found at :ref:`basics.dtypes`. =================== ========================= ================== ============================= -Kind of Data Pandas Data Type Scalar Array +Kind of Data pandas Data Type Scalar Array =================== ========================= ================== ============================= TZ-aware datetime :class:`DatetimeTZDtype` :class:`Timestamp` :ref:`api.arrays.datetime` Timedeltas (none) :class:`Timedelta` :ref:`api.arrays.timedelta` @@ -29,7 +29,7 @@ Strings :class:`StringDtype` :class:`str` :ref:`api.array Boolean (with NA) :class:`BooleanDtype` :class:`bool` :ref:`api.arrays.bool` =================== ========================= ================== ============================= -Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`). +pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`). 
The top-level :meth:`array` method can be used to create a new array, which may be stored in a :class:`Series`, :class:`Index`, or as a column in a :class:`DataFrame`. @@ -43,7 +43,7 @@ stored in a :class:`Series`, :class:`Index`, or as a column in a :class:`DataFra Datetime data ------------- -NumPy cannot natively represent timezone-aware datetimes. Pandas supports this +NumPy cannot natively represent timezone-aware datetimes. pandas supports this with the :class:`arrays.DatetimeArray` extension array, which can hold timezone-naive or timezone-aware values. @@ -162,7 +162,7 @@ If the data are tz-aware, then every value in the array must have the same timez Timedelta data -------------- -NumPy can natively represent timedeltas. Pandas provides :class:`Timedelta` +NumPy can natively represent timedeltas. pandas provides :class:`Timedelta` for symmetry with :class:`Timestamp`. .. autosummary:: @@ -217,7 +217,7 @@ A collection of timedeltas may be stored in a :class:`TimedeltaArray`. Timespan data ------------- -Pandas represents spans of times as :class:`Period` objects. +pandas represents spans of times as :class:`Period` objects. Period ------ @@ -352,7 +352,7 @@ Nullable integer ---------------- :class:`numpy.ndarray` cannot natively represent integer-data with missing values. -Pandas provides this through :class:`arrays.IntegerArray`. +pandas provides this through :class:`arrays.IntegerArray`. .. autosummary:: :toctree: api/ @@ -378,7 +378,7 @@ Pandas provides this through :class:`arrays.IntegerArray`. Categorical data ---------------- -Pandas defines a custom data type for representing data that can take only a +pandas defines a custom data type for representing data that can take only a limited, fixed set of values. The dtype of a ``Categorical`` can be described by a :class:`pandas.api.types.CategoricalDtype`. diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 5131d35334693..f1069e46b56cc 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -280,7 +280,7 @@ Time Series-related Accessors --------- -Pandas provides dtype-specific methods under various accessors. +pandas provides dtype-specific methods under various accessors. These are separate namespaces within :class:`Series` that only apply to specific data types. diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index e348111fe7881..5fa214d2ed389 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -52,7 +52,7 @@ Note, **these attributes can be safely assigned to**! df.columns = [x.lower() for x in df.columns] df -Pandas objects (:class:`Index`, :class:`Series`, :class:`DataFrame`) can be +pandas objects (:class:`Index`, :class:`Series`, :class:`DataFrame`) can be thought of as containers for arrays, which hold the actual data and do the actual computation. For many types, the underlying array is a :class:`numpy.ndarray`. However, pandas and 3rd party libraries may *extend* @@ -410,7 +410,7 @@ data structure with a scalar value: pd.Series(['foo', 'bar', 'baz']) == 'foo' pd.Index(['foo', 'bar', 'baz']) == 'foo' -Pandas also handles element-wise comparisons between different array-like +pandas also handles element-wise comparisons between different array-like objects of the same length: .. ipython:: python @@ -804,7 +804,7 @@ Is equivalent to: (df_p.pipe(extract_city_name) .pipe(add_country_name, country_name="US")) -Pandas encourages the second style, which is known as method chaining. 
+pandas encourages the second style, which is known as method chaining. ``pipe`` makes it easy to use your own or another library's functions in method chains, alongside pandas' methods. @@ -1498,7 +1498,7 @@ Thus, for example, iterating over a DataFrame gives you the column names: print(col) -Pandas objects also have the dict-like :meth:`~DataFrame.items` method to +pandas objects also have the dict-like :meth:`~DataFrame.items` method to iterate over the (key, value) pairs. To iterate over the rows of a DataFrame, you can use the following methods: @@ -1741,7 +1741,7 @@ always uses them). .. note:: Prior to pandas 1.0, string methods were only available on ``object`` -dtype - ``Series``. Pandas 1.0 added the :class:`StringDtype` which is dedicated + ``Series``. pandas 1.0 added the :class:`StringDtype` which is dedicated to strings. See :ref:`text.types` for more. Please see :ref:`Vectorized String Methods ` for a complete @@ -1752,7 +1752,7 @@ description. Sorting ------- -Pandas supports three kinds of sorting: sorting by index labels, +pandas supports three kinds of sorting: sorting by index labels, sorting by column values, and sorting by a combination of both. .. _basics.sort_index: @@ -1995,7 +1995,7 @@ columns of a DataFrame. NumPy provides support for ``float``, ``int``, ``bool``, ``timedelta64[ns]`` and ``datetime64[ns]`` (note that NumPy does not support timezone-aware datetimes). -Pandas and third-party libraries *extend* NumPy's type system in a few places. +pandas and third-party libraries *extend* NumPy's type system in a few places. This section describes the extensions pandas has made internally. See :ref:`extending.extension-types` for how to write your own extension that works with pandas. See :ref:`ecosystem.extensions` for a list of third-party @@ -2032,7 +2032,7 @@ documentation sections for more on each type. | Boolean (with NA) | :class:`BooleanDtype` | :class:`bool` | :class:`arrays.BooleanArray` | ``'boolean'`` | :ref:`api.arrays.bool` | +-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ -Pandas has two ways to store strings. +pandas has two ways to store strings. 1. ``object`` dtype, which can hold any Python object, including strings. 2. :class:`StringDtype`, which is dedicated to strings. @@ -2424,5 +2424,5 @@ All NumPy dtypes are subclasses of ``numpy.generic``: .. note:: - Pandas also defines the types ``category``, and ``datetime64[ns, tz]``, which are not integrated into the normal + pandas also defines the types ``category``, and ``datetime64[ns, tz]``, which are not integrated into the normal NumPy hierarchy and won't show up with the above function. diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst index d690c1093399a..76c922fcef638 100644 --- a/doc/source/user_guide/boolean.rst +++ b/doc/source/user_guide/boolean.rst @@ -82,7 +82,7 @@ the ``NA`` really is ``True`` or ``False``, since ``True & True`` is ``True``, but ``True & False`` is ``False``, so we can't determine the output. -This differs from how ``np.nan`` behaves in logical operations. Pandas treated +This differs from how ``np.nan`` behaves in logical operations. pandas treated ``np.nan`` is *always false in the output*. 
In ``or`` diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 6a8e1767ef7e8..67f11bbb45b02 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -1011,7 +1011,7 @@ The following differences to R's factor functions can be observed: * In contrast to R's ``factor`` function, using categorical data as the sole input to create a new categorical series will *not* remove unused categories but create a new categorical series which is equal to the passed in one! -* R allows for missing values to be included in its ``levels`` (pandas' ``categories``). Pandas +* R allows for missing values to be included in its ``levels`` (pandas' ``categories``). pandas does not allow ``NaN`` categories, but missing values can still be in the ``values``. @@ -1107,7 +1107,7 @@ are not numeric data (even in the case that ``.categories`` is numeric). dtype in apply ~~~~~~~~~~~~~~ -Pandas currently does not preserve the dtype in apply functions: If you apply along rows you get +pandas currently does not preserve the dtype in apply functions: If you apply along rows you get a ``Series`` of ``object`` ``dtype`` (same as getting a row -> getting one element will return a basic type) and applying along columns will also convert to object. ``NaN`` values are unaffected. You can use ``fillna`` to handle missing values before applying a function. diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 0a30d865f3c23..214b8a680fa7e 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -15,7 +15,7 @@ Simplified, condensed, new-user friendly, in-line examples have been inserted wh augment the Stack-Overflow and GitHub links. Many of the links contain expanded information, above what the in-line examples offer. -Pandas (pd) and Numpy (np) are the only two abbreviated imported modules. The rest are kept +pandas (pd) and Numpy (np) are the only two abbreviated imported modules. The rest are kept explicitly imported for newer users. These examples are written for Python 3. Minor tweaks might be necessary for earlier python diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index d698b316d321e..905877cca61db 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -78,13 +78,13 @@ Series can be instantiated from dicts: When the data is a dict, and an index is not passed, the ``Series`` index will be ordered by the dict's insertion order, if you're using Python - version >= 3.6 and Pandas version >= 0.23. + version >= 3.6 and pandas version >= 0.23. - If you're using Python < 3.6 or Pandas < 0.23, and an index is not passed, + If you're using Python < 3.6 or pandas < 0.23, and an index is not passed, the ``Series`` index will be the lexically ordered list of dict keys. In the example above, if you were on a Python version lower than 3.6 or a -Pandas version lower than 0.23, the ``Series`` would be ordered by the lexical +pandas version lower than 0.23, the ``Series`` would be ordered by the lexical order of the dict keys (i.e. ``['a', 'b', 'c']`` rather than ``['b', 'a', 'c']``). If an index is passed, the values in data corresponding to the labels in the @@ -151,7 +151,7 @@ index (to disable :ref:`automatic alignment `, for example). :attr:`Series.array` will always be an :class:`~pandas.api.extensions.ExtensionArray`. 
Briefly, an ExtensionArray is a thin wrapper around one or more *concrete* arrays like a -:class:`numpy.ndarray`. Pandas knows how to take an ``ExtensionArray`` and +:class:`numpy.ndarray`. pandas knows how to take an ``ExtensionArray`` and store it in a ``Series`` or a column of a ``DataFrame``. See :ref:`basics.dtypes` for more. @@ -290,9 +290,9 @@ based on common sense rules. When the data is a dict, and ``columns`` is not specified, the ``DataFrame`` columns will be ordered by the dict's insertion order, if you are using - Python version >= 3.6 and Pandas >= 0.23. + Python version >= 3.6 and pandas >= 0.23. - If you are using Python < 3.6 or Pandas < 0.23, and ``columns`` is not + If you are using Python < 3.6 or pandas < 0.23, and ``columns`` is not specified, the ``DataFrame`` columns will be the lexically ordered list of dict keys. diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst index 2993ca7799510..7cda067fb24ad 100644 --- a/doc/source/user_guide/duplicates.rst +++ b/doc/source/user_guide/duplicates.rst @@ -79,7 +79,7 @@ unique with :attr:`Index.is_unique`: .. note:: Checking whether an index is unique is somewhat expensive for large datasets. - Pandas does cache this result, so re-checking on the same index is very fast. + pandas does cache this result, so re-checking on the same index is very fast. :meth:`Index.duplicated` will return a boolean ndarray indicating whether a label is repeated. diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index d30554986607d..cc8de98165fac 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -689,7 +689,7 @@ name in an expression. df.loc[a < df["a"]] # same as the previous expression With :func:`pandas.eval` you cannot use the ``@`` prefix *at all*, because it -isn't defined in that context. ``pandas`` will let you know this if you try to +isn't defined in that context. pandas will let you know this if you try to use ``@`` in a top-level call to :func:`pandas.eval`. For example, .. ipython:: python diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index ec64442319a84..6427cea6fa510 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -614,7 +614,7 @@ For a grouped ``DataFrame``, you can rename in a similar manner: grouped["C"].agg(["sum", "sum"]) - Pandas *does* allow you to provide multiple lambdas. In this case, pandas + pandas *does* allow you to provide multiple lambdas. In this case, pandas will mangle the name of the (nameless) lambda functions, appending ``_`` to each subsequent lambda. @@ -636,7 +636,7 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", - The keywords are the *output* column names - The values are tuples whose first element is the column to select - and the second element is the aggregation to apply to that column. Pandas + and the second element is the aggregation to apply to that column. pandas provides the ``pandas.NamedAgg`` namedtuple with the fields ``['column', 'aggfunc']`` to make it clearer what the arguments are. As usual, the aggregation can be a callable or a string alias. 
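As a quick illustration of the named-aggregation syntax described in the
groupby.rst hunk above (a minimal sketch; the frame and column names are
invented for the example, not taken from the patch):

.. code-block:: python

    import pandas as pd

    animals = pd.DataFrame(
        {"kind": ["cat", "dog", "cat", "dog"], "height": [9.1, 6.0, 9.5, 34.0]}
    )

    # the keyword names become the output columns; each NamedAgg says which
    # input column to select and which aggregation to apply to it
    animals.groupby("kind").agg(
        min_height=pd.NamedAgg(column="height", aggfunc="min"),
        max_height=pd.NamedAgg(column="height", aggfunc="max"),
    )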
diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index b11baad1e3eb5..530fdfba7d12c 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -46,7 +46,7 @@ Different choices for indexing ------------------------------ Object selection has had a number of user-requested additions in order to -support more explicit location based indexing. Pandas now supports three types +support more explicit location based indexing. pandas now supports three types of multi-axis indexing. * ``.loc`` is primarily label based, but may also be used with a boolean array. ``.loc`` will raise ``KeyError`` when the items are not found. Allowed inputs are: @@ -315,7 +315,7 @@ Selection by label .. versionchanged:: 1.0.0 - Pandas will raise a ``KeyError`` if indexing with a list with missing labels. See :ref:`list-like Using loc with + pandas will raise a ``KeyError`` if indexing with a list with missing labels. See :ref:`list-like Using loc with missing keys in a list is Deprecated `. pandas provides a suite of methods in order to have **purely label based indexing**. This is a strict inclusion based protocol. @@ -433,7 +433,7 @@ Selection by position This is sometimes called ``chained assignment`` and should be avoided. See :ref:`Returning a View versus Copy `. -Pandas provides a suite of methods in order to get **purely integer based indexing**. The semantics follow closely Python and NumPy slicing. These are ``0-based`` indexing. When slicing, the start bound is *included*, while the upper bound is *excluded*. Trying to use a non-integer, even a **valid** label will raise an ``IndexError``. +pandas provides a suite of methods in order to get **purely integer based indexing**. The semantics follow closely Python and NumPy slicing. These are ``0-based`` indexing. When slicing, the start bound is *included*, while the upper bound is *excluded*. Trying to use a non-integer, even a **valid** label will raise an ``IndexError``. The ``.iloc`` attribute is the primary access method. The following are valid inputs: @@ -1812,7 +1812,7 @@ about! Sometimes a ``SettingWithCopy`` warning will arise at times when there's no obvious chained indexing going on. **These** are the bugs that -``SettingWithCopy`` is designed to catch! Pandas is probably trying to warn you +``SettingWithCopy`` is designed to catch! pandas is probably trying to warn you that you've done this: .. code-block:: python @@ -1835,7 +1835,7 @@ When you use chained indexing, the order and type of the indexing operation partially determine whether the result is a slice into the original object, or a copy of the slice. -Pandas has the ``SettingWithCopyWarning`` because assigning to a copy of a +pandas has the ``SettingWithCopyWarning`` because assigning to a copy of a slice is frequently not intentional, but a mistake caused by chained indexing returning a copy where a slice was expected. diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index acee1638570f7..be38736f493b5 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -30,7 +30,7 @@ numbers. Construction ------------ -Pandas can represent integer data with possibly missing values using +pandas can represent integer data with possibly missing values using :class:`arrays.IntegerArray`. This is an :ref:`extension types ` implemented within pandas. 
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 184894bbafe28..ae22ee836cd8c 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -894,7 +894,7 @@ take full advantage of the flexibility of the date parsing API: ) df -Pandas will try to call the ``date_parser`` function in three different ways. If +pandas will try to call the ``date_parser`` function in three different ways. If an exception is raised, the next one is tried: 1. ``date_parser`` is first called with one or more arrays as arguments, @@ -926,7 +926,7 @@ Note that performance-wise, you should try these methods of parsing dates in ord Parsing a CSV with mixed timezones ++++++++++++++++++++++++++++++++++ -Pandas cannot natively represent a column or index with mixed timezones. If your CSV +pandas cannot natively represent a column or index with mixed timezones. If your CSV file contains columns with a mixture of timezones, the default result will be an object-dtype column with strings, even with ``parse_dates``. @@ -1602,7 +1602,7 @@ python engine is selected explicitly using ``engine='python'``. Reading/writing remote files '''''''''''''''''''''''''''' -You can pass in a URL to read or write remote files to many of Pandas' IO +You can pass in a URL to read or write remote files to many of pandas' IO functions - the following example shows reading a CSV file: .. code-block:: python @@ -2265,7 +2265,7 @@ The full list of types supported are described in the Table Schema spec. This table shows the mapping from pandas types: =============== ================= -Pandas type Table Schema type +pandas type Table Schema type =============== ================= int64 integer float64 number @@ -2661,7 +2661,7 @@ that contain URLs. url_df = pd.DataFrame( { - "name": ["Python", "Pandas"], + "name": ["Python", "pandas"], "url": ["https://www.python.org/", "https://pandas.pydata.org"], } ) @@ -3143,7 +3143,7 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`. Writing Excel files to memory +++++++++++++++++++++++++++++ -Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` or +pandas supports writing Excel files to buffer-like objects such as ``StringIO`` or ``BytesIO`` using :class:`~pandas.io.excel.ExcelWriter`. .. code-block:: python @@ -3177,7 +3177,7 @@ Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` Excel writer engines '''''''''''''''''''' -Pandas chooses an Excel writer via two methods: +pandas chooses an Excel writer via two methods: 1. the ``engine`` keyword argument 2. the filename extension (via the default specified in config options) @@ -3474,7 +3474,7 @@ for some advanced strategies .. warning:: - Pandas uses PyTables for reading and writing HDF5 files, which allows + pandas uses PyTables for reading and writing HDF5 files, which allows serializing object-dtype data with pickle. Loading pickled data received from untrusted sources can be unsafe. @@ -4734,7 +4734,7 @@ Several caveats. * Duplicate column names and non-string columns names are not supported. * The ``pyarrow`` engine always writes the index to the output, but ``fastparquet`` only writes non-default - indexes. This extra column can cause problems for non-Pandas consumers that are not expecting it. You can + indexes. This extra column can cause problems for non-pandas consumers that are not expecting it. You can force including or omitting indexes with the ``index`` argument, regardless of the underlying engine. 
* Index level names, if specified, must be strings. * In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype. @@ -4894,7 +4894,7 @@ ORC .. versionadded:: 1.0.0 Similar to the :ref:`parquet ` format, the `ORC Format `__ is a binary columnar serialization -for data frames. It is designed to make reading data frames efficient. Pandas provides *only* a reader for the +for data frames. It is designed to make reading data frames efficient. pandas provides *only* a reader for the ORC format, :func:`~pandas.read_orc`. This requires the `pyarrow `__ library. .. _io.sql: diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 3c97cc7da6edb..7eb377694910b 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -81,7 +81,7 @@ Integer dtypes and missing data ------------------------------- Because ``NaN`` is a float, a column of integers with even one missing values -is cast to floating-point dtype (see :ref:`gotchas.intna` for more). Pandas +is cast to floating-point dtype (see :ref:`gotchas.intna` for more). pandas provides a nullable integer array, which can be used by explicitly requesting the dtype: @@ -735,7 +735,7 @@ However, these can be filled in using :meth:`~DataFrame.fillna` and it will work reindexed[crit.fillna(False)] reindexed[crit.fillna(True)] -Pandas provides a nullable integer dtype, but you must explicitly request it +pandas provides a nullable integer dtype, but you must explicitly request it when creating the series or column. Notice that we use a capital "I" in the ``dtype="Int64"``. diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index f36f27269a996..7f2419bc7f19d 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -4,7 +4,7 @@ Scaling to large datasets ************************* -Pandas provides data structures for in-memory analytics, which makes using pandas +pandas provides data structures for in-memory analytics, which makes using pandas to analyze datasets that are larger than memory datasets somewhat tricky. Even datasets that are a sizable fraction of memory become unwieldy, as some pandas operations need to make intermediate copies. @@ -13,7 +13,7 @@ This document provides a few recommendations for scaling your analysis to larger It's a complement to :ref:`enhancingperf`, which focuses on speeding up analysis for datasets that fit in memory. -But first, it's worth considering *not using pandas*. Pandas isn't the right +But first, it's worth considering *not using pandas*. pandas isn't the right tool for all situations. If you're working with very large datasets and a tool like PostgreSQL fits your needs, then you should probably be using that. Assuming you want or need the expressiveness and power of pandas, let's carry on. @@ -230,7 +230,7 @@ different library that implements these out-of-core algorithms for you. Use other libraries ------------------- -Pandas is just one library offering a DataFrame API. Because of its popularity, +pandas is just one library offering a DataFrame API. Because of its popularity, pandas' API has become something of a standard that other libraries implement. The pandas documentation maintains a list of libraries implementing a DataFrame API in :ref:`our ecosystem page `. 
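To make the mixed-timezone caveat from the io.rst hunk above concrete, here is
a minimal sketch (the timestamps are invented, and the behaviour shown is that
of the pandas 1.x line this series targets):

.. code-block:: python

    import pandas as pd

    mixed = pd.Series(["2020-01-01 00:00:00+05:00", "2020-01-01 00:00:00+06:00"])

    pd.to_datetime(mixed).dtype            # object: no single tz fits both offsets
    pd.to_datetime(mixed, utc=True).dtype  # datetime64[ns, UTC] after normalizing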
@@ -259,7 +259,7 @@ Inspecting the ``ddf`` object, we see a few things * There are new attributes like ``.npartitions`` and ``.divisions`` The partitions and divisions are how Dask parallelizes computation. A **Dask** -DataFrame is made up of many **Pandas** DataFrames. A single method call on a +DataFrame is made up of many pandas DataFrames. A single method call on a Dask DataFrame ends up making many pandas method calls, and Dask knows how to coordinate everything to get the result. diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 62e35cb994faf..3156e3088d860 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -6,7 +6,7 @@ Sparse data structures ********************** -Pandas provides data structures for efficiently storing sparse data. +pandas provides data structures for efficiently storing sparse data. These are not necessarily sparse in the typical "mostly 0". Rather, you can view these objects as being "compressed" where any data matching a specific value (``NaN`` / missing value, though any value can be chosen, including 0) is omitted. The compressed values are not actually stored in the array. @@ -116,7 +116,7 @@ Sparse accessor .. versionadded:: 0.24.0 -Pandas provides a ``.sparse`` accessor, similar to ``.str`` for string data, ``.cat`` +pandas provides a ``.sparse`` accessor, similar to ``.str`` for string data, ``.cat`` for categorical data, and ``.dt`` for datetime-like data. This namespace provides attributes and methods that are specific to sparse data. diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst index 971a415088220..cb265d34229dd 100644 --- a/doc/source/user_guide/timedeltas.rst +++ b/doc/source/user_guide/timedeltas.rst @@ -100,7 +100,7 @@ The ``unit`` keyword argument specifies the unit of the Timedelta: Timedelta limitations ~~~~~~~~~~~~~~~~~~~~~ -Pandas represents ``Timedeltas`` in nanosecond resolution using +pandas represents ``Timedeltas`` in nanosecond resolution using 64 bit integers. As such, the 64 bit integer limits determine the ``Timedelta`` limits. diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 11ec90085d9bf..be2c67521dc5d 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1549,7 +1549,7 @@ Converting to Python datetimes Resampling ---------- -Pandas has a simple, powerful, and efficient functionality for performing +pandas has a simple, powerful, and efficient functionality for performing resampling operations during frequency conversion (e.g., converting secondly data into 5-minutely data). This is extremely common in, but not limited to, financial applications. diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 46ab29a52747a..a6c3d9814b03d 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -776,7 +776,7 @@ See the `matplotlib pie documentation `__ around the source of the ``RuntimeWarning`` to control how these conditions are handled. @@ -1372,7 +1372,7 @@ Deprecations - ``Timestamp.offset`` property (and named arg in the constructor), has been deprecated in favor of ``freq`` (:issue:`12160`) - ``pd.tseries.util.pivot_annual`` is deprecated. Use ``pivot_table`` as alternative, an example is :ref:`here ` (:issue:`736`) - ``pd.tseries.util.isleapyear`` has been deprecated and will be removed in a subsequent release. 
Datetime-likes now have a ``.is_leap_year`` property (:issue:`13727`) -- ``Panel4D`` and ``PanelND`` constructors are deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data are with the `xarray package `__. Pandas provides a :meth:`~Panel4D.to_xarray` method to automate this conversion (:issue:`13564`). +- ``Panel4D`` and ``PanelND`` constructors are deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data are with the `xarray package `__. pandas provides a :meth:`~Panel4D.to_xarray` method to automate this conversion (:issue:`13564`). - ``pandas.tseries.frequencies.get_standard_freq`` is deprecated. Use ``pandas.tseries.frequencies.to_offset(freq).rule_code`` instead (:issue:`13874`) - ``pandas.tseries.frequencies.to_offset``'s ``freqstr`` keyword is deprecated in favor of ``freq`` (:issue:`13874`) - ``Categorical.from_array`` has been deprecated and will be removed in a future version (:issue:`13854`) diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 3f7a89112958b..a9e57f0039735 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -26,7 +26,7 @@ Highlights include: .. warning:: - Pandas has changed the internal structure and layout of the code base. + pandas has changed the internal structure and layout of the code base. This can affect imports that are not from the top-level ``pandas.*`` namespace, please see the changes :ref:`here `. Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. @@ -243,7 +243,7 @@ The default is to infer the compression type from the extension (``compression=' UInt64 support improved ^^^^^^^^^^^^^^^^^^^^^^^ -Pandas has significantly improved support for operations involving unsigned, +pandas has significantly improved support for operations involving unsigned, or purely non-negative, integers. Previously, handling these integers would result in improper rounding or data-type casting, leading to incorrect results. Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937`) @@ -333,7 +333,7 @@ You must enable this by setting the ``display.html.table_schema`` option to ``Tr SciPy sparse matrix from/to SparseDataFrame ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas now supports creating sparse dataframes directly from ``scipy.sparse.spmatrix`` instances. +pandas now supports creating sparse dataframes directly from ``scipy.sparse.spmatrix`` instances. See the :ref:`documentation ` for more information. (:issue:`4343`) All sparse formats are supported, but matrices that are not in :mod:`COOrdinate ` format will be converted, copying data as needed. @@ -1355,7 +1355,7 @@ Deprecate Panel ^^^^^^^^^^^^^^^ ``Panel`` is deprecated and will be removed in a future version. The recommended way to represent 3-D data are -with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. Pandas +with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. pandas provides a :meth:`~Panel.to_xarray` method to automate this conversion (:issue:`13563`). .. 
code-block:: ipython diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst index 926bcaa21ac3a..6035b89aa8643 100644 --- a/doc/source/whatsnew/v0.21.0.rst +++ b/doc/source/whatsnew/v0.21.0.rst @@ -900,13 +900,13 @@ New behavior: No automatic Matplotlib converters ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas no longer registers our ``date``, ``time``, ``datetime``, +pandas no longer registers our ``date``, ``time``, ``datetime``, ``datetime64``, and ``Period`` converters with matplotlib when pandas is imported. Matplotlib plot methods (``plt.plot``, ``ax.plot``, ...), will not nicely format the x-axis for ``DatetimeIndex`` or ``PeriodIndex`` values. You must explicitly register these methods: -Pandas built-in ``Series.plot`` and ``DataFrame.plot`` *will* register these +pandas built-in ``Series.plot`` and ``DataFrame.plot`` *will* register these converters on first-use (:issue:`17710`). .. note:: diff --git a/doc/source/whatsnew/v0.21.1.rst b/doc/source/whatsnew/v0.21.1.rst index f930dfac869cd..2d72f6470fc81 100644 --- a/doc/source/whatsnew/v0.21.1.rst +++ b/doc/source/whatsnew/v0.21.1.rst @@ -34,7 +34,7 @@ Highlights include: Restore Matplotlib datetime converter registration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Pandas implements some matplotlib converters for nicely formatting the axis +pandas implements some matplotlib converters for nicely formatting the axis labels on plots with ``datetime`` or ``Period`` values. Prior to pandas 0.21.0, these were implicitly registered with matplotlib, as a side effect of ``import pandas``. diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index 66d3ab3305565..92b514ce59660 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -20,7 +20,7 @@ release note (singular!). Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Pandas 0.22.0 changes the handling of empty and all-*NA* sums and products. The +pandas 0.22.0 changes the handling of empty and all-*NA* sums and products. The summary is that * The sum of an empty or all-*NA* ``Series`` is now ``0`` diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index cb811fd83d90d..f4caea9d363eb 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -189,7 +189,7 @@ resetting indexes. See the :ref:`Sorting by Indexes and Values Extending pandas with custom types (experimental) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas now supports storing array-like objects that aren't necessarily 1-D NumPy +pandas now supports storing array-like objects that aren't necessarily 1-D NumPy arrays as columns in a DataFrame or values in a Series. This allows third-party libraries to implement extensions to NumPy's types, similar to how pandas implemented categoricals, datetimes with timezones, periods, and intervals. @@ -553,7 +553,7 @@ Other enhancements - :class:`~pandas.tseries.offsets.WeekOfMonth` constructor now supports ``n=0`` (:issue:`20517`). - :class:`DataFrame` and :class:`Series` now support matrix multiplication (``@``) operator (:issue:`10259`) for Python>=3.5 - Updated :meth:`DataFrame.to_gbq` and :meth:`pandas.read_gbq` signature and documentation to reflect changes from - the Pandas-GBQ library version 0.4.0. Adds intersphinx mapping to Pandas-GBQ + the pandas-gbq library version 0.4.0. Adds intersphinx mapping to pandas-gbq library. 
(:issue:`20564`) - Added new writer for exporting Stata dta files in version 117, ``StataWriter117``. This format supports exporting strings with lengths up to 2,000,000 characters (:issue:`16450`) - :func:`to_hdf` and :func:`read_hdf` now accept an ``errors`` keyword argument to control encoding error handling (:issue:`20835`) @@ -593,7 +593,7 @@ Instantiation from dicts preserves dict insertion order for Python 3.6+ Until Python 3.6, dicts in Python had no formally defined ordering. For Python version 3.6 and later, dicts are ordered by insertion order, see `PEP 468 `_. -Pandas will use the dict's insertion order, when creating a ``Series`` or +pandas will use the dict's insertion order, when creating a ``Series`` or ``DataFrame`` from a dict and you're using Python version 3.6 or higher. (:issue:`19884`) @@ -643,7 +643,7 @@ Deprecate Panel ^^^^^^^^^^^^^^^ ``Panel`` was deprecated in the 0.20.x release, showing as a ``DeprecationWarning``. Using ``Panel`` will now show a ``FutureWarning``. The recommended way to represent 3-D data are -with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. Pandas +with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. pandas provides a :meth:`~Panel.to_xarray` method to automate this conversion (:issue:`13563`, :issue:`18324`). .. code-block:: ipython @@ -884,7 +884,7 @@ Extraction of matching patterns from strings By default, extracting matching patterns from strings with :func:`str.extract` used to return a ``Series`` if a single group was being extracted (a ``DataFrame`` if more than one group was -extracted). As of Pandas 0.23.0 :func:`str.extract` always returns a ``DataFrame``, unless +extracted). As of pandas 0.23.0 :func:`str.extract` always returns a ``DataFrame``, unless ``expand`` is set to ``False``. Finally, ``None`` was an accepted value for the ``expand`` parameter (which was equivalent to ``False``), but now raises a ``ValueError``. (:issue:`11386`) @@ -1175,7 +1175,7 @@ Performance improvements Documentation changes ~~~~~~~~~~~~~~~~~~~~~ -Thanks to all of the contributors who participated in the Pandas Documentation +Thanks to all of the contributors who participated in the pandas Documentation Sprint, which took place on March 10th. We had about 500 participants from over 30 locations across the world. You should notice that many of the :ref:`API docstrings ` have greatly improved. diff --git a/doc/source/whatsnew/v0.23.2.rst b/doc/source/whatsnew/v0.23.2.rst index 9f24092d1d4ae..99650e8291d3d 100644 --- a/doc/source/whatsnew/v0.23.2.rst +++ b/doc/source/whatsnew/v0.23.2.rst @@ -11,7 +11,7 @@ and bug fixes. We recommend that all users upgrade to this version. .. note:: - Pandas 0.23.2 is first pandas release that's compatible with + pandas 0.23.2 is first pandas release that's compatible with Python 3.7 (:issue:`20552`) .. warning:: diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 9a2e96f717d9b..9ef50045d5b5e 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -38,7 +38,7 @@ Enhancements Optional integer NA support ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas has gained the ability to hold integer dtypes with missing values. This long requested feature is enabled through the use of :ref:`extension types `. +pandas has gained the ability to hold integer dtypes with missing values. This long requested feature is enabled through the use of :ref:`extension types `. .. 
note:: @@ -384,7 +384,7 @@ Other enhancements - :meth:`Series.droplevel` and :meth:`DataFrame.droplevel` are now implemented (:issue:`20342`) - Added support for reading from/writing to Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`, :issue:`23094`) - :func:`DataFrame.to_gbq` and :func:`read_gbq` signature and documentation updated to - reflect changes from the `Pandas-GBQ library version 0.8.0 + reflect changes from the `pandas-gbq library version 0.8.0 `__. Adds a ``credentials`` argument, which enables the use of any kind of `google-auth credentials @@ -432,7 +432,7 @@ Other enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Pandas 0.24.0 includes a number of API breaking changes. +pandas 0.24.0 includes a number of API breaking changes. .. _whatsnew_0240.api_breaking.deps: @@ -1217,7 +1217,7 @@ Extension type changes **Equality and hashability** -Pandas now requires that extension dtypes be hashable (i.e. the respective +pandas now requires that extension dtypes be hashable (i.e. the respective ``ExtensionDtype`` objects; hashability is not a requirement for the values of the corresponding ``ExtensionArray``). The base class implements a default ``__eq__`` and ``__hash__``. If you have a parametrized dtype, you should @@ -1925,7 +1925,7 @@ Build changes Other ^^^^^ -- Bug where C variables were declared with external linkage causing import errors if certain other C libraries were imported before Pandas. (:issue:`24113`) +- Bug where C variables were declared with external linkage causing import errors if certain other C libraries were imported before pandas. (:issue:`24113`) .. _whatsnew_0.24.0.contributors: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 7b4440148677b..43b42c5cb5648 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -36,7 +36,7 @@ Enhancements Groupby aggregation with relabeling ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas has added special groupby behavior, known as "named aggregation", for naming the +pandas has added special groupby behavior, known as "named aggregation", for naming the output columns when applying multiple aggregation functions to specific columns (:issue:`18366`, :issue:`26512`). .. ipython:: python @@ -53,7 +53,7 @@ output columns when applying multiple aggregation functions to specific columns Pass the desired columns names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs`` should be tuples where the first element is the column selection, and the second element is the -aggregation function to apply. Pandas provides the ``pandas.NamedAgg`` namedtuple to make it clearer +aggregation function to apply. pandas provides the ``pandas.NamedAgg`` namedtuple to make it clearer what the arguments to the function are, but plain tuples are accepted as well. .. ipython:: python @@ -425,7 +425,7 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t Categorical dtypes are preserved during groupby ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Previously, columns that were categorical, but not the groupby key(s) would be converted to ``object`` dtype during groupby operations. Pandas now will preserve these dtypes. (:issue:`18502`) +Previously, columns that were categorical, but not the groupby key(s) would be converted to ``object`` dtype during groupby operations. pandas now will preserve these dtypes. (:issue:`18502`) .. 
ipython:: python @@ -545,14 +545,14 @@ with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397 ``__str__`` methods now call ``__repr__`` rather than vice versa ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas has until now mostly defined string representations in a Pandas objects's +pandas has until now mostly defined string representations in a pandas objects' ``__str__``/``__unicode__``/``__bytes__`` methods, and called ``__str__`` from the ``__repr__`` method, if a specific ``__repr__`` method is not found. This is not needed for Python3. -In Pandas 0.25, the string representations of Pandas objects are now generally +In pandas 0.25, the string representations of pandas objects are now generally defined in ``__repr__``, and calls to ``__str__`` in general now pass the call on to the ``__repr__``, if a specific ``__str__`` method doesn't exist, as is standard for Python. -This change is backward compatible for direct usage of Pandas, but if you subclass -Pandas objects *and* give your subclasses specific ``__str__``/``__repr__`` methods, +This change is backward compatible for direct usage of pandas, but if you subclass +pandas objects *and* give your subclasses specific ``__str__``/``__repr__`` methods, you may have to adjust your ``__str__``/``__repr__`` methods (:issue:`26495`). .. _whatsnew_0250.api_breaking.interval_indexing: @@ -881,7 +881,7 @@ Other API changes - Bug in :meth:`DatetimeIndex.snap` which didn't preserving the ``name`` of the input :class:`Index` (:issue:`25575`) - The ``arg`` argument in :meth:`pandas.core.groupby.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`) - The ``arg`` argument in :meth:`pandas.core.window._Window.aggregate` has been renamed to ``func`` (:issue:`26372`) -- Most Pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) +- Most pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) - The ``.str``-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) - Removed support of gtk package for clipboards (:issue:`26563`) - Using an unsupported version of Beautiful Soup 4 will now raise an ``ImportError`` instead of a ``ValueError`` (:issue:`27063`) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 2a2b511356a69..8a16bab63f1bf 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -10,7 +10,7 @@ I/O and LZMA ~~~~~~~~~~~~ Some users may unknowingly have an incomplete Python installation lacking the ``lzma`` module from the standard library. In this case, ``import pandas`` failed due to an ``ImportError`` (:issue:`27575`). -Pandas will now warn, rather than raising an ``ImportError`` if the ``lzma`` module is not present. Any subsequent attempt to use ``lzma`` methods will raise a ``RuntimeError``. +pandas will now warn, rather than raising an ``ImportError`` if the ``lzma`` module is not present. Any subsequent attempt to use ``lzma`` methods will raise a ``RuntimeError``. A possible fix for the lack of the ``lzma`` module is to ensure you have the necessary libraries and then re-install Python. 
For example, on MacOS installing Python with ``pyenv`` may lead to an incomplete Python installation due to unmet system dependencies at compilation time (like ``xz``). Compilation will succeed, but Python might fail at run time. The issue can be solved by installing the necessary dependencies and then re-installing Python. diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index c0c68ce4b1f44..a5ea8933762ab 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -8,7 +8,7 @@ including other versions of pandas. .. note:: - Pandas 0.25.2 adds compatibility for Python 3.8 (:issue:`28147`). + pandas 0.25.2 adds compatibility for Python 3.8 (:issue:`28147`). .. _whatsnew_0252.bug_fixes: diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 32175d344c320..ddc40d6d40594 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -18,7 +18,7 @@ including other versions of pandas. New deprecation policy ~~~~~~~~~~~~~~~~~~~~~~ -Starting with Pandas 1.0.0, pandas will adopt a variant of `SemVer`_ to +Starting with pandas 1.0.0, pandas will adopt a variant of `SemVer`_ to version releases. Briefly, * Deprecations will be introduced in minor releases (e.g. 1.1.0, 1.2.0, 2.1.0, ...) @@ -676,7 +676,7 @@ depending on how the results are cast back to the original dtype. Increased minimum version for Python ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas 1.0.0 supports Python 3.6.1 and higher (:issue:`29212`). +pandas 1.0.0 supports Python 3.6.1 and higher (:issue:`29212`). .. _whatsnew_100.api_breaking.deps: @@ -749,7 +749,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Build changes ^^^^^^^^^^^^^ -Pandas has added a `pyproject.toml `_ file and will no longer include +pandas has added a `pyproject.toml `_ file and will no longer include cythonized files in the source distribution uploaded to PyPI (:issue:`28341`, :issue:`20775`). If you're installing a built distribution (wheel) or via conda, this shouldn't have any effect on you. If you're building pandas from source, you should no longer need to install Cython into your build environment before calling ``pip install pandas``. @@ -763,7 +763,7 @@ Other API changes - :class:`core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`) - :meth:`pandas.api.types.infer_dtype` will now return "integer-na" for integer and ``np.nan`` mix (:issue:`27283`) - :meth:`MultiIndex.from_arrays` will no longer infer names from arrays if ``names=None`` is explicitly provided (:issue:`27292`) -- In order to improve tab-completion, Pandas does not include most deprecated attributes when introspecting a pandas object using ``dir`` (e.g. ``dir(df)``). +- In order to improve tab-completion, pandas does not include most deprecated attributes when introspecting a pandas object using ``dir`` (e.g. ``dir(df)``). To see which attributes are excluded, see an object's ``_deprecations`` attribute, for example ``pd.DataFrame._deprecations`` (:issue:`28805`). - The returned dtype of :func:`unique` now matches the input dtype. (:issue:`27874`) - Changed the default configuration value for ``options.matplotlib.register_converters`` from ``True`` to ``"auto"`` (:issue:`18720`). 
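One of the v1.0.0 API changes listed above, :meth:`MultiIndex.from_arrays` no
longer inferring names when ``names=None`` is passed explicitly, is easy to
demonstrate (a sketch; the Series names are invented):

.. code-block:: python

    import pandas as pd

    letters = pd.Series(["x", "y"], name="letters")
    numbers = pd.Series([1, 2], name="numbers")

    pd.MultiIndex.from_arrays([letters, numbers]).names              # inferred
    pd.MultiIndex.from_arrays([letters, numbers], names=None).names  # (None, None)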
diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index acf1dafc59885..af714b1bb2ab1 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -16,7 +16,7 @@ Enhancements Added support for new Python version ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas 1.1.3 now supports Python 3.9 (:issue:`36296`). +pandas 1.1.3 now supports Python 3.9 (:issue:`36296`). Development Changes ^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ff16efc17617f..4c58b9923c89a 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -33,7 +33,7 @@ By default, duplicates continue to be allowed pd.Series([1, 2], index=['a', 'a']).set_flags(allows_duplicate_labels=False) -Pandas will propagate the ``allows_duplicate_labels`` property through many operations. +pandas will propagate the ``allows_duplicate_labels`` property through many operations. .. ipython:: python :okexcept: @@ -175,7 +175,7 @@ Other enhancements Increased minimum version for Python ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas 1.2.0 supports Python 3.7.1 and higher (:issue:`35214`). +pandas 1.2.0 supports Python 3.7.1 and higher (:issue:`35214`). .. _whatsnew_120.api_breaking.deps: From 54f270cd0318fc16adc3f5720c3c2c833dfb4d47 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 4 Oct 2020 23:39:45 +0100 Subject: [PATCH 16/38] TYP: update setup.cfg (#36854) --- setup.cfg | 6 ------ 1 file changed, 6 deletions(-) diff --git a/setup.cfg b/setup.cfg index 8702e903d825b..3279a485c9bf3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -208,9 +208,6 @@ check_untyped_defs=False [mypy-pandas.core.indexes.multi] check_untyped_defs=False -[mypy-pandas.core.indexes.period] -check_untyped_defs=False - [mypy-pandas.core.indexes.range] check_untyped_defs=False @@ -244,9 +241,6 @@ check_untyped_defs=False [mypy-pandas.core.series] check_untyped_defs=False -[mypy-pandas.core.strings] -check_untyped_defs=False - [mypy-pandas.core.window.common] check_untyped_defs=False From 7587c94c730e56d787242dbb0ffb8e0d0378b29f Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sun, 4 Oct 2020 17:41:01 -0500 Subject: [PATCH 17/38] CI: Update error message for np_dev (#36864) * CI: Update error message for np_dev * Comma * Fix --- pandas/tests/series/indexing/test_indexing.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 1fafdf00393e1..fbdac2bb2d8e8 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -367,14 +367,17 @@ def test_2d_to_1d_assignment_raises(): x = np.random.randn(2, 2) y = pd.Series(range(2)) - msg = ( - r"shape mismatch: value array of shape \(2,2\) could not be " - r"broadcast to indexing result of shape \(2,\)" + msg = "|".join( + [ + r"shape mismatch: value array of shape \(2,2\) could not be " + r"broadcast to indexing result of shape \(2,\)", + r"cannot reshape array of size 4 into shape \(2,\)", + ] ) with pytest.raises(ValueError, match=msg): y.loc[range(2)] = x - msg = r"could not broadcast input array from shape \(2,2\) into shape \(2\)" + msg = r"could not broadcast input array from shape \(2,2\) into shape \(2,?\)" with pytest.raises(ValueError, match=msg): y.loc[:] = x From e5dcd9890e73c0e40e0950a89d7fe05b574b4fc2 Mon Sep 17 00:00:00 2001 From: Maria-Alexandra Ilie 
<30919494+maria-ilie@users.noreply.github.com> Date: Sun, 4 Oct 2020 19:07:28 -0700 Subject: [PATCH 18/38] DOC: ran blacken docs tool and checked output to improve formatting #36777 (#36802) --- doc/source/user_guide/10min.rst | 10 +- doc/source/user_guide/advanced.rst | 251 +++++++------ doc/source/user_guide/basics.rst | 564 ++++++++++++++++------------- setup.cfg | 1 + 4 files changed, 444 insertions(+), 382 deletions(-) diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 8270b2ee49bd8..08f83a4674ada 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -667,9 +667,10 @@ pandas can include categorical data in a :class:`DataFrame`. For full docs, see .. ipython:: python - df = pd.DataFrame( - {"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]} - ) + df = pd.DataFrame( + {"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]} + ) + Convert the raw grades to a categorical data type. @@ -718,7 +719,8 @@ We use the standard convention for referencing the matplotlib API: .. ipython:: python import matplotlib.pyplot as plt - plt.close('all') + + plt.close("all") .. ipython:: python diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 8cd35e94ae743..cec777e0f021e 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -62,12 +62,14 @@ demonstrate different ways to initialize MultiIndexes. .. ipython:: python - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] tuples = list(zip(*arrays)) tuples - index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) + index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) index s = pd.Series(np.random.randn(8), index=index) @@ -78,8 +80,8 @@ to use the :meth:`MultiIndex.from_product` method: .. ipython:: python - iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']] - pd.MultiIndex.from_product(iterables, names=['first', 'second']) + iterables = [["bar", "baz", "foo", "qux"], ["one", "two"]] + pd.MultiIndex.from_product(iterables, names=["first", "second"]) You can also construct a ``MultiIndex`` from a ``DataFrame`` directly, using the method :meth:`MultiIndex.from_frame`. This is a complementary method to @@ -89,9 +91,10 @@ the method :meth:`MultiIndex.from_frame`. This is a complementary method to .. ipython:: python - df = pd.DataFrame([['bar', 'one'], ['bar', 'two'], - ['foo', 'one'], ['foo', 'two']], - columns=['first', 'second']) + df = pd.DataFrame( + [["bar", "one"], ["bar", "two"], ["foo", "one"], ["foo", "two"]], + columns=["first", "second"], + ) pd.MultiIndex.from_frame(df) As a convenience, you can pass a list of arrays directly into ``Series`` or @@ -99,8 +102,10 @@ As a convenience, you can pass a list of arrays directly into ``Series`` or .. ipython:: python - arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']), - np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])] + arrays = [ + np.array(["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"]), + np.array(["one", "two", "one", "two", "one", "two", "one", "two"]), + ] s = pd.Series(np.random.randn(8), index=arrays) s df = pd.DataFrame(np.random.randn(8, 4), index=arrays) @@ -119,7 +124,7 @@ of the index is up to you: .. 
ipython:: python - df = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=index) + df = pd.DataFrame(np.random.randn(3, 8), index=["A", "B", "C"], columns=index) df pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6]) @@ -129,7 +134,7 @@ bit easier on the eyes. Note that how the index is displayed can be controlled u .. ipython:: python - with pd.option_context('display.multi_sparse', False): + with pd.option_context("display.multi_sparse", False): df It's worth keeping in mind that there's nothing preventing you from using @@ -157,7 +162,7 @@ location at a particular level: .. ipython:: python index.get_level_values(0) - index.get_level_values('second') + index.get_level_values("second") Basic indexing on axis with MultiIndex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -169,10 +174,10 @@ completely analogous way to selecting a column in a regular DataFrame: .. ipython:: python - df['bar'] - df['bar', 'one'] - df['bar']['one'] - s['qux'] + df["bar"] + df["bar", "one"] + df["bar"]["one"] + s["qux"] See :ref:`Cross-section with hierarchical index ` for how to select on a deeper level. @@ -190,7 +195,7 @@ For example:   df.columns.levels # original MultiIndex - df[['foo','qux']].columns.levels # sliced + df[["foo","qux"]].columns.levels # sliced This is done to avoid a recomputation of the levels in order to make slicing highly performant. If you want to see only the used levels, you can use the @@ -198,17 +203,17 @@ highly performant. If you want to see only the used levels, you can use the .. ipython:: python - df[['foo', 'qux']].columns.to_numpy() + df[["foo", "qux"]].columns.to_numpy() # for a specific level - df[['foo', 'qux']].columns.get_level_values(0) + df[["foo", "qux"]].columns.get_level_values(0) To reconstruct the ``MultiIndex`` with only the used levels, the :meth:`~MultiIndex.remove_unused_levels` method may be used. .. ipython:: python - new_mi = df[['foo', 'qux']].columns.remove_unused_levels() + new_mi = df[["foo", "qux"]].columns.remove_unused_levels() new_mi.levels Data alignment and using ``reindex`` @@ -229,7 +234,7 @@ called with another ``MultiIndex``, or even a list or array of tuples: .. ipython:: python s.reindex(index[:3]) - s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')]) + s.reindex([("foo", "two"), ("bar", "one"), ("qux", "one"), ("baz", "one")]) .. _advanced.advanced_hierarchical: @@ -244,7 +249,7 @@ keys take the form of tuples. For example, the following works as you would expe df = df.T df - df.loc[('bar', 'two')] + df.loc[("bar", "two")] Note that ``df.loc['bar', 'two']`` would also work in this example, but this shorthand notation can lead to ambiguity in general. @@ -254,7 +259,7 @@ like this: .. ipython:: python - df.loc[('bar', 'two'), 'A'] + df.loc[("bar", "two"), "A"] You don't have to specify all levels of the ``MultiIndex`` by passing only the first elements of the tuple. For example, you can use "partial" indexing to @@ -262,7 +267,7 @@ get all elements with ``bar`` in the first level as follows: .. ipython:: python - df.loc['bar'] + df.loc["bar"] This is a shortcut for the slightly more verbose notation ``df.loc[('bar',),]`` (equivalent to ``df.loc['bar',]`` in this example). @@ -271,20 +276,20 @@ to ``df.loc['bar',]`` in this example). .. ipython:: python - df.loc['baz':'foo'] + df.loc["baz":"foo"] You can slice with a 'range' of values, by providing a slice of tuples. .. 
ipython:: python - df.loc[('baz', 'two'):('qux', 'one')] - df.loc[('baz', 'two'):'foo'] + df.loc[("baz", "two"):("qux", "one")] + df.loc[("baz", "two"):"foo"] Passing a list of labels or tuples works similar to reindexing: .. ipython:: python - df.loc[[('bar', 'two'), ('qux', 'one')]] + df.loc[[("bar", "two"), ("qux", "one")]] .. note:: @@ -298,8 +303,9 @@ whereas a tuple of lists refer to several values within a level: .. ipython:: python - s = pd.Series([1, 2, 3, 4, 5, 6], - index=pd.MultiIndex.from_product([["A", "B"], ["c", "d", "e"]])) + s = pd.Series( + [1, 2, 3, 4, 5, 6], index=pd.MultiIndex.from_product([["A", "B"], ["c", "d", "e"]]) + ) s.loc[[("A", "c"), ("B", "d")]] # list of tuples s.loc[(["A", "B"], ["c", "d"])] # tuple of lists @@ -329,37 +335,44 @@ As usual, **both sides** of the slicers are included as this is label indexing. .. code-block:: python - df.loc[(slice('A1', 'A3'), ...), :] # noqa: E999 + df.loc[(slice("A1", "A3"), ...), :] # noqa: E999   You should **not** do this:   .. code-block:: python - df.loc[(slice('A1', 'A3'), ...)] # noqa: E999 + df.loc[(slice("A1", "A3"), ...)] # noqa: E999 .. ipython:: python def mklbl(prefix, n): return ["%s%s" % (prefix, i) for i in range(n)] - miindex = pd.MultiIndex.from_product([mklbl('A', 4), - mklbl('B', 2), - mklbl('C', 4), - mklbl('D', 2)]) - micolumns = pd.MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) - dfmi = pd.DataFrame(np.arange(len(miindex) * len(micolumns)) - .reshape((len(miindex), len(micolumns))), - index=miindex, - columns=micolumns).sort_index().sort_index(axis=1) + + miindex = pd.MultiIndex.from_product( + [mklbl("A", 4), mklbl("B", 2), mklbl("C", 4), mklbl("D", 2)] + ) + micolumns = pd.MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], names=["lvl0", "lvl1"] + ) + dfmi = ( + pd.DataFrame( + np.arange(len(miindex) * len(micolumns)).reshape( + (len(miindex), len(micolumns)) + ), + index=miindex, + columns=micolumns, + ) + .sort_index() + .sort_index(axis=1) + ) dfmi Basic MultiIndex slicing using slices, lists, and labels. .. ipython:: python - dfmi.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] + dfmi.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :] You can use :class:`pandas.IndexSlice` to facilitate a more natural syntax @@ -368,36 +381,36 @@ using ``:``, rather than using ``slice(None)``. .. ipython:: python idx = pd.IndexSlice - dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']] + dfmi.loc[idx[:, :, ["C1", "C3"]], idx[:, "foo"]] It is possible to perform quite complicated selections using this method on multiple axes at the same time. .. ipython:: python - dfmi.loc['A1', (slice(None), 'foo')] - dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']] + dfmi.loc["A1", (slice(None), "foo")] + dfmi.loc[idx[:, :, ["C1", "C3"]], idx[:, "foo"]] Using a boolean indexer you can provide selection related to the *values*. .. ipython:: python - mask = dfmi[('a', 'foo')] > 200 - dfmi.loc[idx[mask, :, ['C1', 'C3']], idx[:, 'foo']] + mask = dfmi[("a", "foo")] > 200 + dfmi.loc[idx[mask, :, ["C1", "C3"]], idx[:, "foo"]] You can also specify the ``axis`` argument to ``.loc`` to interpret the passed slicers on a single axis. .. ipython:: python - dfmi.loc(axis=0)[:, :, ['C1', 'C3']] + dfmi.loc(axis=0)[:, :, ["C1", "C3"]] Furthermore, you can *set* the values using the following methods. .. 
ipython:: python df2 = dfmi.copy() - df2.loc(axis=0)[:, :, ['C1', 'C3']] = -10 + df2.loc(axis=0)[:, :, ["C1", "C3"]] = -10 df2 You can use a right-hand-side of an alignable object as well. @@ -405,7 +418,7 @@ You can use a right-hand-side of an alignable object as well. .. ipython:: python df2 = dfmi.copy() - df2.loc[idx[:, :, ['C1', 'C3']], :] = df2 * 1000 + df2.loc[idx[:, :, ["C1", "C3"]], :] = df2 * 1000 df2 .. _advanced.xs: @@ -419,12 +432,12 @@ selecting data at a particular level of a ``MultiIndex`` easier. .. ipython:: python df - df.xs('one', level='second') + df.xs("one", level="second") .. ipython:: python # using the slicers - df.loc[(slice(None), 'one'), :] + df.loc[(slice(None), "one"), :] You can also select on the columns with ``xs``, by providing the axis argument. @@ -432,36 +445,36 @@ providing the axis argument. .. ipython:: python df = df.T - df.xs('one', level='second', axis=1) + df.xs("one", level="second", axis=1) .. ipython:: python # using the slicers - df.loc[:, (slice(None), 'one')] + df.loc[:, (slice(None), "one")] ``xs`` also allows selection with multiple keys. .. ipython:: python - df.xs(('one', 'bar'), level=('second', 'first'), axis=1) + df.xs(("one", "bar"), level=("second", "first"), axis=1) .. ipython:: python # using the slicers - df.loc[:, ('bar', 'one')] + df.loc[:, ("bar", "one")] You can pass ``drop_level=False`` to ``xs`` to retain the level that was selected. .. ipython:: python - df.xs('one', level='second', axis=1, drop_level=False) + df.xs("one", level="second", axis=1, drop_level=False) Compare the above with the result using ``drop_level=True`` (the default value). .. ipython:: python - df.xs('one', level='second', axis=1, drop_level=True) + df.xs("one", level="second", axis=1, drop_level=True) .. ipython:: python :suppress: @@ -479,8 +492,9 @@ values across a level. For instance: .. ipython:: python - midx = pd.MultiIndex(levels=[['zero', 'one'], ['x', 'y']], - codes=[[1, 1, 0, 0], [1, 0, 1, 0]]) + midx = pd.MultiIndex( + levels=[["zero", "one"], ["x", "y"]], codes=[[1, 1, 0, 0], [1, 0, 1, 0]] + ) df = pd.DataFrame(np.random.randn(4, 2), index=midx) df df2 = df.mean(level=0) @@ -543,7 +557,7 @@ used to move the values from the ``MultiIndex`` to a column. .. ipython:: python - df.rename_axis(index=['abc', 'def']) + df.rename_axis(index=["abc", "def"]) Note that the columns of a ``DataFrame`` are an index, so that using ``rename_axis`` with the ``columns`` argument will change the name of that @@ -561,7 +575,7 @@ When working with an ``Index`` object directly, rather than via a ``DataFrame``, .. ipython:: python - mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y']) + mi = pd.MultiIndex.from_product([[1, 2], ["a", "b"]], names=["x", "y"]) mi.names mi2 = mi.rename("new name", level=0) @@ -586,6 +600,7 @@ they need to be sorted. As with any index, you can use :meth:`~DataFrame.sort_in .. ipython:: python import random + random.shuffle(tuples) s = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples)) s @@ -600,9 +615,9 @@ are named. .. ipython:: python - s.index.set_names(['L1', 'L2'], inplace=True) - s.sort_index(level='L1') - s.sort_index(level='L2') + s.index.set_names(["L1", "L2"], inplace=True) + s.sort_index(level="L1") + s.sort_index(level="L2") On higher dimensional objects, you can sort any of the other axes by level if they have a ``MultiIndex``: @@ -617,10 +632,10 @@ return a copy of the data rather than a view: .. 
ipython:: python - dfm = pd.DataFrame({'jim': [0, 0, 1, 1], - 'joe': ['x', 'x', 'z', 'y'], - 'jolie': np.random.rand(4)}) - dfm = dfm.set_index(['jim', 'joe']) + dfm = pd.DataFrame( + {"jim": [0, 0, 1, 1], "joe": ["x", "x", "z", "y"], "jolie": np.random.rand(4)} + ) + dfm = dfm.set_index(["jim", "joe"]) dfm .. code-block:: ipython @@ -661,7 +676,7 @@ And now selection works as expected. .. ipython:: python - dfm.loc[(0, 'y'):(1, 'z')] + dfm.loc[(0, "y"):(1, "z")] Take methods ------------ @@ -754,18 +769,18 @@ and allows efficient indexing and storage of an index with a large number of dup .. ipython:: python from pandas.api.types import CategoricalDtype - df = pd.DataFrame({'A': np.arange(6), - 'B': list('aabbca')}) - df['B'] = df['B'].astype(CategoricalDtype(list('cab'))) + + df = pd.DataFrame({"A": np.arange(6), "B": list("aabbca")}) + df["B"] = df["B"].astype(CategoricalDtype(list("cab"))) df df.dtypes - df['B'].cat.categories + df["B"].cat.categories Setting the index will create a ``CategoricalIndex``. .. ipython:: python - df2 = df.set_index('B') + df2 = df.set_index("B") df2.index Indexing with ``__getitem__/.iloc/.loc`` works similarly to an ``Index`` with duplicates. @@ -773,13 +788,13 @@ The indexers **must** be in the category or the operation will raise a ``KeyErro .. ipython:: python - df2.loc['a'] + df2.loc["a"] The ``CategoricalIndex`` is **preserved** after indexing: .. ipython:: python - df2.loc['a'].index + df2.loc["a"].index Sorting the index will sort by the order of the categories (recall that we created the index with ``CategoricalDtype(list('cab'))``, so the sorted @@ -804,17 +819,16 @@ values **not** in the categories, similarly to how you can reindex **any** panda .. ipython:: python - df3 = pd.DataFrame({'A': np.arange(3), - 'B': pd.Series(list('abc')).astype('category')}) - df3 = df3.set_index('B') + df3 = pd.DataFrame({"A": np.arange(3), "B": pd.Series(list("abc")).astype("category")}) + df3 = df3.set_index("B") df3 .. ipython:: python - df3.reindex(['a', 'e']) - df3.reindex(['a', 'e']).index - df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))) - df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))).index + df3.reindex(["a", "e"]) + df3.reindex(["a", "e"]).index + df3.reindex(pd.Categorical(["a", "e"], categories=list("abe"))) + df3.reindex(pd.Categorical(["a", "e"], categories=list("abe"))).index .. warning:: @@ -823,16 +837,14 @@ values **not** in the categories, similarly to how you can reindex **any** panda .. ipython:: python - df4 = pd.DataFrame({'A': np.arange(2), - 'B': list('ba')}) - df4['B'] = df4['B'].astype(CategoricalDtype(list('ab'))) - df4 = df4.set_index('B') + df4 = pd.DataFrame({"A": np.arange(2), "B": list("ba")}) + df4["B"] = df4["B"].astype(CategoricalDtype(list("ab"))) + df4 = df4.set_index("B") df4.index - df5 = pd.DataFrame({'A': np.arange(2), - 'B': list('bc')}) - df5['B'] = df5['B'].astype(CategoricalDtype(list('bc'))) - df5 = df5.set_index('B') + df5 = pd.DataFrame({"A": np.arange(2), "B": list("bc")}) + df5["B"] = df5["B"].astype(CategoricalDtype(list("bc"))) + df5 = df5.set_index("B") df5.index .. code-block:: ipython @@ -916,12 +928,16 @@ example, be millisecond offsets. .. 
ipython:: python - dfir = pd.concat([pd.DataFrame(np.random.randn(5, 2), - index=np.arange(5) * 250.0, - columns=list('AB')), - pd.DataFrame(np.random.randn(6, 2), - index=np.arange(4, 10) * 250.1, - columns=list('AB'))]) + dfir = pd.concat( + [ + pd.DataFrame( + np.random.randn(5, 2), index=np.arange(5) * 250.0, columns=list("AB") + ), + pd.DataFrame( + np.random.randn(6, 2), index=np.arange(4, 10) * 250.1, columns=list("AB") + ), + ] + ) dfir Selection operations then will always work on a value basis, for all selection operators. @@ -929,7 +945,7 @@ Selection operations then will always work on a value basis, for all selection o .. ipython:: python dfir[0:1000.4] - dfir.loc[0:1001, 'A'] + dfir.loc[0:1001, "A"] dfir.loc[1000.4] You could retrieve the first 1 second (1000 ms) of data as such: @@ -963,8 +979,9 @@ An ``IntervalIndex`` can be used in ``Series`` and in ``DataFrame`` as the index .. ipython:: python - df = pd.DataFrame({'A': [1, 2, 3, 4]}, - index=pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])) + df = pd.DataFrame( + {"A": [1, 2, 3, 4]}, index=pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4]) + ) df Label based indexing via ``.loc`` along the edges of an interval works as you would expect, @@ -1041,9 +1058,9 @@ datetime-like intervals: pd.interval_range(start=0, end=5) - pd.interval_range(start=pd.Timestamp('2017-01-01'), periods=4) + pd.interval_range(start=pd.Timestamp("2017-01-01"), periods=4) - pd.interval_range(end=pd.Timedelta('3 days'), periods=3) + pd.interval_range(end=pd.Timedelta("3 days"), periods=3) The ``freq`` parameter can used to specify non-default frequencies, and can utilize a variety of :ref:`frequency aliases ` with datetime-like intervals: @@ -1052,18 +1069,18 @@ of :ref:`frequency aliases ` with datetime-like inter pd.interval_range(start=0, periods=5, freq=1.5) - pd.interval_range(start=pd.Timestamp('2017-01-01'), periods=4, freq='W') + pd.interval_range(start=pd.Timestamp("2017-01-01"), periods=4, freq="W") - pd.interval_range(start=pd.Timedelta('0 days'), periods=3, freq='9H') + pd.interval_range(start=pd.Timedelta("0 days"), periods=3, freq="9H") Additionally, the ``closed`` parameter can be used to specify which side(s) the intervals are closed on. Intervals are closed on the right side by default. .. ipython:: python - pd.interval_range(start=0, end=4, closed='both') + pd.interval_range(start=0, end=4, closed="both") - pd.interval_range(start=0, end=4, closed='neither') + pd.interval_range(start=0, end=4, closed="neither") Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced intervals from ``start`` to ``end`` inclusively, with ``periods`` number of elements @@ -1073,8 +1090,7 @@ in the resulting ``IntervalIndex``: pd.interval_range(start=0, end=6, periods=4) - pd.interval_range(pd.Timestamp('2018-01-01'), - pd.Timestamp('2018-02-28'), periods=3) + pd.interval_range(pd.Timestamp("2018-01-01"), pd.Timestamp("2018-02-28"), periods=3) Miscellaneous indexing FAQ -------------------------- @@ -1112,7 +1128,7 @@ normal Python ``list``. Monotonicity of an index can be tested with the :meth:`~ .. ipython:: python - df = pd.DataFrame(index=[2, 3, 3, 4, 5], columns=['data'], data=list(range(5))) + df = pd.DataFrame(index=[2, 3, 3, 4, 5], columns=["data"], data=list(range(5))) df.index.is_monotonic_increasing # no rows 0 or 1, but still returns rows 2, 3 (both of them), and 4: @@ -1126,8 +1142,7 @@ On the other hand, if the index is not monotonic, then both slice bounds must be .. 
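ipython:: python

    # A complementary sketch (hypothetical frame): with a monotonic index,
    # slice bounds may even be absent from the index itself.
    dfo = pd.DataFrame({"data": range(4)}, index=[1, 2, 5, 9])
    dfo.index.is_monotonic_increasing
    dfo.loc[3:8]

.. 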
ipython:: python - df = pd.DataFrame(index=[2, 3, 1, 4, 3, 5], - columns=['data'], data=list(range(6))) + df = pd.DataFrame(index=[2, 3, 1, 4, 3, 5], columns=["data"], data=list(range(6))) df.index.is_monotonic_increasing # OK because 2 and 4 are in the index @@ -1149,7 +1164,7 @@ the :meth:`~Index.is_unique` attribute. .. ipython:: python - weakly_monotonic = pd.Index(['a', 'b', 'c', 'c']) + weakly_monotonic = pd.Index(["a", "b", "c", "c"]) weakly_monotonic weakly_monotonic.is_monotonic_increasing weakly_monotonic.is_monotonic_increasing & weakly_monotonic.is_unique @@ -1167,7 +1182,7 @@ consider the following ``Series``: .. ipython:: python - s = pd.Series(np.random.randn(6), index=list('abcdef')) + s = pd.Series(np.random.randn(6), index=list("abcdef")) s Suppose we wished to slice from ``c`` to ``e``, using integers this would be @@ -1190,7 +1205,7 @@ slicing include both endpoints: .. ipython:: python - s.loc['c':'e'] + s.loc["c":"e"] This is most definitely a "practicality beats purity" sort of thing, but it is something to watch out for if you expect label-based slicing to behave exactly diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 5fa214d2ed389..8c01913e55318 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -12,10 +12,9 @@ the :ref:`10 minutes to pandas <10min>` section: .. ipython:: python - index = pd.date_range('1/1/2000', periods=8) - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) - df = pd.DataFrame(np.random.randn(8, 3), index=index, - columns=['A', 'B', 'C']) + index = pd.date_range("1/1/2000", periods=8) + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) + df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"]) .. _basics.head_tail: @@ -97,7 +96,7 @@ Timezones may be preserved with ``dtype=object`` .. ipython:: python - ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) + ser = pd.Series(pd.date_range("2000", periods=2, tz="CET")) ser.to_numpy(dtype=object) Or thrown away with ``dtype='datetime64[ns]'`` @@ -174,8 +173,8 @@ These are both enabled to be used by default, you can control this by setting th .. code-block:: python - pd.set_option('compute.use_bottleneck', False) - pd.set_option('compute.use_numexpr', False) + pd.set_option("compute.use_bottleneck", False) + pd.set_option("compute.use_numexpr", False) .. _basics.binop: @@ -204,18 +203,21 @@ either match on the *index* or *columns* via the **axis** keyword: .. ipython:: python - df = pd.DataFrame({ - 'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']), - 'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']), - 'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])}) + df = pd.DataFrame( + { + "one": pd.Series(np.random.randn(3), index=["a", "b", "c"]), + "two": pd.Series(np.random.randn(4), index=["a", "b", "c", "d"]), + "three": pd.Series(np.random.randn(3), index=["b", "c", "d"]), + } + ) df row = df.iloc[1] - column = df['two'] + column = df["two"] - df.sub(row, axis='columns') + df.sub(row, axis="columns") df.sub(row, axis=1) - df.sub(column, axis='index') + df.sub(column, axis="index") df.sub(column, axis=0) .. ipython:: python @@ -228,10 +230,10 @@ Furthermore you can align a level of a MultiIndexed DataFrame with a Series. .. 
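ipython:: python

    # The plain single-level case first, as a warm-up sketch with
    # hypothetical data: a row broadcasts across ``axis="columns"`` and a
    # column across ``axis="index"``.
    dfb = pd.DataFrame(np.arange(6).reshape(3, 2), columns=["x", "y"])
    dfb.sub(dfb.iloc[0], axis="columns")
    dfb.sub(dfb["x"], axis="index")

.. 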
ipython:: python dfmi = df.copy() - dfmi.index = pd.MultiIndex.from_tuples([(1, 'a'), (1, 'b'), - (1, 'c'), (2, 'a')], - names=['first', 'second']) - dfmi.sub(column, axis=0, level='second') + dfmi.index = pd.MultiIndex.from_tuples( + [(1, "a"), (1, "b"), (1, "c"), (2, "a")], names=["first", "second"] + ) + dfmi.sub(column, axis=0, level="second") Series and Index also support the :func:`divmod` builtin. This function takes the floor division and modulo operation at the same time returning a two-tuple @@ -273,7 +275,7 @@ using ``fillna`` if you wish). :suppress: df2 = df.copy() - df2['three']['a'] = 1. + df2["three"]["a"] = 1.0 .. ipython:: python @@ -325,7 +327,7 @@ You can test if a pandas object is empty, via the :attr:`~DataFrame.empty` prope .. ipython:: python df.empty - pd.DataFrame(columns=list('ABC')).empty + pd.DataFrame(columns=list("ABC")).empty To evaluate single-element pandas objects in a boolean context, use the method :meth:`~DataFrame.bool`: @@ -394,8 +396,8 @@ equality to be True: .. ipython:: python - df1 = pd.DataFrame({'col': ['foo', 0, np.nan]}) - df2 = pd.DataFrame({'col': [np.nan, 0, 'foo']}, index=[2, 1, 0]) + df1 = pd.DataFrame({"col": ["foo", 0, np.nan]}) + df2 = pd.DataFrame({"col": [np.nan, 0, "foo"]}, index=[2, 1, 0]) df1.equals(df2) df1.equals(df2.sort_index()) @@ -407,16 +409,16 @@ data structure with a scalar value: .. ipython:: python - pd.Series(['foo', 'bar', 'baz']) == 'foo' - pd.Index(['foo', 'bar', 'baz']) == 'foo' + pd.Series(["foo", "bar", "baz"]) == "foo" + pd.Index(["foo", "bar", "baz"]) == "foo" pandas also handles element-wise comparisons between different array-like objects of the same length: .. ipython:: python - pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux']) - pd.Series(['foo', 'bar', 'baz']) == np.array(['foo', 'bar', 'qux']) + pd.Series(["foo", "bar", "baz"]) == pd.Index(["foo", "bar", "qux"]) + pd.Series(["foo", "bar", "baz"]) == np.array(["foo", "bar", "qux"]) Trying to compare ``Index`` or ``Series`` objects of different lengths will raise a ValueError: @@ -458,10 +460,12 @@ which we illustrate: .. ipython:: python - df1 = pd.DataFrame({'A': [1., np.nan, 3., 5., np.nan], - 'B': [np.nan, 2., 3., np.nan, 6.]}) - df2 = pd.DataFrame({'A': [5., 2., 4., np.nan, 3., 7.], - 'B': [np.nan, np.nan, 3., 4., 6., 8.]}) + df1 = pd.DataFrame( + {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]} + ) + df2 = pd.DataFrame( + {"A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0], "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0]} + ) df1 df2 df1.combine_first(df2) @@ -480,6 +484,8 @@ So, for instance, to reproduce :meth:`~DataFrame.combine_first` as above: def combiner(x, y): return np.where(pd.isna(x), y, x) + + df1.combine(df2, combiner) .. _basics.stats: @@ -570,8 +576,8 @@ will exclude NAs on Series input by default: .. ipython:: python - np.mean(df['one']) - np.mean(df['one'].to_numpy()) + np.mean(df["one"]) + np.mean(df["one"].to_numpy()) :meth:`Series.nunique` will return the number of unique non-NA values in a Series: @@ -597,8 +603,7 @@ course): series = pd.Series(np.random.randn(1000)) series[::2] = np.nan series.describe() - frame = pd.DataFrame(np.random.randn(1000, 5), - columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"]) frame.iloc[::2] = np.nan frame.describe() @@ -606,7 +611,7 @@ You can select specific percentiles to include in the output: .. 
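ipython:: python

    # A tiny sketch (hypothetical Series): the median (50%) is reported even
    # when it is not requested explicitly.
    sp = pd.Series(range(10))
    sp.describe(percentiles=[0.1, 0.9])

.. 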
ipython:: python - series.describe(percentiles=[.05, .25, .75, .95]) + series.describe(percentiles=[0.05, 0.25, 0.75, 0.95]) By default, the median is always included. @@ -615,7 +620,7 @@ summary of the number of unique values and most frequently occurring values: .. ipython:: python - s = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a']) + s = pd.Series(["a", "a", "b", "b", "a", "a", np.nan, "c", "d", "a"]) s.describe() Note that on a mixed-type DataFrame object, :meth:`~DataFrame.describe` will @@ -624,7 +629,7 @@ categorical columns: .. ipython:: python - frame = pd.DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)}) + frame = pd.DataFrame({"a": ["Yes", "Yes", "No", "No"], "b": range(4)}) frame.describe() This behavior can be controlled by providing a list of types as ``include``/``exclude`` @@ -632,9 +637,9 @@ arguments. The special value ``all`` can also be used: .. ipython:: python - frame.describe(include=['object']) - frame.describe(include=['number']) - frame.describe(include='all') + frame.describe(include=["object"]) + frame.describe(include=["number"]) + frame.describe(include="all") That feature relies on :ref:`select_dtypes `. Refer to there for details about accepted inputs. @@ -654,7 +659,7 @@ corresponding values: s1 s1.idxmin(), s1.idxmax() - df1 = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C']) + df1 = pd.DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"]) df1 df1.idxmin(axis=0) df1.idxmax(axis=1) @@ -665,9 +670,9 @@ matching index: .. ipython:: python - df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba')) + df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=["A"], index=list("edcba")) df3 - df3['A'].idxmin() + df3["A"].idxmin() .. note:: @@ -706,8 +711,9 @@ Similarly, you can get the most frequently occurring value(s), i.e. the mode, of s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7]) s5.mode() - df5 = pd.DataFrame({"A": np.random.randint(0, 7, size=50), - "B": np.random.randint(-10, 15, size=50)}) + df5 = pd.DataFrame( + {"A": np.random.randint(0, 7, size=50), "B": np.random.randint(-10, 15, size=50)} + ) df5.mode() @@ -732,7 +738,7 @@ normally distributed data into equal-size quartiles like so: .. ipython:: python arr = np.random.randn(30) - factor = pd.qcut(arr, [0, .25, .5, .75, 1]) + factor = pd.qcut(arr, [0, 0.25, 0.5, 0.75, 1]) factor pd.value_counts(factor) @@ -775,18 +781,20 @@ First some setup: """ Chicago, IL -> Chicago for city_name column """ - df['city_name'] = df['city_and_code'].str.split(",").str.get(0) + df["city_name"] = df["city_and_code"].str.split(",").str.get(0) return df + def add_country_name(df, country_name=None): """ Chicago -> Chicago-US for city_name column """ - col = 'city_name' - df['city_and_country'] = df[col] + country_name + col = "city_name" + df["city_and_country"] = df[col] + country_name return df - df_p = pd.DataFrame({'city_and_code': ['Chicago, IL']}) + + df_p = pd.DataFrame({"city_and_code": ["Chicago, IL"]}) ``extract_city_name`` and ``add_country_name`` are functions taking and returning ``DataFrames``. @@ -795,14 +803,13 @@ Now compare the following: .. ipython:: python - add_country_name(extract_city_name(df_p), country_name='US') + add_country_name(extract_city_name(df_p), country_name="US") Is equivalent to: .. ipython:: python - (df_p.pipe(extract_city_name) - .pipe(add_country_name, country_name="US")) + df_p.pipe(extract_city_name).pipe(add_country_name, country_name="US") pandas encourages the second style, which is known as method chaining. 
``pipe`` makes it easy to use your own or another library's functions
@@ -820,14 +827,15 @@ For example, we can fit a regression using statsmodels. Their API expects a form
 
     import statsmodels.formula.api as sm
 
-    bb = pd.read_csv('data/baseball.csv', index_col='id')
+    bb = pd.read_csv("data/baseball.csv", index_col="id")
 
-    (bb.query('h > 0')
-     .assign(ln_h=lambda df: np.log(df.h))
-     .pipe((sm.ols, 'data'), 'hr ~ ln_h + year + g + C(lg)')
-     .fit()
-     .summary()
-     )
+    (
+        bb.query("h > 0")
+        .assign(ln_h=lambda df: np.log(df.h))
+        .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)")
+        .fit()
+        .summary()
+    )
 
 The pipe method is inspired by unix pipes and more recently dplyr_ and magrittr_, which
 have introduced the popular ``(%>%)`` (read pipe) operator for R_.
@@ -858,8 +866,8 @@ The :meth:`~DataFrame.apply` method will also dispatch on a string method name.
 
 .. ipython:: python
 
-    df.apply('mean')
-    df.apply('mean', axis=1)
+    df.apply("mean")
+    df.apply("mean", axis=1)
 
 The return type of the function passed to :meth:`~DataFrame.apply` affects the
 type of the final output from ``DataFrame.apply`` for the default behaviour:
@@ -878,8 +886,11 @@ maximum value for each column occurred:
 
 .. ipython:: python
 
-    tsdf = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'],
-                        index=pd.date_range('1/1/2000', periods=1000))
+    tsdf = pd.DataFrame(
+        np.random.randn(1000, 3),
+        columns=["A", "B", "C"],
+        index=pd.date_range("1/1/2000", periods=1000),
+    )
     tsdf.apply(lambda x: x.idxmax())
 
 You may also pass additional arguments and keyword arguments to the :meth:`~DataFrame.apply`
@@ -902,8 +913,11 @@ Series operation on each column or row:
 
 .. ipython:: python
    :suppress:
 
-    tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'],
-                        index=pd.date_range('1/1/2000', periods=10))
+    tsdf = pd.DataFrame(
+        np.random.randn(10, 3),
+        columns=["A", "B", "C"],
+        index=pd.date_range("1/1/2000", periods=10),
+    )
     tsdf.iloc[3:7] = np.nan
 
 .. ipython:: python
@@ -933,8 +947,11 @@ We will use a similar starting frame from above:
 
 .. ipython:: python
 
-    tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'],
-                        index=pd.date_range('1/1/2000', periods=10))
+    tsdf = pd.DataFrame(
+        np.random.randn(10, 3),
+        columns=["A", "B", "C"],
+        index=pd.date_range("1/1/2000", periods=10),
+    )
     tsdf.iloc[3:7] = np.nan
     tsdf
 
@@ -946,7 +963,7 @@ output:
 
     tsdf.agg(np.sum)
 
-    tsdf.agg('sum')
+    tsdf.agg("sum")
 
     # these are equivalent to a ``.sum()`` because we are aggregating
     # on a single function
@@ -956,7 +973,7 @@ Single aggregations on a ``Series`` this will return a scalar value:
 
 .. ipython:: python
 
-    tsdf['A'].agg('sum')
+    tsdf["A"].agg("sum")
 
 
 Aggregating with multiple functions
 +++++++++++++++++++++++++++++++++++
@@ -968,25 +985,25 @@ These are naturally named from the aggregation function.
 
 .. ipython:: python
 
-    tsdf.agg(['sum'])
+    tsdf.agg(["sum"])
 
 Multiple functions yield multiple rows:
 
 .. ipython:: python
 
-    tsdf.agg(['sum', 'mean'])
+    tsdf.agg(["sum", "mean"])
 
 On a ``Series``, multiple functions return a ``Series``, indexed by the function names:
 
 .. ipython:: python
 
-    tsdf['A'].agg(['sum', 'mean'])
+    tsdf["A"].agg(["sum", "mean"])
 
 Passing a ``lambda`` function will yield a ``<lambda>`` named row:

.. 
ipython:: python - tsdf['A'].agg(['sum', lambda x: x.mean()]) + tsdf["A"].agg(["sum", lambda x: x.mean()]) Passing a named function will yield that name for the row: @@ -995,7 +1012,8 @@ Passing a named function will yield that name for the row: def mymean(x): return x.mean() - tsdf['A'].agg(['sum', mymean]) + + tsdf["A"].agg(["sum", mymean]) Aggregating with a dict +++++++++++++++++++++++ @@ -1006,7 +1024,7 @@ are not in any particular order, you can use an ``OrderedDict`` instead to guara .. ipython:: python - tsdf.agg({'A': 'mean', 'B': 'sum'}) + tsdf.agg({"A": "mean", "B": "sum"}) Passing a list-like will generate a ``DataFrame`` output. You will get a matrix-like output of all of the aggregators. The output will consist of all unique functions. Those that are @@ -1014,7 +1032,7 @@ not noted for a particular column will be ``NaN``: .. ipython:: python - tsdf.agg({'A': ['mean', 'min'], 'B': 'sum'}) + tsdf.agg({"A": ["mean", "min"], "B": "sum"}) .. _basics.aggregation.mixed_string: @@ -1026,15 +1044,19 @@ aggregations. This is similar to how ``.groupby.agg`` works. .. ipython:: python - mdf = pd.DataFrame({'A': [1, 2, 3], - 'B': [1., 2., 3.], - 'C': ['foo', 'bar', 'baz'], - 'D': pd.date_range('20130101', periods=3)}) + mdf = pd.DataFrame( + { + "A": [1, 2, 3], + "B": [1.0, 2.0, 3.0], + "C": ["foo", "bar", "baz"], + "D": pd.date_range("20130101", periods=3), + } + ) mdf.dtypes .. ipython:: python - mdf.agg(['min', 'sum']) + mdf.agg(["min", "sum"]) .. _basics.aggregation.custom_describe: @@ -1049,11 +1071,11 @@ to the built in :ref:`describe function `. from functools import partial q_25 = partial(pd.Series.quantile, q=0.25) - q_25.__name__ = '25%' + q_25.__name__ = "25%" q_75 = partial(pd.Series.quantile, q=0.75) - q_75.__name__ = '75%' + q_75.__name__ = "75%" - tsdf.agg(['count', 'mean', 'std', 'min', q_25, 'median', q_75, 'max']) + tsdf.agg(["count", "mean", "std", "min", q_25, "median", q_75, "max"]) .. _basics.transform: @@ -1068,8 +1090,11 @@ We create a frame similar to the one used in the above sections. .. ipython:: python - tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'], - index=pd.date_range('1/1/2000', periods=10)) + tsdf = pd.DataFrame( + np.random.randn(10, 3), + columns=["A", "B", "C"], + index=pd.date_range("1/1/2000", periods=10), + ) tsdf.iloc[3:7] = np.nan tsdf @@ -1080,7 +1105,7 @@ function name or a user defined function. :okwarning: tsdf.transform(np.abs) - tsdf.transform('abs') + tsdf.transform("abs") tsdf.transform(lambda x: x.abs()) Here :meth:`~DataFrame.transform` received a single function; this is equivalent to a `ufunc @@ -1094,7 +1119,7 @@ Passing a single function to ``.transform()`` with a ``Series`` will yield a sin .. ipython:: python - tsdf['A'].transform(np.abs) + tsdf["A"].transform(np.abs) Transform with multiple functions @@ -1113,7 +1138,7 @@ resulting column names will be the transforming functions. .. ipython:: python - tsdf['A'].transform([np.abs, lambda x: x + 1]) + tsdf["A"].transform([np.abs, lambda x: x + 1]) Transforming with a dict @@ -1124,7 +1149,7 @@ Passing a dict of functions will allow selective transforming per column. .. ipython:: python - tsdf.transform({'A': np.abs, 'B': lambda x: x + 1}) + tsdf.transform({"A": np.abs, "B": lambda x: x + 1}) Passing a dict of lists will generate a MultiIndexed DataFrame with these selective transforms. @@ -1132,7 +1157,7 @@ selective transforms. .. 
ipython:: python :okwarning: - tsdf.transform({'A': np.abs, 'B': [lambda x: x + 1, 'sqrt']}) + tsdf.transform({"A": np.abs, "B": [lambda x: x + 1, "sqrt"]}) .. _basics.elementwise: @@ -1153,10 +1178,12 @@ a single value and returning a single value. For example: df4 + def f(x): return len(str(x)) - df4['one'].map(f) + + df4["one"].map(f) df4.applymap(f) :meth:`Series.map` has an additional feature; it can be used to easily @@ -1165,9 +1192,8 @@ to :ref:`merging/joining functionality `: .. ipython:: python - s = pd.Series(['six', 'seven', 'six', 'seven', 'six'], - index=['a', 'b', 'c', 'd', 'e']) - t = pd.Series({'six': 6., 'seven': 7.}) + s = pd.Series(["six", "seven", "six", "seven", "six"], index=["a", "b", "c", "d", "e"]) + t = pd.Series({"six": 6.0, "seven": 7.0}) s s.map(t) @@ -1192,9 +1218,9 @@ Here is a simple example: .. ipython:: python - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) s - s.reindex(['e', 'b', 'f', 'd']) + s.reindex(["e", "b", "f", "d"]) Here, the ``f`` label was not contained in the Series and hence appears as ``NaN`` in the result. @@ -1204,13 +1230,13 @@ With a DataFrame, you can simultaneously reindex the index and columns: .. ipython:: python df - df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one']) + df.reindex(index=["c", "f", "b"], columns=["three", "two", "one"]) You may also use ``reindex`` with an ``axis`` keyword: .. ipython:: python - df.reindex(['c', 'f', 'b'], axis='index') + df.reindex(["c", "f", "b"], axis="index") Note that the ``Index`` objects containing the actual axis labels can be **shared** between objects. So if we have a Series and a DataFrame, the @@ -1230,8 +1256,8 @@ where you specify a single ``labels`` argument and the ``axis`` it applies to. .. ipython:: python - df.reindex(['c', 'f', 'b'], axis='index') - df.reindex(['three', 'two', 'one'], axis='columns') + df.reindex(["c", "f", "b"], axis="index") + df.reindex(["three", "two", "one"], axis="columns") .. seealso:: @@ -1261,7 +1287,7 @@ available to make this simpler: .. ipython:: python :suppress: - df2 = df.reindex(['a', 'b', 'c'], columns=['one', 'two']) + df2 = df.reindex(["a", "b", "c"], columns=["one", "two"]) df3 = df2 - df2.mean() @@ -1288,12 +1314,12 @@ It returns a tuple with both of the reindexed Series: .. ipython:: python - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) s1 = s[:4] s2 = s[1:] s1.align(s2) - s1.align(s2, join='inner') - s1.align(s2, join='left') + s1.align(s2, join="inner") + s1.align(s2, join="left") .. _basics.df_join: @@ -1302,13 +1328,13 @@ columns by default: .. ipython:: python - df.align(df2, join='inner') + df.align(df2, join="inner") You can also pass an ``axis`` option to only align on the specified axis: .. ipython:: python - df.align(df2, join='inner', axis=0) + df.align(df2, join="inner", axis=0) .. _basics.align.frame.series: @@ -1339,16 +1365,16 @@ We illustrate these fill methods on a simple Series: .. 
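ipython:: python

    # A compact integer-index variant (hypothetical values): ``ffill``
    # propagates the last valid observation onto the new labels.
    si = pd.Series([0.0, 3.0], index=[0, 3])
    si.reindex(range(5), method="ffill")

.. 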
ipython:: python - rng = pd.date_range('1/3/2000', periods=8) + rng = pd.date_range("1/3/2000", periods=8) ts = pd.Series(np.random.randn(8), index=rng) ts2 = ts[[0, 3, 6]] ts ts2 ts2.reindex(ts.index) - ts2.reindex(ts.index, method='ffill') - ts2.reindex(ts.index, method='bfill') - ts2.reindex(ts.index, method='nearest') + ts2.reindex(ts.index, method="ffill") + ts2.reindex(ts.index, method="bfill") + ts2.reindex(ts.index, method="nearest") These methods require that the indexes are **ordered** increasing or decreasing. @@ -1359,7 +1385,7 @@ Note that the same result could have been achieved using .. ipython:: python - ts2.reindex(ts.index).fillna(method='ffill') + ts2.reindex(ts.index).fillna(method="ffill") :meth:`~Series.reindex` will raise a ValueError if the index is not monotonically increasing or decreasing. :meth:`~Series.fillna` and :meth:`~Series.interpolate` @@ -1376,14 +1402,14 @@ matches: .. ipython:: python - ts2.reindex(ts.index, method='ffill', limit=1) + ts2.reindex(ts.index, method="ffill", limit=1) In contrast, tolerance specifies the maximum distance between the index and indexer values: .. ipython:: python - ts2.reindex(ts.index, method='ffill', tolerance='1 day') + ts2.reindex(ts.index, method="ffill", tolerance="1 day") Notice that when used on a ``DatetimeIndex``, ``TimedeltaIndex`` or ``PeriodIndex``, ``tolerance`` will coerced into a ``Timedelta`` if possible. @@ -1400,14 +1426,14 @@ It removes a set of labels from an axis: .. ipython:: python df - df.drop(['a', 'd'], axis=0) - df.drop(['one'], axis=1) + df.drop(["a", "d"], axis=0) + df.drop(["one"], axis=1) Note that the following also works, but is a bit less obvious / clean: .. ipython:: python - df.reindex(df.index.difference(['a', 'd'])) + df.reindex(df.index.difference(["a", "d"])) .. _basics.rename: @@ -1428,8 +1454,10 @@ Series can also be used: .. ipython:: python - df.rename(columns={'one': 'foo', 'two': 'bar'}, - index={'a': 'apple', 'b': 'banana', 'd': 'durian'}) + df.rename( + columns={"one": "foo", "two": "bar"}, + index={"a": "apple", "b": "banana", "d": "durian"}, + ) If the mapping doesn't include a column/index label, it isn't renamed. Note that extra labels in the mapping don't throw an error. @@ -1439,8 +1467,8 @@ you specify a single ``mapper`` and the ``axis`` to apply that mapping to. .. ipython:: python - df.rename({'one': 'foo', 'two': 'bar'}, axis='columns') - df.rename({'a': 'apple', 'b': 'banana', 'd': 'durian'}, axis='index') + df.rename({"one": "foo", "two": "bar"}, axis="columns") + df.rename({"a": "apple", "b": "banana", "d": "durian"}, axis="index") The :meth:`~DataFrame.rename` method also provides an ``inplace`` named @@ -1464,12 +1492,12 @@ labels). .. ipython:: python - df = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], - 'y': [10, 20, 30, 40, 50, 60]}, - index=pd.MultiIndex.from_product([['a', 'b', 'c'], [1, 2]], - names=['let', 'num'])) + df = pd.DataFrame( + {"x": [1, 2, 3, 4, 5, 6], "y": [10, 20, 30, 40, 50, 60]}, + index=pd.MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["let", "num"]), + ) df - df.rename_axis(index={'let': 'abc'}) + df.rename_axis(index={"let": "abc"}) df.rename_axis(index=str.upper) .. _basics.iteration: @@ -1491,8 +1519,9 @@ Thus, for example, iterating over a DataFrame gives you the column names: .. 
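ipython:: python

    # A related sketch (hypothetical frame): ``items()`` yields
    # ``(label, Series)`` pairs rather than bare column names.
    dit = pd.DataFrame({"x": [1, 2], "y": [3, 4]})
    for label, column in dit.items():
        print(label, column.tolist())

.. 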
ipython:: python - df = pd.DataFrame({'col1': np.random.randn(3), - 'col2': np.random.randn(3)}, index=['a', 'b', 'c']) + df = pd.DataFrame( + {"col1": np.random.randn(3), "col2": np.random.randn(3)}, index=["a", "b", "c"] + ) for col in df: print(col) @@ -1540,10 +1569,10 @@ To iterate over the rows of a DataFrame, you can use the following methods: .. ipython:: python - df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']}) + df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) for index, row in df.iterrows(): - row['a'] = 10 + row["a"] = 10 df @@ -1576,7 +1605,7 @@ index value along with a Series containing the data in each row: .. ipython:: python for row_index, row in df.iterrows(): - print(row_index, row, sep='\n') + print(row_index, row, sep="\n") .. note:: @@ -1586,7 +1615,7 @@ index value along with a Series containing the data in each row: .. ipython:: python - df_orig = pd.DataFrame([[1, 1.5]], columns=['int', 'float']) + df_orig = pd.DataFrame([[1, 1.5]], columns=["int", "float"]) df_orig.dtypes row = next(df_orig.iterrows())[1] row @@ -1596,8 +1625,8 @@ index value along with a Series containing the data in each row: .. ipython:: python - row['int'].dtype - df_orig['int'].dtype + row["int"].dtype + df_orig["int"].dtype To preserve dtypes while iterating over the rows, it is better to use :meth:`~DataFrame.itertuples` which returns namedtuples of the values @@ -1607,7 +1636,7 @@ For instance, a contrived way to transpose the DataFrame would be: .. ipython:: python - df2 = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + df2 = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) print(df2) print(df2.T) @@ -1652,7 +1681,7 @@ This will return a Series, indexed like the existing Series. .. ipython:: python # datetime - s = pd.Series(pd.date_range('20130101 09:10:12', periods=4)) + s = pd.Series(pd.date_range("20130101 09:10:12", periods=4)) s s.dt.hour s.dt.second @@ -1668,7 +1697,7 @@ You can easily produces tz aware transformations: .. ipython:: python - stz = s.dt.tz_localize('US/Eastern') + stz = s.dt.tz_localize("US/Eastern") stz stz.dt.tz @@ -1676,7 +1705,7 @@ You can also chain these types of operations: .. ipython:: python - s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern') + s.dt.tz_localize("UTC").dt.tz_convert("US/Eastern") You can also format datetime values as strings with :meth:`Series.dt.strftime` which supports the same format as the standard :meth:`~datetime.datetime.strftime`. @@ -1684,23 +1713,23 @@ supports the same format as the standard :meth:`~datetime.datetime.strftime`. .. ipython:: python # DatetimeIndex - s = pd.Series(pd.date_range('20130101', periods=4)) + s = pd.Series(pd.date_range("20130101", periods=4)) s - s.dt.strftime('%Y/%m/%d') + s.dt.strftime("%Y/%m/%d") .. ipython:: python # PeriodIndex - s = pd.Series(pd.period_range('20130101', periods=4)) + s = pd.Series(pd.period_range("20130101", periods=4)) s - s.dt.strftime('%Y/%m/%d') + s.dt.strftime("%Y/%m/%d") The ``.dt`` accessor works for period and timedelta dtypes. .. ipython:: python # period - s = pd.Series(pd.period_range('20130101', periods=4, freq='D')) + s = pd.Series(pd.period_range("20130101", periods=4, freq="D")) s s.dt.year s.dt.day @@ -1708,7 +1737,7 @@ The ``.dt`` accessor works for period and timedelta dtypes. .. ipython:: python # timedelta - s = pd.Series(pd.timedelta_range('1 day 00:00:05', periods=4, freq='s')) + s = pd.Series(pd.timedelta_range("1 day 00:00:05", periods=4, freq="s")) s s.dt.days s.dt.seconds @@ -1729,8 +1758,9 @@ built-in string methods. For example: .. 
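ipython:: python

    # One more string-accessor sketch (hypothetical values): operations are
    # element-wise and missing values propagate through them.
    st = pd.Series(["Apple", None, "banana"], dtype="string")
    st.str.contains("an")

.. 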
ipython:: python - s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], - dtype="string") + s = pd.Series( + ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"], dtype="string" + ) s.str.lower() Powerful pattern-matching methods are provided as well, but note that @@ -1765,13 +1795,15 @@ used to sort a pandas object by its index levels. .. ipython:: python - df = pd.DataFrame({ - 'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']), - 'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']), - 'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])}) + df = pd.DataFrame( + { + "one": pd.Series(np.random.randn(3), index=["a", "b", "c"]), + "two": pd.Series(np.random.randn(4), index=["a", "b", "c", "d"]), + "three": pd.Series(np.random.randn(3), index=["b", "c", "d"]), + } + ) - unsorted_df = df.reindex(index=['a', 'd', 'c', 'b'], - columns=['three', 'two', 'one']) + unsorted_df = df.reindex(index=["a", "d", "c", "b"], columns=["three", "two", "one"]) unsorted_df # DataFrame @@ -1780,7 +1812,7 @@ used to sort a pandas object by its index levels. unsorted_df.sort_index(axis=1) # Series - unsorted_df['three'].sort_index() + unsorted_df["three"].sort_index() .. _basics.sort_index_key: @@ -1792,11 +1824,9 @@ the key is applied per-level to the levels specified by ``level``. .. ipython:: python - s1 = pd.DataFrame({ - "a": ['B', 'a', 'C'], - "b": [1, 2, 3], - "c": [2, 3, 4] - }).set_index(list("ab")) + s1 = pd.DataFrame({"a": ["B", "a", "C"], "b": [1, 2, 3], "c": [2, 3, 4]}).set_index( + list("ab") + ) s1 .. ipython:: python @@ -1819,16 +1849,14 @@ to use to determine the sorted order. .. ipython:: python - df1 = pd.DataFrame({'one': [2, 1, 1, 1], - 'two': [1, 3, 2, 4], - 'three': [5, 4, 3, 2]}) - df1.sort_values(by='two') + df1 = pd.DataFrame({"one": [2, 1, 1, 1], "two": [1, 3, 2, 4], "three": [5, 4, 3, 2]}) + df1.sort_values(by="two") The ``by`` parameter can take a list of column names, e.g.: .. ipython:: python - df1[['one', 'two', 'three']].sort_values(by=['one', 'two']) + df1[["one", "two", "three"]].sort_values(by=["one", "two"]) These methods have special treatment of NA values via the ``na_position`` argument: @@ -1837,7 +1865,7 @@ argument: s[2] = np.nan s.sort_values() - s.sort_values(na_position='first') + s.sort_values(na_position="first") .. _basics.sort_value_key: @@ -1848,7 +1876,7 @@ to apply to the values being sorted. .. ipython:: python - s1 = pd.Series(['B', 'a', 'C']) + s1 = pd.Series(["B", "a", "C"]) .. ipython:: python @@ -1862,12 +1890,12 @@ a Series, e.g. .. ipython:: python - df = pd.DataFrame({"a": ['B', 'a', 'C'], "b": [1, 2, 3]}) + df = pd.DataFrame({"a": ["B", "a", "C"], "b": [1, 2, 3]}) .. ipython:: python - df.sort_values(by='a') - df.sort_values(by='a', key=lambda col: col.str.lower()) + df.sort_values(by="a") + df.sort_values(by="a", key=lambda col: col.str.lower()) The name or type of each column can be used to apply different functions to different columns. @@ -1883,20 +1911,20 @@ refer to either columns or index level names. .. 
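ipython:: python

    # A minimal single-level sketch (hypothetical frame): a named index
    # level can be mixed with a column name in ``by``.
    dfk = pd.DataFrame({"A": [2, 1, 3]}, index=pd.Index([1, 2, 1], name="k"))
    dfk.sort_values(by=["k", "A"])

.. 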
ipython:: python # Build MultiIndex - idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 2), - ('b', 2), ('b', 1), ('b', 1)]) - idx.names = ['first', 'second'] + idx = pd.MultiIndex.from_tuples( + [("a", 1), ("a", 2), ("a", 2), ("b", 2), ("b", 1), ("b", 1)] + ) + idx.names = ["first", "second"] # Build DataFrame - df_multi = pd.DataFrame({'A': np.arange(6, 0, -1)}, - index=idx) + df_multi = pd.DataFrame({"A": np.arange(6, 0, -1)}, index=idx) df_multi Sort by 'second' (index) and 'A' (column) .. ipython:: python - df_multi.sort_values(by=['second', 'A']) + df_multi.sort_values(by=["second", "A"]) .. note:: @@ -1917,8 +1945,8 @@ Series has the :meth:`~Series.searchsorted` method, which works similarly to ser = pd.Series([1, 2, 3]) ser.searchsorted([0, 3]) ser.searchsorted([0, 4]) - ser.searchsorted([1, 3], side='right') - ser.searchsorted([1, 3], side='left') + ser.searchsorted([1, 3], side="right") + ser.searchsorted([1, 3], side="left") ser = pd.Series([3, 1, 2]) ser.searchsorted([0, 3], sorter=np.argsort(ser)) @@ -1943,13 +1971,17 @@ faster than sorting the entire Series and calling ``head(n)`` on the result. .. ipython:: python - df = pd.DataFrame({'a': [-2, -1, 1, 10, 8, 11, -1], - 'b': list('abdceff'), - 'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]}) - df.nlargest(3, 'a') - df.nlargest(5, ['a', 'c']) - df.nsmallest(3, 'a') - df.nsmallest(5, ['a', 'c']) + df = pd.DataFrame( + { + "a": [-2, -1, 1, 10, 8, 11, -1], + "b": list("abdceff"), + "c": [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0], + } + ) + df.nlargest(3, "a") + df.nlargest(5, ["a", "c"]) + df.nsmallest(3, "a") + df.nsmallest(5, ["a", "c"]) .. _basics.multiindex_sorting: @@ -1962,10 +1994,8 @@ all levels to ``by``. .. ipython:: python - df1.columns = pd.MultiIndex.from_tuples([('a', 'one'), - ('a', 'two'), - ('b', 'three')]) - df1.sort_values(by=('a', 'two')) + df1.columns = pd.MultiIndex.from_tuples([("a", "one"), ("a", "two"), ("b", "three")]) + df1.sort_values(by=("a", "two")) Copying @@ -2048,13 +2078,17 @@ with the data type of each column. .. ipython:: python - dft = pd.DataFrame({'A': np.random.rand(3), - 'B': 1, - 'C': 'foo', - 'D': pd.Timestamp('20010102'), - 'E': pd.Series([1.0] * 3).astype('float32'), - 'F': False, - 'G': pd.Series([1] * 3, dtype='int8')}) + dft = pd.DataFrame( + { + "A": np.random.rand(3), + "B": 1, + "C": "foo", + "D": pd.Timestamp("20010102"), + "E": pd.Series([1.0] * 3).astype("float32"), + "F": False, + "G": pd.Series([1] * 3, dtype="int8"), + } + ) dft dft.dtypes @@ -2062,7 +2096,7 @@ On a ``Series`` object, use the :attr:`~Series.dtype` attribute. .. ipython:: python - dft['A'].dtype + dft["A"].dtype If a pandas object contains data with multiple dtypes *in a single column*, the dtype of the column will be chosen to accommodate all of the data types @@ -2071,10 +2105,10 @@ dtype of the column will be chosen to accommodate all of the data types .. ipython:: python # these ints are coerced to floats - pd.Series([1, 2, 3, 4, 5, 6.]) + pd.Series([1, 2, 3, 4, 5, 6.0]) # string data forces an ``object`` dtype - pd.Series([1, 2, 3, 6., 'foo']) + pd.Series([1, 2, 3, 6.0, "foo"]) The number of columns of each type in a ``DataFrame`` can be found by calling ``DataFrame.dtypes.value_counts()``. @@ -2090,13 +2124,16 @@ different numeric dtypes will **NOT** be combined. The following example will gi .. 
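ipython:: python

    # A one-line taste (hypothetical Series) of the promotion rules: mixing
    # ``int8`` and ``float32`` operands upcasts the result to a float dtype.
    (pd.Series([1, 2], dtype="int8") + pd.Series([0.5, 1.5], dtype="float32")).dtype

.. 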
ipython:: python - df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32') + df1 = pd.DataFrame(np.random.randn(8, 1), columns=["A"], dtype="float32") df1 df1.dtypes - df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float16'), - 'B': pd.Series(np.random.randn(8)), - 'C': pd.Series(np.array(np.random.randn(8), - dtype='uint8'))}) + df2 = pd.DataFrame( + { + "A": pd.Series(np.random.randn(8), dtype="float16"), + "B": pd.Series(np.random.randn(8)), + "C": pd.Series(np.array(np.random.randn(8), dtype="uint8")), + } + ) df2 df2.dtypes @@ -2109,9 +2146,9 @@ The following will all result in ``int64`` dtypes. .. ipython:: python - pd.DataFrame([1, 2], columns=['a']).dtypes - pd.DataFrame({'a': [1, 2]}).dtypes - pd.DataFrame({'a': 1}, index=list(range(2))).dtypes + pd.DataFrame([1, 2], columns=["a"]).dtypes + pd.DataFrame({"a": [1, 2]}).dtypes + pd.DataFrame({"a": 1}, index=list(range(2))).dtypes Note that Numpy will choose *platform-dependent* types when creating arrays. The following **WILL** result in ``int32`` on 32-bit platform. @@ -2159,15 +2196,15 @@ then the more *general* one will be used as the result of the operation. df3.dtypes # conversion of dtypes - df3.astype('float32').dtypes + df3.astype("float32").dtypes Convert a subset of columns to a specified type using :meth:`~DataFrame.astype`. .. ipython:: python - dft = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) - dft[['a', 'b']] = dft[['a', 'b']].astype(np.uint8) + dft = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + dft[["a", "b"]] = dft[["a", "b"]].astype(np.uint8) dft dft.dtypes @@ -2175,8 +2212,8 @@ Convert certain columns to a specific dtype by passing a dict to :meth:`~DataFra .. ipython:: python - dft1 = pd.DataFrame({'a': [1, 0, 1], 'b': [4, 5, 6], 'c': [7, 8, 9]}) - dft1 = dft1.astype({'a': np.bool, 'c': np.float64}) + dft1 = pd.DataFrame({"a": [1, 0, 1], "b": [4, 5, 6], "c": [7, 8, 9]}) + dft1 = dft1.astype({"a": np.bool, "c": np.float64}) dft1 dft1.dtypes @@ -2188,9 +2225,9 @@ Convert certain columns to a specific dtype by passing a dict to :meth:`~DataFra .. ipython:: python - dft = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) - dft.loc[:, ['a', 'b']].astype(np.uint8).dtypes - dft.loc[:, ['a', 'b']] = dft.loc[:, ['a', 'b']].astype(np.uint8) + dft = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + dft.loc[:, ["a", "b"]].astype(np.uint8).dtypes + dft.loc[:, ["a", "b"]] = dft.loc[:, ["a", "b"]].astype(np.uint8) dft.dtypes .. _basics.object_conversion: @@ -2206,10 +2243,10 @@ to the correct type. .. ipython:: python import datetime - df = pd.DataFrame([[1, 2], - ['a', 'b'], - [datetime.datetime(2016, 3, 2), - datetime.datetime(2016, 3, 2)]]) + + df = pd.DataFrame( + [[1, 2], ["a", "b"], [datetime.datetime(2016, 3, 2), datetime.datetime(2016, 3, 2)]] + ) df = df.T df df.dtypes @@ -2228,7 +2265,7 @@ hard conversion of objects to a specified type: .. ipython:: python - m = ['1.1', 2, 3] + m = ["1.1", 2, 3] pd.to_numeric(m) * :meth:`~pandas.to_datetime` (conversion to datetime objects) @@ -2236,14 +2273,15 @@ hard conversion of objects to a specified type: .. ipython:: python import datetime - m = ['2016-07-09', datetime.datetime(2016, 3, 2)] + + m = ["2016-07-09", datetime.datetime(2016, 3, 2)] pd.to_datetime(m) * :meth:`~pandas.to_timedelta` (conversion to timedelta objects) .. 
ipython:: python - m = ['5us', pd.Timedelta('1day')] + m = ["5us", pd.Timedelta("1day")] pd.to_timedelta(m) To force a conversion, we can pass in an ``errors`` argument, which specifies how pandas should deal with elements @@ -2256,14 +2294,15 @@ non-conforming elements intermixed that you want to represent as missing: .. ipython:: python import datetime - m = ['apple', datetime.datetime(2016, 3, 2)] - pd.to_datetime(m, errors='coerce') - m = ['apple', 2, 3] - pd.to_numeric(m, errors='coerce') + m = ["apple", datetime.datetime(2016, 3, 2)] + pd.to_datetime(m, errors="coerce") - m = ['apple', pd.Timedelta('1day')] - pd.to_timedelta(m, errors='coerce') + m = ["apple", 2, 3] + pd.to_numeric(m, errors="coerce") + + m = ["apple", pd.Timedelta("1day")] + pd.to_timedelta(m, errors="coerce") The ``errors`` parameter has a third option of ``errors='ignore'``, which will simply return the passed in data if it encounters any errors with the conversion to a desired data type: @@ -2271,25 +2310,26 @@ encounters any errors with the conversion to a desired data type: .. ipython:: python import datetime - m = ['apple', datetime.datetime(2016, 3, 2)] - pd.to_datetime(m, errors='ignore') - m = ['apple', 2, 3] - pd.to_numeric(m, errors='ignore') + m = ["apple", datetime.datetime(2016, 3, 2)] + pd.to_datetime(m, errors="ignore") + + m = ["apple", 2, 3] + pd.to_numeric(m, errors="ignore") - m = ['apple', pd.Timedelta('1day')] - pd.to_timedelta(m, errors='ignore') + m = ["apple", pd.Timedelta("1day")] + pd.to_timedelta(m, errors="ignore") In addition to object conversion, :meth:`~pandas.to_numeric` provides another argument ``downcast``, which gives the option of downcasting the newly (or already) numeric data to a smaller dtype, which can conserve memory: .. ipython:: python - m = ['1', 2, 3] - pd.to_numeric(m, downcast='integer') # smallest signed int dtype - pd.to_numeric(m, downcast='signed') # same as 'integer' - pd.to_numeric(m, downcast='unsigned') # smallest unsigned int dtype - pd.to_numeric(m, downcast='float') # smallest float dtype + m = ["1", 2, 3] + pd.to_numeric(m, downcast="integer") # smallest signed int dtype + pd.to_numeric(m, downcast="signed") # same as 'integer' + pd.to_numeric(m, downcast="unsigned") # smallest unsigned int dtype + pd.to_numeric(m, downcast="float") # smallest float dtype As these methods apply only to one-dimensional arrays, lists or scalars; they cannot be used directly on multi-dimensional objects such as DataFrames. However, with :meth:`~pandas.DataFrame.apply`, we can "apply" the function over each column efficiently: @@ -2297,16 +2337,16 @@ as DataFrames. However, with :meth:`~pandas.DataFrame.apply`, we can "apply" the .. ipython:: python import datetime - df = pd.DataFrame([ - ['2016-07-09', datetime.datetime(2016, 3, 2)]] * 2, dtype='O') + + df = pd.DataFrame([["2016-07-09", datetime.datetime(2016, 3, 2)]] * 2, dtype="O") df df.apply(pd.to_datetime) - df = pd.DataFrame([['1.1', 2, 3]] * 2, dtype='O') + df = pd.DataFrame([["1.1", 2, 3]] * 2, dtype="O") df df.apply(pd.to_numeric) - df = pd.DataFrame([['5us', pd.Timedelta('1day')]] * 2, dtype='O') + df = pd.DataFrame([["5us", pd.Timedelta("1day")]] * 2, dtype="O") df df.apply(pd.to_timedelta) @@ -2319,8 +2359,8 @@ See also :ref:`Support for integer NA `. .. ipython:: python - dfi = df3.astype('int32') - dfi['E'] = 1 + dfi = df3.astype("int32") + dfi["E"] = 1 dfi dfi.dtypes @@ -2333,7 +2373,7 @@ While float dtypes are unchanged. .. 
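ipython:: python

    # A brief sketch (hypothetical Series): once missing values appear, an
    # integer dtype cannot be kept, so the result is upcast to float.
    s32 = pd.Series([1, 2, 3], dtype="int32")
    s32.where(s32 > 1).dtype

.. 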
ipython:: python dfa = df3.copy() - dfa['A'] = dfa['A'].astype('float32') + dfa["A"] = dfa["A"].astype("float32") dfa.dtypes casted = dfa[df2 > 0] @@ -2353,18 +2393,22 @@ dtypes: .. ipython:: python - df = pd.DataFrame({'string': list('abc'), - 'int64': list(range(1, 4)), - 'uint8': np.arange(3, 6).astype('u1'), - 'float64': np.arange(4.0, 7.0), - 'bool1': [True, False, True], - 'bool2': [False, True, False], - 'dates': pd.date_range('now', periods=3), - 'category': pd.Series(list("ABC")).astype('category')}) - df['tdeltas'] = df.dates.diff() - df['uint64'] = np.arange(3, 6).astype('u8') - df['other_dates'] = pd.date_range('20130101', periods=3) - df['tz_aware_dates'] = pd.date_range('20130101', periods=3, tz='US/Eastern') + df = pd.DataFrame( + { + "string": list("abc"), + "int64": list(range(1, 4)), + "uint8": np.arange(3, 6).astype("u1"), + "float64": np.arange(4.0, 7.0), + "bool1": [True, False, True], + "bool2": [False, True, False], + "dates": pd.date_range("now", periods=3), + "category": pd.Series(list("ABC")).astype("category"), + } + ) + df["tdeltas"] = df.dates.diff() + df["uint64"] = np.arange(3, 6).astype("u8") + df["other_dates"] = pd.date_range("20130101", periods=3) + df["tz_aware_dates"] = pd.date_range("20130101", periods=3, tz="US/Eastern") df And the dtypes: @@ -2388,7 +2432,7 @@ You can also pass the name of a dtype in the `NumPy dtype hierarchy .. ipython:: python - df.select_dtypes(include=['bool']) + df.select_dtypes(include=["bool"]) :meth:`~pandas.DataFrame.select_dtypes` also works with generic dtypes as well. @@ -2397,13 +2441,13 @@ integers: .. ipython:: python - df.select_dtypes(include=['number', 'bool'], exclude=['unsignedinteger']) + df.select_dtypes(include=["number", "bool"], exclude=["unsignedinteger"]) To select string columns you must use the ``object`` dtype: .. ipython:: python - df.select_dtypes(include=['object']) + df.select_dtypes(include=["object"]) To see all the child dtypes of a generic ``dtype`` like ``numpy.number`` you can define a function that returns a tree of child dtypes: diff --git a/setup.cfg b/setup.cfg index 3279a485c9bf3..a7c0f3484517f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,6 +42,7 @@ bootstrap = ignore = E203, # space before : (needed for how black formats slicing) E402, # module level import not at top of file W503, # line break before binary operator + E203, # space before : (needed for how black formats slicing) # Classes/functions in different blocks can generate those errors E302, # expected 2 blank lines, found 0 E305, # expected 2 blank lines after class or function definition, found 0 From cd8208640e503848a7f3276fc62c36cc95206700 Mon Sep 17 00:00:00 2001 From: beanan Date: Mon, 5 Oct 2020 16:05:51 +0800 Subject: [PATCH 19/38] CLN: Remove the duplicate configuration of flake8-rst in setup.cfg (#36877) --- setup.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index a7c0f3484517f..3279a485c9bf3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,7 +42,6 @@ bootstrap = ignore = E203, # space before : (needed for how black formats slicing) E402, # module level import not at top of file W503, # line break before binary operator - E203, # space before : (needed for how black formats slicing) # Classes/functions in different blocks can generate those errors E302, # expected 2 blank lines, found 0 E305, # expected 2 blank lines after class or function definition, found 0 From 0b5f2dd479ec7bea0c687a4220996c3ff5053af3 Mon Sep 17 00:00:00 2001 From: "T. 
JEGHAM" <41241424+Tazminia@users.noreply.github.com> Date: Mon, 5 Oct 2020 12:15:10 +0200 Subject: [PATCH 20/38] upgrade flake8 to 3.8.4 (#36882) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d0c9f12614d0d..6a311c6f702e8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: hooks: - id: black - repo: https://gitlab.com/pycqa/flake8 - rev: 3.8.3 + rev: 3.8.4 hooks: - id: flake8 additional_dependencies: [flake8-comprehensions>=3.1.0] From 185d2eaf08c5e217bd09357b31f91b588e21a8e8 Mon Sep 17 00:00:00 2001 From: Meghana Varanasi Date: Mon, 5 Oct 2020 18:28:09 +0530 Subject: [PATCH 21/38] DOC: doc/source/whatsnew (#36857) --- doc/source/whatsnew/v0.10.0.rst | 29 ++-- doc/source/whatsnew/v0.10.1.rst | 64 ++++---- doc/source/whatsnew/v0.12.0.rst | 50 +++--- doc/source/whatsnew/v0.13.1.rst | 53 +++---- doc/source/whatsnew/v0.14.1.rst | 10 +- doc/source/whatsnew/v0.15.1.rst | 17 ++- doc/source/whatsnew/v0.16.1.rst | 58 ++++--- doc/source/whatsnew/v0.16.2.rst | 22 +-- doc/source/whatsnew/v0.17.0.rst | 156 +++++++++---------- doc/source/whatsnew/v0.17.1.rst | 14 +- doc/source/whatsnew/v0.18.1.rst | 88 ++++++----- doc/source/whatsnew/v0.19.0.rst | 261 +++++++++++++++++--------------- doc/source/whatsnew/v0.19.1.rst | 2 +- doc/source/whatsnew/v0.19.2.rst | 2 +- doc/source/whatsnew/v0.20.2.rst | 2 +- doc/source/whatsnew/v0.20.3.rst | 2 +- doc/source/whatsnew/v0.21.1.rst | 2 +- doc/source/whatsnew/v0.22.0.rst | 13 +- doc/source/whatsnew/v0.5.0.rst | 2 +- doc/source/whatsnew/v0.6.0.rst | 2 +- doc/source/whatsnew/v0.7.3.rst | 32 ++-- doc/source/whatsnew/v0.8.0.rst | 17 ++- doc/source/whatsnew/v0.9.0.rst | 10 +- 23 files changed, 472 insertions(+), 436 deletions(-) diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index 443250592a4a7..aa2749c85a232 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ b/doc/source/whatsnew/v0.10.0.rst @@ -49,8 +49,8 @@ talking about: :okwarning: import pandas as pd - df = pd.DataFrame(np.random.randn(6, 4), - index=pd.date_range('1/1/2000', periods=6)) + + df = pd.DataFrame(np.random.randn(6, 4), index=pd.date_range("1/1/2000", periods=6)) df # deprecated now df - df[0] @@ -184,12 +184,14 @@ labeled the aggregated group with the end of the interval: the next day). import io - data = ('a,b,c\n' - '1,Yes,2\n' - '3,No,4') + data = """ + a,b,c + 1,Yes,2 + 3,No,4 + """ print(data) pd.read_csv(io.StringIO(data), header=None) - pd.read_csv(io.StringIO(data), header=None, prefix='X') + pd.read_csv(io.StringIO(data), header=None, prefix="X") - Values like ``'Yes'`` and ``'No'`` are not interpreted as boolean by default, though this can be controlled by new ``true_values`` and ``false_values`` @@ -199,7 +201,7 @@ labeled the aggregated group with the end of the interval: the next day). print(data) pd.read_csv(io.StringIO(data)) - pd.read_csv(io.StringIO(data), true_values=['Yes'], false_values=['No']) + pd.read_csv(io.StringIO(data), true_values=["Yes"], false_values=["No"]) - The file parsers will not recognize non-string values arising from a converter function as NA if passed in the ``na_values`` argument. It's better @@ -210,10 +212,10 @@ labeled the aggregated group with the end of the interval: the next day). .. 
ipython:: python - s = pd.Series([np.nan, 1., 2., np.nan, 4]) + s = pd.Series([np.nan, 1.0, 2.0, np.nan, 4]) s s.fillna(0) - s.fillna(method='pad') + s.fillna(method="pad") Convenience methods ``ffill`` and ``bfill`` have been added: @@ -229,7 +231,8 @@ Convenience methods ``ffill`` and ``bfill`` have been added: .. ipython:: python def f(x): - return pd.Series([x, x**2], index=['x', 'x^2']) + return pd.Series([x, x ** 2], index=["x", "x^2"]) + s = pd.Series(np.random.rand(5)) s @@ -272,20 +275,20 @@ The old behavior of printing out summary information can be achieved via the .. ipython:: python - pd.set_option('expand_frame_repr', False) + pd.set_option("expand_frame_repr", False) wide_frame .. ipython:: python :suppress: - pd.reset_option('expand_frame_repr') + pd.reset_option("expand_frame_repr") The width of each line can be changed via 'line_width' (80 by default): .. code-block:: python - pd.set_option('line_width', 40) + pd.set_option("line_width", 40) wide_frame diff --git a/doc/source/whatsnew/v0.10.1.rst b/doc/source/whatsnew/v0.10.1.rst index 3dc680c46a4d9..d71a0d5ca68cd 100644 --- a/doc/source/whatsnew/v0.10.1.rst +++ b/doc/source/whatsnew/v0.10.1.rst @@ -45,29 +45,31 @@ You may need to upgrade your existing data files. Please visit the import os - os.remove('store.h5') + os.remove("store.h5") You can designate (and index) certain columns that you want to be able to perform queries on a table, by passing a list to ``data_columns`` .. ipython:: python - store = pd.HDFStore('store.h5') - df = pd.DataFrame(np.random.randn(8, 3), - index=pd.date_range('1/1/2000', periods=8), - columns=['A', 'B', 'C']) - df['string'] = 'foo' - df.loc[df.index[4:6], 'string'] = np.nan - df.loc[df.index[7:9], 'string'] = 'bar' - df['string2'] = 'cool' + store = pd.HDFStore("store.h5") + df = pd.DataFrame( + np.random.randn(8, 3), + index=pd.date_range("1/1/2000", periods=8), + columns=["A", "B", "C"], + ) + df["string"] = "foo" + df.loc[df.index[4:6], "string"] = np.nan + df.loc[df.index[7:9], "string"] = "bar" + df["string2"] = "cool" df # on-disk operations - store.append('df', df, data_columns=['B', 'C', 'string', 'string2']) - store.select('df', "B>0 and string=='foo'") + store.append("df", df, data_columns=["B", "C", "string", "string2"]) + store.select("df", "B>0 and string=='foo'") # this is in-memory version of this type of selection - df[(df.B > 0) & (df.string == 'foo')] + df[(df.B > 0) & (df.string == "foo")] Retrieving unique values in an indexable or data column. @@ -75,19 +77,19 @@ Retrieving unique values in an indexable or data column. # note that this is deprecated as of 0.14.0 # can be replicated by: store.select_column('df','index').unique() - store.unique('df', 'index') - store.unique('df', 'string') + store.unique("df", "index") + store.unique("df", "string") You can now store ``datetime64`` in data columns .. ipython:: python df_mixed = df.copy() - df_mixed['datetime64'] = pd.Timestamp('20010102') - df_mixed.loc[df_mixed.index[3:4], ['A', 'B']] = np.nan + df_mixed["datetime64"] = pd.Timestamp("20010102") + df_mixed.loc[df_mixed.index[3:4], ["A", "B"]] = np.nan - store.append('df_mixed', df_mixed) - df_mixed1 = store.select('df_mixed') + store.append("df_mixed", df_mixed) + df_mixed1 = store.select("df_mixed") df_mixed1 df_mixed1.dtypes.value_counts() @@ -97,7 +99,7 @@ columns, this is equivalent to passing a .. ipython:: python - store.select('df', columns=['A', 'B']) + store.select("df", columns=["A", "B"]) ``HDFStore`` now serializes MultiIndex dataframes when appending tables. 
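A rough sketch of the behaviour described above, using a hypothetical file
name (this assumes the optional PyTables dependency is installed):

.. ipython:: python

    tmp = pd.DataFrame({"A": range(3), "B": list("xyz")})
    tmp.to_hdf("tmp_store.h5", key="tmp", format="table", data_columns=["B"])
    pd.read_hdf("tmp_store.h5", "tmp", where="B == 'y'")

.. ipython:: python
    :suppress:

    import os

    os.remove("tmp_store.h5")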
@@ -160,29 +162,31 @@ combined result, by using ``where`` on a selector table. .. ipython:: python - df_mt = pd.DataFrame(np.random.randn(8, 6), - index=pd.date_range('1/1/2000', periods=8), - columns=['A', 'B', 'C', 'D', 'E', 'F']) - df_mt['foo'] = 'bar' + df_mt = pd.DataFrame( + np.random.randn(8, 6), + index=pd.date_range("1/1/2000", periods=8), + columns=["A", "B", "C", "D", "E", "F"], + ) + df_mt["foo"] = "bar" # you can also create the tables individually - store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None}, - df_mt, selector='df1_mt') + store.append_to_multiple( + {"df1_mt": ["A", "B"], "df2_mt": None}, df_mt, selector="df1_mt" + ) store # individual tables were created - store.select('df1_mt') - store.select('df2_mt') + store.select("df1_mt") + store.select("df2_mt") # as a multiple - store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'], - selector='df1_mt') + store.select_as_multiple(["df1_mt", "df2_mt"], where=["A>0", "B>0"], selector="df1_mt") .. ipython:: python :suppress: store.close() - os.remove('store.h5') + os.remove("store.h5") **Enhancements** diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst index 9971ae22822f6..4de76510c6bc1 100644 --- a/doc/source/whatsnew/v0.12.0.rst +++ b/doc/source/whatsnew/v0.12.0.rst @@ -47,7 +47,7 @@ API changes .. ipython:: python - p = pd.DataFrame({'first': [4, 5, 8], 'second': [0, 0, 3]}) + p = pd.DataFrame({"first": [4, 5, 8], "second": [0, 0, 3]}) p % 0 p % p p / p @@ -95,8 +95,8 @@ API changes .. ipython:: python - df = pd.DataFrame(range(5), index=list('ABCDE'), columns=['a']) - mask = (df.a % 2 == 0) + df = pd.DataFrame(range(5), index=list("ABCDE"), columns=["a"]) + mask = df.a % 2 == 0 mask # this is what you should use @@ -141,21 +141,24 @@ API changes .. code-block:: python from pandas.io.parsers import ExcelFile - xls = ExcelFile('path_to_file.xls') - xls.parse('Sheet1', index_col=None, na_values=['NA']) + + xls = ExcelFile("path_to_file.xls") + xls.parse("Sheet1", index_col=None, na_values=["NA"]) With .. code-block:: python import pandas as pd - pd.read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) + + pd.read_excel("path_to_file.xls", "Sheet1", index_col=None, na_values=["NA"]) - added top-level function ``read_sql`` that is equivalent to the following .. code-block:: python from pandas.io.sql import read_frame + read_frame(...) - ``DataFrame.to_html`` and ``DataFrame.to_latex`` now accept a path for @@ -200,7 +203,7 @@ IO enhancements .. ipython:: python :okwarning: - df = pd.DataFrame({'a': range(3), 'b': list('abc')}) + df = pd.DataFrame({"a": range(3), "b": list("abc")}) print(df) html = df.to_html() alist = pd.read_html(html, index_col=0) @@ -248,16 +251,18 @@ IO enhancements .. ipython:: python from pandas._testing import makeCustomDataframe as mkdf + df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) - df.to_csv('mi.csv') - print(open('mi.csv').read()) - pd.read_csv('mi.csv', header=[0, 1, 2, 3], index_col=[0, 1]) + df.to_csv("mi.csv") + print(open("mi.csv").read()) + pd.read_csv("mi.csv", header=[0, 1, 2, 3], index_col=[0, 1]) .. ipython:: python :suppress: import os - os.remove('mi.csv') + + os.remove("mi.csv") - Support for ``HDFStore`` (via ``PyTables 3.0.0``) on Python3 @@ -304,8 +309,8 @@ Other enhancements .. 
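ipython:: python

    # A tiny sketch (hypothetical frame): plain dict-based ``replace`` is
    # the simpler cousin of the regex form shown next.
    dfr = pd.DataFrame({"a": ["1", "n/a", "3"]})
    dfr.replace({"n/a": np.nan})

.. 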
ipython:: python - df = pd.DataFrame({'a': list('ab..'), 'b': [1, 2, 3, 4]}) - df.replace(regex=r'\s*\.\s*', value=np.nan) + df = pd.DataFrame({"a": list("ab.."), "b": [1, 2, 3, 4]}) + df.replace(regex=r"\s*\.\s*", value=np.nan) to replace all occurrences of the string ``'.'`` with zero or more instances of surrounding white space with ``NaN``. @@ -314,7 +319,7 @@ Other enhancements .. ipython:: python - df.replace('.', np.nan) + df.replace(".", np.nan) to replace all occurrences of the string ``'.'`` with ``NaN``. @@ -359,8 +364,8 @@ Other enhancements .. ipython:: python - dff = pd.DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')}) - dff.groupby('B').filter(lambda x: len(x) > 2) + dff = pd.DataFrame({"A": np.arange(8), "B": list("aabbbbcc")}) + dff.groupby("B").filter(lambda x: len(x) > 2) Alternatively, instead of dropping the offending groups, we can return a like-indexed objects where the groups that do not pass the filter are @@ -368,7 +373,7 @@ Other enhancements .. ipython:: python - dff.groupby('B').filter(lambda x: len(x) > 2, dropna=False) + dff.groupby("B").filter(lambda x: len(x) > 2, dropna=False) - Series and DataFrame hist methods now take a ``figsize`` argument (:issue:`3834`) @@ -397,17 +402,18 @@ Experimental features from pandas.tseries.offsets import CustomBusinessDay from datetime import datetime + # As an interesting example, let's look at Egypt where # a Friday-Saturday weekend is observed. - weekmask_egypt = 'Sun Mon Tue Wed Thu' + weekmask_egypt = "Sun Mon Tue Wed Thu" # They also observe International Workers' Day so let's # add that for a couple of years - holidays = ['2012-05-01', datetime(2013, 5, 1), np.datetime64('2014-05-01')] + holidays = ["2012-05-01", datetime(2013, 5, 1), np.datetime64("2014-05-01")] bday_egypt = CustomBusinessDay(holidays=holidays, weekmask=weekmask_egypt) dt = datetime(2013, 4, 30) print(dt + 2 * bday_egypt) dts = pd.date_range(dt, periods=5, freq=bday_egypt) - print(pd.Series(dts.weekday, dts).map(pd.Series('Mon Tue Wed Thu Fri Sat Sun'.split()))) + print(pd.Series(dts.weekday, dts).map(pd.Series("Mon Tue Wed Thu Fri Sat Sun".split()))) Bug fixes ~~~~~~~~~ @@ -430,14 +436,14 @@ Bug fixes .. ipython:: python :okwarning: - strs = 'go', 'bow', 'joe', 'slow' + strs = "go", "bow", "joe", "slow" ds = pd.Series(strs) for s in ds.str: print(s) s - s.dropna().values.item() == 'w' + s.dropna().values.item() == "w" The last element yielded by the iterator will be a ``Series`` containing the last element of the longest string in the ``Series`` with all other diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst index 9e416f8eeb3f1..1215786b4cccc 100644 --- a/doc/source/whatsnew/v0.13.1.rst +++ b/doc/source/whatsnew/v0.13.1.rst @@ -31,16 +31,16 @@ Highlights include: .. ipython:: python - df = pd.DataFrame({'A': np.array(['foo', 'bar', 'bah', 'foo', 'bar'])}) - df['A'].iloc[0] = np.nan + df = pd.DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])}) + df["A"].iloc[0] = np.nan df The recommended way to do this type of assignment is: .. ipython:: python - df = pd.DataFrame({'A': np.array(['foo', 'bar', 'bah', 'foo', 'bar'])}) - df.loc[0, 'A'] = np.nan + df = pd.DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])}) + df.loc[0, "A"] = np.nan df Output formatting enhancements @@ -52,24 +52,27 @@ Output formatting enhancements .. 
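Several examples in this area set options globally with ``pd.set_option`` and reset them afterwards. When experimenting, ``pd.option_context`` scopes a change to a single block instead; a small sketch with arbitrary option values:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"A": range(100)})

    # the options revert automatically when the block exits
    with pd.option_context("display.max_rows", 5, "display.show_dimensions", True):
        print(df)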
ipython:: python - max_info_rows = pd.get_option('max_info_rows') + max_info_rows = pd.get_option("max_info_rows") - df = pd.DataFrame({'A': np.random.randn(10), - 'B': np.random.randn(10), - 'C': pd.date_range('20130101', periods=10) - }) + df = pd.DataFrame( + { + "A": np.random.randn(10), + "B": np.random.randn(10), + "C": pd.date_range("20130101", periods=10), + } + ) df.iloc[3:6, [0, 2]] = np.nan .. ipython:: python # set to not display the null counts - pd.set_option('max_info_rows', 0) + pd.set_option("max_info_rows", 0) df.info() .. ipython:: python # this is the default (same as in 0.13.0) - pd.set_option('max_info_rows', max_info_rows) + pd.set_option("max_info_rows", max_info_rows) df.info() - Add ``show_dimensions`` display option for the new DataFrame repr to control whether the dimensions print. @@ -77,10 +80,10 @@ Output formatting enhancements .. ipython:: python df = pd.DataFrame([[1, 2], [3, 4]]) - pd.set_option('show_dimensions', False) + pd.set_option("show_dimensions", False) df - pd.set_option('show_dimensions', True) + pd.set_option("show_dimensions", True) df - The ``ArrayFormatter`` for ``datetime`` and ``timedelta64`` now intelligently @@ -98,10 +101,9 @@ Output formatting enhancements .. ipython:: python - df = pd.DataFrame([pd.Timestamp('20010101'), - pd.Timestamp('20040601')], columns=['age']) - df['today'] = pd.Timestamp('20130419') - df['diff'] = df['today'] - df['age'] + df = pd.DataFrame([pd.Timestamp("20010101"), pd.Timestamp("20040601")], columns=["age"]) + df["today"] = pd.Timestamp("20130419") + df["diff"] = df["today"] - df["age"] df API changes @@ -115,8 +117,8 @@ API changes .. ipython:: python - s = pd.Series(['a', 'a|b', np.nan, 'a|c']) - s.str.get_dummies(sep='|') + s = pd.Series(["a", "a|b", np.nan, "a|c"]) + s.str.get_dummies(sep="|") - Added the ``NDFrame.equals()`` method to compare if two NDFrames are equal have equal axes, dtypes, and values. Added the @@ -126,8 +128,8 @@ API changes .. code-block:: python - df = pd.DataFrame({'col': ['foo', 0, np.nan]}) - df2 = pd.DataFrame({'col': [np.nan, 0, 'foo']}, index=[2, 1, 0]) + df = pd.DataFrame({"col": ["foo", 0, np.nan]}) + df2 = pd.DataFrame({"col": [np.nan, 0, "foo"]}, index=[2, 1, 0]) df.equals(df2) df.equals(df2.sort_index()) @@ -204,8 +206,7 @@ Enhancements .. code-block:: python # Try to infer the format for the index column - df = pd.read_csv('foo.csv', index_col=0, parse_dates=True, - infer_datetime_format=True) + df = pd.read_csv("foo.csv", index_col=0, parse_dates=True, infer_datetime_format=True) - ``date_format`` and ``datetime_format`` keywords can now be specified when writing to ``excel`` files (:issue:`4133`) @@ -215,10 +216,10 @@ Enhancements .. ipython:: python - shades = ['light', 'dark'] - colors = ['red', 'green', 'blue'] + shades = ["light", "dark"] + colors = ["red", "green", "blue"] - pd.MultiIndex.from_product([shades, colors], names=['shade', 'color']) + pd.MultiIndex.from_product([shades, colors], names=["shade", "color"]) - Panel :meth:`~pandas.Panel.apply` will work on non-ufuncs. See :ref:`the docs`. diff --git a/doc/source/whatsnew/v0.14.1.rst b/doc/source/whatsnew/v0.14.1.rst index 354d67a525d0e..78fd182ea86c3 100644 --- a/doc/source/whatsnew/v0.14.1.rst +++ b/doc/source/whatsnew/v0.14.1.rst @@ -68,7 +68,8 @@ API changes :suppress: import pandas.tseries.offsets as offsets - d = pd.Timestamp('2014-01-01 09:00') + + d = pd.Timestamp("2014-01-01 09:00") .. 
ipython:: python @@ -100,10 +101,10 @@ Enhancements import pandas.tseries.offsets as offsets day = offsets.Day() - day.apply(pd.Timestamp('2014-01-01 09:00')) + day.apply(pd.Timestamp("2014-01-01 09:00")) day = offsets.Day(normalize=True) - day.apply(pd.Timestamp('2014-01-01 09:00')) + day.apply(pd.Timestamp("2014-01-01 09:00")) - ``PeriodIndex`` is represented as the same format as ``DatetimeIndex`` (:issue:`7601`) - ``StringMethods`` now work on empty Series (:issue:`7242`) @@ -123,8 +124,7 @@ Enhancements .. ipython:: python - rng = pd.date_range('3/6/2012 00:00', periods=10, freq='D', - tz='dateutil/Europe/London') + rng = pd.date_range("3/6/2012 00:00", periods=10, freq="D", tz="dateutil/Europe/London") rng.tz See :ref:`the docs `. diff --git a/doc/source/whatsnew/v0.15.1.rst b/doc/source/whatsnew/v0.15.1.rst index da56f07e84d9f..a1d4f9d14a905 100644 --- a/doc/source/whatsnew/v0.15.1.rst +++ b/doc/source/whatsnew/v0.15.1.rst @@ -23,7 +23,7 @@ API changes .. ipython:: python - s = pd.Series(pd.date_range('20130101', periods=5, freq='D')) + s = pd.Series(pd.date_range("20130101", periods=5, freq="D")) s.iloc[2] = np.nan s @@ -52,8 +52,7 @@ API changes .. ipython:: python np.random.seed(2718281) - df = pd.DataFrame(np.random.randint(0, 100, (10, 2)), - columns=['jim', 'joe']) + df = pd.DataFrame(np.random.randint(0, 100, (10, 2)), columns=["jim", "joe"]) df.head() ts = pd.Series(5 * np.random.randint(0, 3, 10)) @@ -80,9 +79,9 @@ API changes .. ipython:: python - df = pd.DataFrame({'jim': range(5), 'joe': range(5, 10)}) + df = pd.DataFrame({"jim": range(5), "joe": range(5, 10)}) df - gr = df.groupby(df['jim'] < 2) + gr = df.groupby(df["jim"] < 2) previous behavior (excludes 1st column from output): @@ -106,7 +105,7 @@ API changes .. ipython:: python - s = pd.Series(['a', 'b', 'c', 'd'], [4, 3, 2, 1]) + s = pd.Series(["a", "b", "c", "d"], [4, 3, 2, 1]) s previous behavior: @@ -208,6 +207,7 @@ Enhancements .. ipython:: python from collections import deque + df1 = pd.DataFrame([1, 2, 3]) df2 = pd.DataFrame([4, 5, 6]) @@ -228,8 +228,9 @@ Enhancements .. ipython:: python - dfi = pd.DataFrame(1, index=pd.MultiIndex.from_product([['a'], - range(1000)]), columns=['A']) + dfi = pd.DataFrame( + 1, index=pd.MultiIndex.from_product([["a"], range(1000)]), columns=["A"] + ) previous behavior: diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst index a89ede8f024a0..39767684c01d0 100644 --- a/doc/source/whatsnew/v0.16.1.rst +++ b/doc/source/whatsnew/v0.16.1.rst @@ -209,9 +209,8 @@ when sampling from rows. .. ipython:: python - df = pd.DataFrame({'col1': [9, 8, 7, 6], - 'weight_column': [0.5, 0.4, 0.1, 0]}) - df.sample(n=3, weights='weight_column') + df = pd.DataFrame({"col1": [9, 8, 7, 6], "weight_column": [0.5, 0.4, 0.1, 0]}) + df.sample(n=3, weights="weight_column") .. _whatsnew_0161.enhancements.string: @@ -229,7 +228,7 @@ enhancements make string operations easier and more consistent with standard pyt .. ipython:: python - idx = pd.Index([' jack', 'jill ', ' jesse ', 'frank']) + idx = pd.Index([" jack", "jill ", " jesse ", "frank"]) idx.str.strip() One special case for the ``.str`` accessor on ``Index`` is that if a string method returns ``bool``, the ``.str`` accessor @@ -238,11 +237,11 @@ enhancements make string operations easier and more consistent with standard pyt .. 
ipython:: python - idx = pd.Index(['a1', 'a2', 'b1', 'b2']) + idx = pd.Index(["a1", "a2", "b1", "b2"]) s = pd.Series(range(4), index=idx) s - idx.str.startswith('a') - s[s.index.str.startswith('a')] + idx.str.startswith("a") + s[s.index.str.startswith("a")] - The following new methods are accessible via ``.str`` accessor to apply the function to each values. (:issue:`9766`, :issue:`9773`, :issue:`10031`, :issue:`10045`, :issue:`10052`) @@ -257,21 +256,21 @@ enhancements make string operations easier and more consistent with standard pyt .. ipython:: python - s = pd.Series(['a,b', 'a,c', 'b,c']) + s = pd.Series(["a,b", "a,c", "b,c"]) # return Series - s.str.split(',') + s.str.split(",") # return DataFrame - s.str.split(',', expand=True) + s.str.split(",", expand=True) - idx = pd.Index(['a,b', 'a,c', 'b,c']) + idx = pd.Index(["a,b", "a,c", "b,c"]) # return Index - idx.str.split(',') + idx.str.split(",") # return MultiIndex - idx.str.split(',', expand=True) + idx.str.split(",", expand=True) - Improved ``extract`` and ``get_dummies`` methods for ``Index.str`` (:issue:`9980`) @@ -286,9 +285,9 @@ Other enhancements .. ipython:: python - pd.Timestamp('2014-08-01 09:00') + pd.tseries.offsets.BusinessHour() - pd.Timestamp('2014-08-01 07:00') + pd.tseries.offsets.BusinessHour() - pd.Timestamp('2014-08-01 16:30') + pd.tseries.offsets.BusinessHour() + pd.Timestamp("2014-08-01 09:00") + pd.tseries.offsets.BusinessHour() + pd.Timestamp("2014-08-01 07:00") + pd.tseries.offsets.BusinessHour() + pd.Timestamp("2014-08-01 16:30") + pd.tseries.offsets.BusinessHour() - ``DataFrame.diff`` now takes an ``axis`` parameter that determines the direction of differencing (:issue:`9727`) @@ -300,8 +299,8 @@ Other enhancements .. ipython:: python - df = pd.DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C']) - df.drop(['A', 'X'], axis=1, errors='ignore') + df = pd.DataFrame(np.random.randn(3, 3), columns=["A", "B", "C"]) + df.drop(["A", "X"], axis=1, errors="ignore") - Add support for separating years and quarters using dashes, for example 2014-Q1. (:issue:`9688`) @@ -382,19 +381,16 @@ New behavior .. ipython:: python - pd.set_option('display.width', 80) - pd.Index(range(4), name='foo') - pd.Index(range(30), name='foo') - pd.Index(range(104), name='foo') - pd.CategoricalIndex(['a', 'bb', 'ccc', 'dddd'], - ordered=True, name='foobar') - pd.CategoricalIndex(['a', 'bb', 'ccc', 'dddd'] * 10, - ordered=True, name='foobar') - pd.CategoricalIndex(['a', 'bb', 'ccc', 'dddd'] * 100, - ordered=True, name='foobar') - pd.date_range('20130101', periods=4, name='foo', tz='US/Eastern') - pd.date_range('20130101', periods=25, freq='D') - pd.date_range('20130101', periods=104, name='foo', tz='US/Eastern') + pd.set_option("display.width", 80) + pd.Index(range(4), name="foo") + pd.Index(range(30), name="foo") + pd.Index(range(104), name="foo") + pd.CategoricalIndex(["a", "bb", "ccc", "dddd"], ordered=True, name="foobar") + pd.CategoricalIndex(["a", "bb", "ccc", "dddd"] * 10, ordered=True, name="foobar") + pd.CategoricalIndex(["a", "bb", "ccc", "dddd"] * 100, ordered=True, name="foobar") + pd.date_range("20130101", periods=4, name="foo", tz="US/Eastern") + pd.date_range("20130101", periods=25, freq="D") + pd.date_range("20130101", periods=104, name="foo", tz="US/Eastern") .. 
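The truncation of long index reprs shown above is itself configurable via the ``display.max_seq_items`` option; a brief sketch (the threshold of 10 is arbitrary):

.. code-block:: python

    import pandas as pd

    idx = pd.Index(range(104), name="foo")

    # show at most 10 elements before eliding with "..."
    with pd.option_context("display.max_seq_items", 10):
        print(repr(idx))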
_whatsnew_0161.performance: diff --git a/doc/source/whatsnew/v0.16.2.rst b/doc/source/whatsnew/v0.16.2.rst index 2cb0cbec68eff..bb2aa166419b4 100644 --- a/doc/source/whatsnew/v0.16.2.rst +++ b/doc/source/whatsnew/v0.16.2.rst @@ -48,9 +48,10 @@ This can be rewritten as .. code-block:: python - (df.pipe(h) # noqa F821 - .pipe(g, arg1=1) # noqa F821 - .pipe(f, arg2=2, arg3=3) # noqa F821 + ( + df.pipe(h) # noqa F821 + .pipe(g, arg1=1) # noqa F821 + .pipe(f, arg2=2, arg3=3) # noqa F821 ) Now both the code and the logic flow from top to bottom. Keyword arguments are next to @@ -64,15 +65,16 @@ of ``(function, keyword)`` indicating where the DataFrame should flow. For examp import statsmodels.formula.api as sm - bb = pd.read_csv('data/baseball.csv', index_col='id') + bb = pd.read_csv("data/baseball.csv", index_col="id") # sm.ols takes (formula, data) - (bb.query('h > 0') - .assign(ln_h=lambda df: np.log(df.h)) - .pipe((sm.ols, 'data'), 'hr ~ ln_h + year + g + C(lg)') - .fit() - .summary() - ) + ( + bb.query("h > 0") + .assign(ln_h=lambda df: np.log(df.h)) + .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)") + .fit() + .summary() + ) The pipe method is inspired by unix pipes, which stream text through processes. More recently dplyr_ and magrittr_ have introduced the diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst index 3e49bb30401a3..1658f877f5523 100644 --- a/doc/source/whatsnew/v0.17.0.rst +++ b/doc/source/whatsnew/v0.17.0.rst @@ -80,9 +80,13 @@ The new implementation allows for having a single-timezone across all rows, with .. ipython:: python - df = pd.DataFrame({'A': pd.date_range('20130101', periods=3), - 'B': pd.date_range('20130101', periods=3, tz='US/Eastern'), - 'C': pd.date_range('20130101', periods=3, tz='CET')}) + df = pd.DataFrame( + { + "A": pd.date_range("20130101", periods=3), + "B": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "C": pd.date_range("20130101", periods=3, tz="CET"), + } + ) df df.dtypes @@ -95,8 +99,8 @@ This uses a new-dtype representation as well, that is very similar in look-and-f .. ipython:: python - df['B'].dtype - type(df['B'].dtype) + df["B"].dtype + type(df["B"].dtype) .. note:: @@ -119,8 +123,8 @@ This uses a new-dtype representation as well, that is very similar in look-and-f .. ipython:: python - pd.date_range('20130101', periods=3, tz='US/Eastern') - pd.date_range('20130101', periods=3, tz='US/Eastern').dtype + pd.date_range("20130101", periods=3, tz="US/Eastern") + pd.date_range("20130101", periods=3, tz="US/Eastern").dtype .. _whatsnew_0170.gil: @@ -138,9 +142,10 @@ as well as the ``.sum()`` operation. N = 1000000 ngroups = 10 - df = DataFrame({'key': np.random.randint(0, ngroups, size=N), - 'data': np.random.randn(N)}) - df.groupby('key')['data'].sum() + df = DataFrame( + {"key": np.random.randint(0, ngroups, size=N), "data": np.random.randn(N)} + ) + df.groupby("key")["data"].sum() Releasing of the GIL could benefit an application that uses threads for user interactions (e.g. QT_), or performing multi-threaded computations. A nice example of a library that can handle these types of computation-in-parallel is the dask_ library. @@ -189,16 +194,16 @@ We are now supporting a ``Series.dt.strftime`` method for datetime-likes to gene .. ipython:: python # DatetimeIndex - s = pd.Series(pd.date_range('20130101', periods=4)) + s = pd.Series(pd.date_range("20130101", periods=4)) s - s.dt.strftime('%Y/%m/%d') + s.dt.strftime("%Y/%m/%d") .. 
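The ``PeriodIndex`` counterpart of this example continues below. As an aside, any standard Python ``strftime`` directive works here; a short sketch with arbitrary format strings:

.. code-block:: python

    import pandas as pd

    s = pd.Series(pd.date_range("20130101", periods=2))
    s.dt.strftime("%B %d, %Y")  # full month name, e.g. "January 01, 2013"
    s.dt.strftime("%Y-%m-%dT%H:%M:%S")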
ipython:: python # PeriodIndex - s = pd.Series(pd.period_range('20130101', periods=4)) + s = pd.Series(pd.period_range("20130101", periods=4)) s - s.dt.strftime('%Y/%m/%d') + s.dt.strftime("%Y/%m/%d") The string format is as the python standard library and details can be found `here `_ @@ -210,7 +215,7 @@ Series.dt.total_seconds .. ipython:: python # TimedeltaIndex - s = pd.Series(pd.timedelta_range('1 minutes', periods=4)) + s = pd.Series(pd.timedelta_range("1 minutes", periods=4)) s s.dt.total_seconds() @@ -225,18 +230,18 @@ A multiplied freq represents a span of corresponding length. The example below c .. ipython:: python - p = pd.Period('2015-08-01', freq='3D') + p = pd.Period("2015-08-01", freq="3D") p p + 1 p - 2 p.to_timestamp() - p.to_timestamp(how='E') + p.to_timestamp(how="E") You can use the multiplied freq in ``PeriodIndex`` and ``period_range``. .. ipython:: python - idx = pd.period_range('2015-08-01', periods=4, freq='2D') + idx = pd.period_range("2015-08-01", periods=4, freq="2D") idx idx + 1 @@ -249,14 +254,14 @@ Support for SAS XPORT files .. code-block:: python - df = pd.read_sas('sas_xport.xpt') + df = pd.read_sas("sas_xport.xpt") It is also possible to obtain an iterator and read an XPORT file incrementally. .. code-block:: python - for df in pd.read_sas('sas_xport.xpt', chunksize=10000): + for df in pd.read_sas("sas_xport.xpt", chunksize=10000): do_something(df) See the :ref:`docs ` for more details. @@ -270,7 +275,7 @@ Support for math functions in .eval() .. code-block:: python - df = pd.DataFrame({'a': np.random.randn(10)}) + df = pd.DataFrame({"a": np.random.randn(10)}) df.eval("b = sin(a)") The support math functions are ``sin``, ``cos``, ``exp``, ``log``, ``expm1``, ``log1p``, @@ -292,23 +297,26 @@ See the :ref:`documentation ` for more details. .. ipython:: python - df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], - columns=pd.MultiIndex.from_product( - [['foo', 'bar'], ['a', 'b']], names=['col1', 'col2']), - index=pd.MultiIndex.from_product([['j'], ['l', 'k']], - names=['i1', 'i2'])) + df = pd.DataFrame( + [[1, 2, 3, 4], [5, 6, 7, 8]], + columns=pd.MultiIndex.from_product( + [["foo", "bar"], ["a", "b"]], names=["col1", "col2"] + ), + index=pd.MultiIndex.from_product([["j"], ["l", "k"]], names=["i1", "i2"]), + ) df - df.to_excel('test.xlsx') + df.to_excel("test.xlsx") - df = pd.read_excel('test.xlsx', header=[0, 1], index_col=[0, 1]) + df = pd.read_excel("test.xlsx", header=[0, 1], index_col=[0, 1]) df .. ipython:: python :suppress: import os - os.remove('test.xlsx') + + os.remove("test.xlsx") Previously, it was necessary to specify the ``has_index_names`` argument in ``read_excel``, if the serialized data had index names. For version 0.17.0 the output format of ``to_excel`` @@ -354,14 +362,14 @@ Some East Asian countries use Unicode characters its width is corresponding to 2 .. ipython:: python - df = pd.DataFrame({u'国籍': ['UK', u'日本'], u'名前': ['Alice', u'しのぶ']}) + df = pd.DataFrame({u"国籍": ["UK", u"日本"], u"名前": ["Alice", u"しのぶ"]}) df; .. image:: ../_static/option_unicode01.png .. ipython:: python - pd.set_option('display.unicode.east_asian_width', True) + pd.set_option("display.unicode.east_asian_width", True) df; .. image:: ../_static/option_unicode02.png @@ -371,7 +379,7 @@ For further details, see :ref:`here ` .. ipython:: python :suppress: - pd.set_option('display.unicode.east_asian_width', False) + pd.set_option("display.unicode.east_asian_width", False) .. _whatsnew_0170.enhancements.other: @@ -391,9 +399,9 @@ Other enhancements .. 
ipython:: python - df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']}) - df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]}) - pd.merge(df1, df2, on='col1', how='outer', indicator=True) + df1 = pd.DataFrame({"col1": [0, 1], "col_left": ["a", "b"]}) + df2 = pd.DataFrame({"col1": [1, 2, 2], "col_right": [2, 2, 2]}) + pd.merge(df1, df2, on="col1", how="outer", indicator=True) For more, see the :ref:`updated docs ` @@ -407,7 +415,7 @@ Other enhancements .. ipython:: python - foo = pd.Series([1, 2], name='foo') + foo = pd.Series([1, 2], name="foo") bar = pd.Series([1, 2]) baz = pd.Series([4, 5]) @@ -434,46 +442,43 @@ Other enhancements .. ipython:: python ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13]) - ser.interpolate(limit=1, limit_direction='both') + ser.interpolate(limit=1, limit_direction="both") - Added a ``DataFrame.round`` method to round the values to a variable number of decimal places (:issue:`10568`). .. ipython:: python - df = pd.DataFrame(np.random.random([3, 3]), - columns=['A', 'B', 'C'], - index=['first', 'second', 'third']) + df = pd.DataFrame( + np.random.random([3, 3]), + columns=["A", "B", "C"], + index=["first", "second", "third"], + ) df df.round(2) - df.round({'A': 0, 'C': 2}) + df.round({"A": 0, "C": 2}) - ``drop_duplicates`` and ``duplicated`` now accept a ``keep`` keyword to target first, last, and all duplicates. The ``take_last`` keyword is deprecated, see :ref:`here ` (:issue:`6511`, :issue:`8505`) .. ipython:: python - s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D']) + s = pd.Series(["A", "B", "C", "A", "B", "D"]) s.drop_duplicates() - s.drop_duplicates(keep='last') + s.drop_duplicates(keep="last") s.drop_duplicates(keep=False) - Reindex now has a ``tolerance`` argument that allows for finer control of :ref:`basics.limits_on_reindex_fill` (:issue:`10411`): .. ipython:: python - df = pd.DataFrame({'x': range(5), - 't': pd.date_range('2000-01-01', periods=5)}) - df.reindex([0.1, 1.9, 3.5], - method='nearest', - tolerance=0.2) + df = pd.DataFrame({"x": range(5), "t": pd.date_range("2000-01-01", periods=5)}) + df.reindex([0.1, 1.9, 3.5], method="nearest", tolerance=0.2) When used on a ``DatetimeIndex``, ``TimedeltaIndex`` or ``PeriodIndex``, ``tolerance`` will coerced into a ``Timedelta`` if possible. This allows you to specify tolerance with a string: .. ipython:: python - df = df.set_index('t') - df.reindex(pd.to_datetime(['1999-12-31']), - method='nearest', - tolerance='1 day') + df = df.set_index("t") + df.reindex(pd.to_datetime(["1999-12-31"]), method="nearest", tolerance="1 day") ``tolerance`` is also exposed by the lower level ``Index.get_indexer`` and ``Index.get_loc`` methods. @@ -627,13 +632,13 @@ Of course you can coerce this as well. .. ipython:: python - pd.to_datetime(['2009-07-31', 'asd'], errors='coerce') + pd.to_datetime(["2009-07-31", "asd"], errors="coerce") To keep the previous behavior, you can use ``errors='ignore'``: .. ipython:: python - pd.to_datetime(['2009-07-31', 'asd'], errors='ignore') + pd.to_datetime(["2009-07-31", "asd"], errors="ignore") Furthermore, ``pd.to_timedelta`` has gained a similar API, of ``errors='raise'|'ignore'|'coerce'``, and the ``coerce`` keyword has been deprecated in favor of ``errors='coerce'``. @@ -667,9 +672,9 @@ New behavior: .. ipython:: python - pd.Timestamp('2012Q2') - pd.Timestamp('2014') - pd.DatetimeIndex(['2012Q2', '2014']) + pd.Timestamp("2012Q2") + pd.Timestamp("2014") + pd.DatetimeIndex(["2012Q2", "2014"]) .. note:: @@ -678,6 +683,7 @@ New behavior: .. 
ipython:: python import pandas.tseries.offsets as offsets + pd.Timestamp.now() pd.Timestamp.now() + offsets.DateOffset(years=1) @@ -780,8 +786,7 @@ Previous behavior: .. ipython:: python - df_with_missing = pd.DataFrame({'col1': [0, np.nan, 2], - 'col2': [1, np.nan, np.nan]}) + df_with_missing = pd.DataFrame({"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]}) df_with_missing @@ -806,18 +811,16 @@ New behavior: .. ipython:: python - df_with_missing.to_hdf('file.h5', - 'df_with_missing', - format='table', - mode='w') + df_with_missing.to_hdf("file.h5", "df_with_missing", format="table", mode="w") - pd.read_hdf('file.h5', 'df_with_missing') + pd.read_hdf("file.h5", "df_with_missing") .. ipython:: python :suppress: import os - os.remove('file.h5') + + os.remove("file.h5") See the :ref:`docs ` for more details. @@ -848,8 +851,8 @@ regular formatting as well as scientific notation, similar to how numpy's ``prec .. ipython:: python - pd.set_option('display.precision', 2) - pd.DataFrame({'x': [123.456789]}) + pd.set_option("display.precision", 2) + pd.DataFrame({"x": [123.456789]}) To preserve output behavior with prior versions the default value of ``display.precision`` has been reduced to ``6`` from ``7``. @@ -857,7 +860,7 @@ from ``7``. .. ipython:: python :suppress: - pd.set_option('display.precision', 6) + pd.set_option("display.precision", 6) .. _whatsnew_0170.api_breaking.categorical_unique: @@ -871,14 +874,11 @@ Changes to ``Categorical.unique`` .. ipython:: python - cat = pd.Categorical(['C', 'A', 'B', 'C'], - categories=['A', 'B', 'C'], - ordered=True) + cat = pd.Categorical(["C", "A", "B", "C"], categories=["A", "B", "C"], ordered=True) cat cat.unique() - cat = pd.Categorical(['C', 'A', 'B', 'C'], - categories=['A', 'B', 'C']) + cat = pd.Categorical(["C", "A", "B", "C"], categories=["A", "B", "C"]) cat cat.unique() @@ -980,9 +980,11 @@ Removal of prior version deprecations/changes .. ipython:: python np.random.seed(1234) - df = pd.DataFrame(np.random.randn(5, 2), - columns=list('AB'), - index=pd.date_range('2013-01-01', periods=5)) + df = pd.DataFrame( + np.random.randn(5, 2), + columns=list("AB"), + index=pd.date_range("2013-01-01", periods=5), + ) df Previously @@ -1005,7 +1007,7 @@ Removal of prior version deprecations/changes .. ipython:: python - df.add(df.A, axis='index') + df.add(df.A, axis="index") - Remove ``table`` keyword in ``HDFStore.put/append``, in favor of using ``format=`` (:issue:`4645`) diff --git a/doc/source/whatsnew/v0.17.1.rst b/doc/source/whatsnew/v0.17.1.rst index 5d15a01aee5a0..6b0a28ec47568 100644 --- a/doc/source/whatsnew/v0.17.1.rst +++ b/doc/source/whatsnew/v0.17.1.rst @@ -52,8 +52,8 @@ Here's a quick example: .. ipython:: python np.random.seed(123) - df = pd.DataFrame(np.random.randn(10, 5), columns=list('abcde')) - html = df.style.background_gradient(cmap='viridis', low=.5) + df = pd.DataFrame(np.random.randn(10, 5), columns=list("abcde")) + html = df.style.background_gradient(cmap="viridis", low=0.5) We can render the HTML to get the following table. @@ -80,14 +80,14 @@ Enhancements .. 
ipython:: python - df = pd.DataFrame({'A': ['foo'] * 1000}) # noqa: F821 - df['B'] = df['A'].astype('category') + df = pd.DataFrame({"A": ["foo"] * 1000}) # noqa: F821 + df["B"] = df["A"].astype("category") # shows the '+' as we have object dtypes df.info() # we have an accurate memory assessment (but can be expensive to compute this) - df.info(memory_usage='deep') + df.info(memory_usage="deep") - ``Index`` now has a ``fillna`` method (:issue:`10089`) @@ -99,11 +99,11 @@ Enhancements .. ipython:: python - s = pd.Series(list('aabb')).astype('category') + s = pd.Series(list("aabb")).astype("category") s s.str.contains("a") - date = pd.Series(pd.date_range('1/1/2015', periods=5)).astype('category') + date = pd.Series(pd.date_range("1/1/2015", periods=5)).astype("category") date date.dt.day diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst index 13ed6bc38163b..3db00f686d62c 100644 --- a/doc/source/whatsnew/v0.18.1.rst +++ b/doc/source/whatsnew/v0.18.1.rst @@ -42,6 +42,7 @@ see :ref:`Custom Business Hour ` (:issue:`11514`) from pandas.tseries.offsets import CustomBusinessHour from pandas.tseries.holiday import USFederalHolidayCalendar + bhour_us = CustomBusinessHour(calendar=USFederalHolidayCalendar()) Friday before MLK Day @@ -49,6 +50,7 @@ Friday before MLK Day .. ipython:: python import datetime + dt = datetime.datetime(2014, 1, 17, 15) dt + bhour_us @@ -72,41 +74,42 @@ Previously you would have to do this to get a rolling window mean per-group: .. ipython:: python - df = pd.DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8, - 'B': np.arange(40)}) + df = pd.DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) df .. ipython:: python - df.groupby('A').apply(lambda x: x.rolling(4).B.mean()) + df.groupby("A").apply(lambda x: x.rolling(4).B.mean()) Now you can do: .. ipython:: python - df.groupby('A').rolling(4).B.mean() + df.groupby("A").rolling(4).B.mean() For ``.resample(..)`` type of operations, previously you would have to: .. ipython:: python - df = pd.DataFrame({'date': pd.date_range(start='2016-01-01', - periods=4, - freq='W'), - 'group': [1, 1, 2, 2], - 'val': [5, 6, 7, 8]}).set_index('date') + df = pd.DataFrame( + { + "date": pd.date_range(start="2016-01-01", periods=4, freq="W"), + "group": [1, 1, 2, 2], + "val": [5, 6, 7, 8], + } + ).set_index("date") df .. ipython:: python - df.groupby('group').apply(lambda x: x.resample('1D').ffill()) + df.groupby("group").apply(lambda x: x.resample("1D").ffill()) Now you can do: .. ipython:: python - df.groupby('group').resample('1D').ffill() + df.groupby("group").resample("1D").ffill() .. _whatsnew_0181.enhancements.method_chain: @@ -129,9 +132,7 @@ arguments. .. ipython:: python - df = pd.DataFrame({'A': [1, 2, 3], - 'B': [4, 5, 6], - 'C': [7, 8, 9]}) + df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) df.where(lambda x: x > 4, lambda x: x + 10) Methods ``.loc[]``, ``.iloc[]``, ``.ix[]`` @@ -146,7 +147,7 @@ can return a valid boolean indexer or anything which is valid for these indexer' df.loc[lambda x: x.A >= 2, lambda x: x.sum() > 10] # callable returns list of labels - df.loc[lambda x: [1, 2], lambda x: ['A', 'B']] + df.loc[lambda x: [1, 2], lambda x: ["A", "B"]] Indexing with``[]`` """"""""""""""""""" @@ -157,17 +158,15 @@ class and index type. .. ipython:: python - df[lambda x: 'A'] + df[lambda x: "A"] Using these methods / indexers, you can chain data selection operations without using temporary variable. .. 
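Before the baseball example that follows, here is a tiny self-contained sketch of the same chaining idea; the frame and column names are made up for illustration:

.. code-block:: python

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"x": np.arange(5), "y": np.arange(5, 10)})

    # assign a derived column, then select rows/columns with callables
    result = df.assign(z=lambda d: d.x + d.y).loc[lambda d: d.z > 7, ["x", "z"]]
    print(result)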
ipython:: python - bb = pd.read_csv('data/baseball.csv', index_col='id') - (bb.groupby(['year', 'team']) - .sum() - .loc[lambda df: df.r > 100]) + bb = pd.read_csv("data/baseball.csv", index_col="id") + (bb.groupby(["year", "team"]).sum().loc[lambda df: df.r > 100]) .. _whatsnew_0181.partial_string_indexing: @@ -180,13 +179,13 @@ Partial string indexing now matches on ``DateTimeIndex`` when part of a ``MultiI dft2 = pd.DataFrame( np.random.randn(20, 1), - columns=['A'], - index=pd.MultiIndex.from_product([pd.date_range('20130101', - periods=10, - freq='12H'), - ['a', 'b']])) + columns=["A"], + index=pd.MultiIndex.from_product( + [pd.date_range("20130101", periods=10, freq="12H"), ["a", "b"]] + ), + ) dft2 - dft2.loc['2013-01-05'] + dft2.loc["2013-01-05"] On other levels @@ -195,7 +194,7 @@ On other levels idx = pd.IndexSlice dft2 = dft2.swaplevel(0, 1).sort_index() dft2 - dft2.loc[idx[:, '2013-01-05'], :] + dft2.loc[idx[:, "2013-01-05"], :] .. _whatsnew_0181.enhancements.assembling: @@ -206,10 +205,9 @@ Assembling datetimes .. ipython:: python - df = pd.DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5], - 'hour': [2, 3]}) + df = pd.DataFrame( + {"year": [2015, 2016], "month": [2, 3], "day": [4, 5], "hour": [2, 3]} + ) df Assembling using the passed frame. @@ -222,7 +220,7 @@ You can pass only the columns that you need to assemble. .. ipython:: python - pd.to_datetime(df[['year', 'month', 'day']]) + pd.to_datetime(df[["year", "month", "day"]]) .. _whatsnew_0181.other: @@ -243,7 +241,7 @@ Other enhancements .. ipython:: python - idx = pd.Index([1., 2., 3., 4.], dtype='float') + idx = pd.Index([1.0, 2.0, 3.0, 4.0], dtype="float") # default, allow_fill=True, fill_value=None idx.take([2, -1]) @@ -253,8 +251,8 @@ Other enhancements .. ipython:: python - idx = pd.Index(['a|b', 'a|c', 'b|c']) - idx.str.get_dummies('|') + idx = pd.Index(["a|b", "a|c", "b|c"]) + idx.str.get_dummies("|") - ``pd.crosstab()`` has gained a ``normalize`` argument for normalizing frequency tables (:issue:`12569`). Examples in the updated docs :ref:`here `. @@ -313,8 +311,7 @@ The index in ``.groupby(..).nth()`` output is now more consistent when the ``as_ .. ipython:: python - df = pd.DataFrame({'A': ['a', 'b', 'a'], - 'B': [1, 2, 3]}) + df = pd.DataFrame({"A": ["a", "b", "a"], "B": [1, 2, 3]}) df Previous behavior: @@ -337,16 +334,16 @@ New behavior: .. ipython:: python - df.groupby('A', as_index=True)['B'].nth(0) - df.groupby('A', as_index=False)['B'].nth(0) + df.groupby("A", as_index=True)["B"].nth(0) + df.groupby("A", as_index=False)["B"].nth(0) Furthermore, previously, a ``.groupby`` would always sort, regardless if ``sort=False`` was passed with ``.nth()``. .. ipython:: python np.random.seed(1234) - df = pd.DataFrame(np.random.randn(100, 2), columns=['a', 'b']) - df['c'] = np.random.randint(0, 4, 100) + df = pd.DataFrame(np.random.randn(100, 2), columns=["a", "b"]) + df["c"] = np.random.randint(0, 4, 100) Previous behavior: @@ -374,8 +371,8 @@ New behavior: .. ipython:: python - df.groupby('c', sort=True).nth(1) - df.groupby('c', sort=False).nth(1) + df.groupby("c", sort=True).nth(1) + df.groupby("c", sort=False).nth(1) .. _whatsnew_0181.numpy_compatibility: @@ -421,8 +418,9 @@ Using ``apply`` on resampling groupby operations (using a ``pd.TimeGrouper``) no .. 
ipython:: python - df = pd.DataFrame({'date': pd.to_datetime(['10/10/2000', '11/10/2000']), - 'value': [10, 13]}) + df = pd.DataFrame( + {"date": pd.to_datetime(["10/10/2000", "11/10/2000"]), "value": [10, 13]} + ) df Previous behavior: diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 5732367a69af2..08ccc1565125f 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -49,10 +49,8 @@ except that we match on nearest key rather than equal keys. .. ipython:: python - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 6, 7], - 'right_val': [1, 2, 3, 6, 7]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) left right @@ -62,13 +60,13 @@ recent value otherwise. .. ipython:: python - pd.merge_asof(left, right, on='a') + pd.merge_asof(left, right, on="a") We can also match rows ONLY with prior data, and not an exact match. .. ipython:: python - pd.merge_asof(left, right, on='a', allow_exact_matches=False) + pd.merge_asof(left, right, on="a", allow_exact_matches=False) In a typical time-series example, we have ``trades`` and ``quotes`` and we want to ``asof-join`` them. @@ -76,36 +74,44 @@ This also illustrates using the ``by`` parameter to group data before merging. .. ipython:: python - trades = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.038', - '20160525 13:30:00.048', - '20160525 13:30:00.048', - '20160525 13:30:00.048']), - 'ticker': ['MSFT', 'MSFT', - 'GOOG', 'GOOG', 'AAPL'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100]}, - columns=['time', 'ticker', 'price', 'quantity']) - - quotes = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.030', - '20160525 13:30:00.041', - '20160525 13:30:00.048', - '20160525 13:30:00.049', - '20160525 13:30:00.072', - '20160525 13:30:00.075']), - 'ticker': ['GOOG', 'MSFT', 'MSFT', 'MSFT', - 'GOOG', 'AAPL', 'GOOG', 'MSFT'], - 'bid': [720.50, 51.95, 51.97, 51.99, - 720.50, 97.99, 720.50, 52.01], - 'ask': [720.93, 51.96, 51.98, 52.00, - 720.93, 98.01, 720.88, 52.03]}, - columns=['time', 'ticker', 'bid', 'ask']) + trades = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.038", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + }, + columns=["time", "ticker", "price", "quantity"], + ) + + quotes = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.030", + "20160525 13:30:00.041", + "20160525 13:30:00.048", + "20160525 13:30:00.049", + "20160525 13:30:00.072", + "20160525 13:30:00.075", + ] + ), + "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], + "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], + }, + columns=["time", "ticker", "bid", "ask"], + ) .. ipython:: python @@ -118,9 +124,7 @@ that forward filling happens automatically taking the most recent non-NaN value. .. 
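As an aside, ``merge_asof`` also accepts a ``tolerance`` keyword that caps how far back a match may reach; a minimal sketch with made-up trade and quote times:

.. code-block:: python

    import pandas as pd

    trades_ = pd.DataFrame(
        {"time": pd.to_datetime(["2016-05-25 13:30:00.023",
                                 "2016-05-25 13:30:00.048"]),
         "price": [51.95, 720.77]}
    )
    quotes_ = pd.DataFrame(
        {"time": pd.to_datetime(["2016-05-25 13:30:00.020"]), "bid": [720.50]}
    )

    # only accept a quote within 5 milliseconds of the trade time
    pd.merge_asof(trades_, quotes_, on="time", tolerance=pd.Timedelta("5ms"))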
ipython:: python - pd.merge_asof(trades, quotes, - on='time', - by='ticker') + pd.merge_asof(trades, quotes, on="time", by="ticker") This returns a merged DataFrame with the entries in the same order as the original left passed DataFrame (``trades`` in this case), with the fields of the ``quotes`` merged. @@ -135,9 +139,10 @@ See the full documentation :ref:`here `. .. ipython:: python - dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=pd.date_range('20130101 09:00:00', - periods=5, freq='s')) + dft = pd.DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=pd.date_range("20130101 09:00:00", periods=5, freq="s"), + ) dft This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. @@ -151,20 +156,26 @@ Specifying an offset allows a more intuitive specification of the rolling freque .. ipython:: python - dft.rolling('2s').sum() + dft.rolling("2s").sum() Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation. .. ipython:: python - dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=pd.Index([pd.Timestamp('20130101 09:00:00'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:05'), - pd.Timestamp('20130101 09:00:06')], - name='foo')) + dft = pd.DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=pd.Index( + [ + pd.Timestamp("20130101 09:00:00"), + pd.Timestamp("20130101 09:00:02"), + pd.Timestamp("20130101 09:00:03"), + pd.Timestamp("20130101 09:00:05"), + pd.Timestamp("20130101 09:00:06"), + ], + name="foo", + ), + ) dft dft.rolling(2).sum() @@ -173,7 +184,7 @@ Using the time-specification generates variable windows for this sparse data. .. ipython:: python - dft.rolling('2s').sum() + dft.rolling("2s").sum() Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the default of the index) in a DataFrame. @@ -182,7 +193,7 @@ default of the index) in a DataFrame. dft = dft.reset_index() dft - dft.rolling('2s', on='foo').sum() + dft.rolling("2s", on="foo").sum() .. _whatsnew_0190.enhancements.read_csv_dupe_col_names_support: @@ -199,8 +210,8 @@ they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :is .. ipython:: python - data = '0,1,2\n3,4,5' - names = ['a', 'b', 'a'] + data = "0,1,2\n3,4,5" + names = ["a", "b", "a"] **Previous behavior**: @@ -235,17 +246,22 @@ converting to ``Categorical`` after parsing. See the io :ref:`docs here ` (:issue:`10008`, :issue:`13156`) @@ -415,7 +431,7 @@ The ``pd.get_dummies`` function now returns dummy-encoded columns as small integ .. ipython:: python - pd.get_dummies(['a', 'b', 'a', 'c']).dtypes + pd.get_dummies(["a", "b", "a", "c"]).dtypes .. _whatsnew_0190.enhancements.to_numeric_downcast: @@ -427,9 +443,9 @@ Downcast values to smallest possible dtype in ``to_numeric`` .. ipython:: python - s = ['1', 2, 3] - pd.to_numeric(s, downcast='unsigned') - pd.to_numeric(s, downcast='integer') + s = ["1", 2, 3] + pd.to_numeric(s, downcast="unsigned") + pd.to_numeric(s, downcast="integer") .. _whatsnew_0190.dev_api: @@ -447,7 +463,8 @@ The following are now part of this API: import pprint from pandas.api import types - funcs = [f for f in dir(types) if not f.startswith('_')] + + funcs = [f for f in dir(types) if not f.startswith("_")] pprint.pprint(funcs) .. note:: @@ -472,16 +489,16 @@ Other enhancements .. 
ipython:: python - df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5), - 'a': np.arange(5)}, - index=pd.MultiIndex.from_arrays([[1, 2, 3, 4, 5], - pd.date_range('2015-01-01', - freq='W', - periods=5) - ], names=['v', 'd'])) + df = pd.DataFrame( + {"date": pd.date_range("2015-01-01", freq="W", periods=5), "a": np.arange(5)}, + index=pd.MultiIndex.from_arrays( + [[1, 2, 3, 4, 5], pd.date_range("2015-01-01", freq="W", periods=5)], + names=["v", "d"], + ), + ) df - df.resample('M', on='date').sum() - df.resample('M', level='d').sum() + df.resample("M", on="date").sum() + df.resample("M", level="d").sum() - The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials `__. See the docs for more details (:issue:`13577`). - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains to raising a ``NonExistentTimeError`` (:issue:`13057`) @@ -507,10 +524,9 @@ Other enhancements .. ipython:: python - df = pd.DataFrame({'A': [2, 7], 'B': [3, 5], 'C': [4, 8]}, - index=['row1', 'row2']) + df = pd.DataFrame({"A": [2, 7], "B": [3, 5], "C": [4, 8]}, index=["row1", "row2"]) df - df.sort_values(by='row2', axis=1) + df.sort_values(by="row2", axis=1) - Added documentation to :ref:`I/O` regarding the perils of reading in columns with mixed dtypes and how to handle it (:issue:`13746`) - :meth:`~DataFrame.to_html` now has a ``border`` argument to control the value in the opening ```` tag. The default is the value of the ``html.border`` option, which defaults to 1. This also affects the notebook HTML repr, but since Jupyter's CSS includes a border-width attribute, the visual effect is the same. (:issue:`11563`). @@ -583,12 +599,12 @@ Arithmetic operators align both ``index`` (no changes). .. ipython:: python - s1 = pd.Series([1, 2, 3], index=list('ABC')) - s2 = pd.Series([2, 2, 2], index=list('ABD')) + s1 = pd.Series([1, 2, 3], index=list("ABC")) + s2 = pd.Series([2, 2, 2], index=list("ABD")) s1 + s2 - df1 = pd.DataFrame([1, 2, 3], index=list('ABC')) - df2 = pd.DataFrame([2, 2, 2], index=list('ABD')) + df1 = pd.DataFrame([1, 2, 3], index=list("ABC")) + df2 = pd.DataFrame([2, 2, 2], index=list("ABD")) df1 + df2 Comparison operators @@ -661,8 +677,8 @@ Logical operators align both ``.index`` of left and right hand side. .. ipython:: python - s1 = pd.Series([True, False, True], index=list('ABC')) - s2 = pd.Series([True, True, True], index=list('ABD')) + s1 = pd.Series([True, False, True], index=list("ABC")) + s2 = pd.Series([True, True, True], index=list("ABD")) s1 & s2 .. note:: @@ -679,8 +695,8 @@ Logical operators align both ``.index`` of left and right hand side. .. ipython:: python - df1 = pd.DataFrame([True, False, True], index=list('ABC')) - df2 = pd.DataFrame([True, True, True], index=list('ABD')) + df1 = pd.DataFrame([True, False, True], index=list("ABC")) + df2 = pd.DataFrame([True, True, True], index=list("ABD")) df1 & df2 Flexible comparison methods @@ -691,8 +707,8 @@ which has the different ``index``. .. ipython:: python - s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c']) - s2 = pd.Series([2, 2, 2], index=['b', 'c', 'd']) + s1 = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s2 = pd.Series([2, 2, 2], index=["b", "c", "d"]) s1.eq(s2) s1.ge(s2) @@ -749,7 +765,7 @@ This will now convert integers/floats with the default unit of ``ns``. .. 
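For integer epochs, the ``unit`` keyword selects the resolution explicitly instead of the ``ns`` default; a short sketch (the epoch values are arbitrary):

.. code-block:: python

    import pandas as pd

    pd.to_datetime([1349720105, 1349806505], unit="s")
    pd.to_datetime([1349720105100], unit="ms")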
ipython:: python - pd.to_datetime([1, 'foo'], errors='coerce') + pd.to_datetime([1, "foo"], errors="coerce") Bug fixes related to ``.to_datetime()``: @@ -768,9 +784,9 @@ Merging will now preserve the dtype of the join keys (:issue:`8596`) .. ipython:: python - df1 = pd.DataFrame({'key': [1], 'v1': [10]}) + df1 = pd.DataFrame({"key": [1], "v1": [10]}) df1 - df2 = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]}) + df2 = pd.DataFrame({"key": [1, 2], "v1": [20, 30]}) df2 **Previous behavior**: @@ -796,16 +812,16 @@ We are able to preserve the join keys .. ipython:: python - pd.merge(df1, df2, how='outer') - pd.merge(df1, df2, how='outer').dtypes + pd.merge(df1, df2, how="outer") + pd.merge(df1, df2, how="outer").dtypes Of course if you have missing values that are introduced, then the resulting dtype will be upcast, which is unchanged from previous. .. ipython:: python - pd.merge(df1, df2, how='outer', on='key') - pd.merge(df1, df2, how='outer', on='key').dtypes + pd.merge(df1, df2, how="outer", on="key") + pd.merge(df1, df2, how="outer", on="key").dtypes .. _whatsnew_0190.api.describe: @@ -889,7 +905,7 @@ As a consequence of this change, ``PeriodIndex`` no longer has an integer dtype: .. ipython:: python - pi = pd.PeriodIndex(['2016-08-01'], freq='D') + pi = pd.PeriodIndex(["2016-08-01"], freq="D") pi pd.api.types.is_integer_dtype(pi) pd.api.types.is_period_dtype(pi) @@ -916,7 +932,7 @@ These result in ``pd.NaT`` without providing ``freq`` option. .. ipython:: python - pd.Period('NaT') + pd.Period("NaT") pd.Period(None) @@ -955,7 +971,7 @@ of integers (:issue:`13988`). .. ipython:: python - pi = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') + pi = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") pi.values @@ -985,7 +1001,7 @@ Previous behavior: .. ipython:: python - pd.Index(['a', 'b']) + pd.Index(['a', 'c']) + pd.Index(["a", "b"]) + pd.Index(["a", "c"]) Note that numeric Index objects already performed element-wise operations. For example, the behavior of adding two integer Indexes is unchanged. @@ -1011,8 +1027,10 @@ DatetimeIndex objects resulting in a TimedeltaIndex: .. ipython:: python - (pd.DatetimeIndex(['2016-01-01', '2016-01-02']) - - pd.DatetimeIndex(['2016-01-02', '2016-01-03'])) + ( + pd.DatetimeIndex(["2016-01-01", "2016-01-02"]) + - pd.DatetimeIndex(["2016-01-02", "2016-01-03"]) + ) .. _whatsnew_0190.api.difference: @@ -1073,8 +1091,7 @@ Previously, most ``Index`` classes returned ``np.ndarray``, and ``DatetimeIndex` .. ipython:: python pd.Index([1, 2, 3]).unique() - pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], - tz='Asia/Tokyo').unique() + pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz="Asia/Tokyo").unique() .. _whatsnew_0190.api.multiindex: @@ -1086,8 +1103,8 @@ in ``MultiIndex`` levels (:issue:`13743`, :issue:`13854`). .. ipython:: python - cat = pd.Categorical(['a', 'b'], categories=list("bac")) - lvl1 = ['foo', 'bar'] + cat = pd.Categorical(["a", "b"], categories=list("bac")) + lvl1 = ["foo", "bar"] midx = pd.MultiIndex.from_arrays([cat, lvl1]) midx @@ -1113,9 +1130,9 @@ As a consequence, ``groupby`` and ``set_index`` also preserve categorical dtypes .. 
ipython:: python - df = pd.DataFrame({'A': [0, 1], 'B': [10, 11], 'C': cat}) - df_grouped = df.groupby(by=['A', 'C']).first() - df_set_idx = df.set_index(['A', 'C']) + df = pd.DataFrame({"A": [0, 1], "B": [10, 11], "C": cat}) + df_grouped = df.groupby(by=["A", "C"]).first() + df_set_idx = df.set_index(["A", "C"]) **Previous behavior**: @@ -1163,7 +1180,7 @@ the result of calling :func:`read_csv` without the ``chunksize=`` argument .. ipython:: python - data = 'A,B\n0,1\n2,3\n4,5\n6,7' + data = "A,B\n0,1\n2,3\n4,5\n6,7" **Previous behavior**: @@ -1248,7 +1265,7 @@ Operators now preserve dtypes .. code-block:: python - s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0) + s = pd.SparseSeries([1.0, 0.0, 2.0, 0.0], fill_value=0) s s.astype(np.int64) diff --git a/doc/source/whatsnew/v0.19.1.rst b/doc/source/whatsnew/v0.19.1.rst index f8b60f457b33f..6ff3fb6900a99 100644 --- a/doc/source/whatsnew/v0.19.1.rst +++ b/doc/source/whatsnew/v0.19.1.rst @@ -8,7 +8,7 @@ Version 0.19.1 (November 3, 2016) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a minor bug-fix release from 0.19.0 and includes some small regression fixes, diff --git a/doc/source/whatsnew/v0.19.2.rst b/doc/source/whatsnew/v0.19.2.rst index 924c95f21ceff..bba89d78be869 100644 --- a/doc/source/whatsnew/v0.19.2.rst +++ b/doc/source/whatsnew/v0.19.2.rst @@ -8,7 +8,7 @@ Version 0.19.2 (December 24, 2016) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a minor bug-fix release in the 0.19.x series and includes some small regression fixes, diff --git a/doc/source/whatsnew/v0.20.2.rst b/doc/source/whatsnew/v0.20.2.rst index 7f84c6b3f17bd..430a39d2d2e97 100644 --- a/doc/source/whatsnew/v0.20.2.rst +++ b/doc/source/whatsnew/v0.20.2.rst @@ -8,7 +8,7 @@ Version 0.20.2 (June 4, 2017) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a minor bug-fix release in the 0.20.x series and includes some small regression fixes, diff --git a/doc/source/whatsnew/v0.20.3.rst b/doc/source/whatsnew/v0.20.3.rst index 888d0048ca9f3..ff28f6830783e 100644 --- a/doc/source/whatsnew/v0.20.3.rst +++ b/doc/source/whatsnew/v0.20.3.rst @@ -8,7 +8,7 @@ Version 0.20.3 (July 7, 2017) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a minor bug-fix release in the 0.20.x series and includes some small regression fixes diff --git a/doc/source/whatsnew/v0.21.1.rst b/doc/source/whatsnew/v0.21.1.rst index 2d72f6470fc81..090a988d6406a 100644 --- a/doc/source/whatsnew/v0.21.1.rst +++ b/doc/source/whatsnew/v0.21.1.rst @@ -8,7 +8,7 @@ Version 0.21.1 (December 12, 2017) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a minor bug-fix release in the 0.21.x series and includes some small regression fixes, diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index 92b514ce59660..ec9769c22e76b 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -8,7 +8,7 @@ Version 0.22.0 (December 29, 2017) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a major release from 0.21.1 and includes a single, API-breaking change. @@ -119,7 +119,7 @@ instead of ``NaN``. .. 
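The same ``min_count`` escape hatch works on plain ``Series`` reductions as well; a minimal sketch:

.. code-block:: python

    import numpy as np
    import pandas as pd

    s = pd.Series([np.nan, np.nan])

    s.sum()             # 0.0 -- an all-NA sum is zero by default
    s.sum(min_count=1)  # nan -- require at least one non-NA value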
ipython:: python - grouper = pd.Categorical(['a', 'a'], categories=['a', 'b']) + grouper = pd.Categorical(["a", "a"], categories=["a", "b"]) pd.Series([1, 2]).groupby(grouper).sum() To restore the 0.21 behavior of returning ``NaN`` for unobserved groups, @@ -159,15 +159,14 @@ sum and ``1`` for product. .. ipython:: python - s = pd.Series([1, 1, np.nan, np.nan], - index=pd.date_range('2017', periods=4)) - s.resample('2d').sum() + s = pd.Series([1, 1, np.nan, np.nan], index=pd.date_range("2017", periods=4)) + s.resample("2d").sum() To restore the 0.21 behavior of returning ``NaN``, use ``min_count>=1``. .. ipython:: python - s.resample('2d').sum(min_count=1) + s.resample("2d").sum(min_count=1) In particular, upsampling and taking the sum or product is affected, as upsampling introduces missing values even if the original series was @@ -190,7 +189,7 @@ entirely valid. .. ipython:: python - idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02']) + idx = pd.DatetimeIndex(["2017-01-01", "2017-01-02"]) pd.Series([1, 2], index=idx).resample("12H").sum() Once again, the ``min_count`` keyword is available to restore the 0.21 behavior. diff --git a/doc/source/whatsnew/v0.5.0.rst b/doc/source/whatsnew/v0.5.0.rst index 7ccb141260f18..7447a10fa1d6b 100644 --- a/doc/source/whatsnew/v0.5.0.rst +++ b/doc/source/whatsnew/v0.5.0.rst @@ -9,7 +9,7 @@ Version 0.5.0 (October 24, 2011) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 New features diff --git a/doc/source/whatsnew/v0.6.0.rst b/doc/source/whatsnew/v0.6.0.rst index 1cb9dcbe159aa..8ff688eaa91e7 100644 --- a/doc/source/whatsnew/v0.6.0.rst +++ b/doc/source/whatsnew/v0.6.0.rst @@ -8,7 +8,7 @@ Version 0.6.0 (November 25, 2011) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 New features diff --git a/doc/source/whatsnew/v0.7.3.rst b/doc/source/whatsnew/v0.7.3.rst index 5ed48c0d8d6d9..4ca31baf560bb 100644 --- a/doc/source/whatsnew/v0.7.3.rst +++ b/doc/source/whatsnew/v0.7.3.rst @@ -23,7 +23,8 @@ New features .. code-block:: python from pandas.tools.plotting import scatter_matrix - scatter_matrix(df, alpha=0.2) # noqa F821 + + scatter_matrix(df, alpha=0.2) # noqa F821 - Add ``stacked`` argument to Series and DataFrame's ``plot`` method for @@ -31,12 +32,12 @@ New features .. code-block:: python - df.plot(kind='bar', stacked=True) # noqa F821 + df.plot(kind="bar", stacked=True) # noqa F821 .. code-block:: python - df.plot(kind='barh', stacked=True) # noqa F821 + df.plot(kind="barh", stacked=True) # noqa F821 - Add log x and y :ref:`scaling options ` to @@ -52,9 +53,9 @@ Reverted some changes to how NA values (represented typically as ``NaN`` or .. ipython:: python - series = pd.Series(['Steve', np.nan, 'Joe']) - series == 'Steve' - series != 'Steve' + series = pd.Series(["Steve", np.nan, "Joe"]) + series == "Steve" + series != "Steve" In comparisons, NA / NaN will always come through as ``False`` except with ``!=`` which is ``True``. *Be very careful* with boolean arithmetic, especially @@ -63,7 +64,7 @@ filter into boolean array operations if you are worried about this: .. ipython:: python - mask = series == 'Steve' + mask = series == "Steve" series[mask & series.notnull()] While propagating NA in comparisons may seem like the right behavior to some @@ -82,15 +83,18 @@ Series, to be more consistent with the ``groupby`` behavior with DataFrame: .. 
ipython:: python :okwarning: - df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), 'D': np.random.randn(8)}) + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) df - grouped = df.groupby('A')['C'] + grouped = df.groupby("A")["C"] grouped.describe() - grouped.apply(lambda x: x.sort_values()[-2:]) # top 2 values + grouped.apply(lambda x: x.sort_values()[-2:]) # top 2 values .. _whatsnew_0.7.3.contributors: diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst index 9bba68d8c331d..8a84630a28b34 100644 --- a/doc/source/whatsnew/v0.8.0.rst +++ b/doc/source/whatsnew/v0.8.0.rst @@ -159,7 +159,8 @@ New plotting methods .. code-block:: python import pandas as pd - fx = pd.read_pickle('data/fx_prices') + + fx = pd.read_pickle("data/fx_prices") import matplotlib.pyplot as plt ``Series.plot`` now supports a ``secondary_y`` option: @@ -168,20 +169,19 @@ New plotting methods plt.figure() - fx['FR'].plot(style='g') + fx["FR"].plot(style="g") - fx['IT'].plot(style='k--', secondary_y=True) + fx["IT"].plot(style="k--", secondary_y=True) Vytautas Jancauskas, the 2012 GSOC participant, has added many new plot types. For example, ``'kde'`` is a new option: .. ipython:: python - s = pd.Series(np.concatenate((np.random.randn(1000), - np.random.randn(1000) * 0.5 + 3))) + s = pd.Series(np.concatenate((np.random.randn(1000), np.random.randn(1000) * 0.5 + 3))) plt.figure() s.hist(density=True, alpha=0.2) - s.plot(kind='kde') + s.plot(kind="kde") See :ref:`the plotting page ` for much more. @@ -205,7 +205,8 @@ with code using scalar values because you are handing control over to NumPy: .. ipython:: python import datetime - rng = pd.date_range('1/1/2000', periods=10) + + rng = pd.date_range("1/1/2000", periods=10) rng[5] isinstance(rng[5], datetime.datetime) rng_asarray = np.asarray(rng) @@ -251,7 +252,7 @@ type. See `matplotlib documentation .. 
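When genuine ``datetime.datetime`` objects are needed rather than ``datetime64`` values, ``DatetimeIndex.to_pydatetime`` performs the conversion explicitly; a brief sketch:

.. code-block:: python

    import datetime

    import pandas as pd

    rng = pd.date_range("1/1/2000", periods=3)
    pydt = rng.to_pydatetime()  # ndarray of datetime.datetime objects
    assert isinstance(pydt[0], datetime.datetime)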
ipython:: python

-    rng = pd.date_range('1/1/2000', periods=10)
+    rng = pd.date_range("1/1/2000", periods=10)
     rng
     np.asarray(rng)
     converted = np.asarray(rng, dtype=object)
diff --git a/doc/source/whatsnew/v0.9.0.rst b/doc/source/whatsnew/v0.9.0.rst
index 5172b1989765d..44ded51e31fda 100644
--- a/doc/source/whatsnew/v0.9.0.rst
+++ b/doc/source/whatsnew/v0.9.0.rst
@@ -41,9 +41,11 @@ API changes

    import io

-   data = ('0,0,1\n'
-           '1,1,0\n'
-           '0,1,0')
+   data = """
+   0,0,1
+   1,1,0
+   0,1,0
+   """
    df = pd.read_csv(io.StringIO(data), header=None)
    df
@@ -59,7 +61,7 @@ API changes

    s1 = pd.Series([1, 2, 3])
    s1
-   s2 = pd.Series(s1, index=['foo', 'bar', 'baz'])
+   s2 = pd.Series(s1, index=["foo", "bar", "baz"])
    s2

 - Deprecated ``day_of_year`` API removed from PeriodIndex, use ``dayofyear``

From fa9358591bfacd09b01975b86b5841b95855a126 Mon Sep 17 00:00:00 2001
From: Simon Hawkins
Date: Mon, 5 Oct 2020 13:58:39 +0100
Subject: [PATCH 22/38] TYP: check_untyped_defs core.arrays.base (#36885)

---
 pandas/core/arrays/base.py | 56 +++++++++++++++++++-------------------
 setup.cfg                  |  3 ---
 2 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index c2fc72ff753a8..94d6428b44043 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -1176,22 +1176,22 @@ def _create_arithmetic_method(cls, op):

     @classmethod
     def _add_arithmetic_ops(cls):
-        cls.__add__ = cls._create_arithmetic_method(operator.add)
-        cls.__radd__ = cls._create_arithmetic_method(ops.radd)
-        cls.__sub__ = cls._create_arithmetic_method(operator.sub)
-        cls.__rsub__ = cls._create_arithmetic_method(ops.rsub)
-        cls.__mul__ = cls._create_arithmetic_method(operator.mul)
-        cls.__rmul__ = cls._create_arithmetic_method(ops.rmul)
-        cls.__pow__ = cls._create_arithmetic_method(operator.pow)
-        cls.__rpow__ = cls._create_arithmetic_method(ops.rpow)
-        cls.__mod__ = cls._create_arithmetic_method(operator.mod)
-        cls.__rmod__ = cls._create_arithmetic_method(ops.rmod)
-        cls.__floordiv__ = cls._create_arithmetic_method(operator.floordiv)
-        cls.__rfloordiv__ = cls._create_arithmetic_method(ops.rfloordiv)
-        cls.__truediv__ = cls._create_arithmetic_method(operator.truediv)
-        cls.__rtruediv__ = cls._create_arithmetic_method(ops.rtruediv)
-        cls.__divmod__ = cls._create_arithmetic_method(divmod)
-        cls.__rdivmod__ = cls._create_arithmetic_method(ops.rdivmod)
+        setattr(cls, "__add__", cls._create_arithmetic_method(operator.add))
+        setattr(cls, "__radd__", cls._create_arithmetic_method(ops.radd))
+        setattr(cls, "__sub__", cls._create_arithmetic_method(operator.sub))
+        setattr(cls, "__rsub__", cls._create_arithmetic_method(ops.rsub))
+        setattr(cls, "__mul__", cls._create_arithmetic_method(operator.mul))
+        setattr(cls, "__rmul__", cls._create_arithmetic_method(ops.rmul))
+        setattr(cls, "__pow__", cls._create_arithmetic_method(operator.pow))
+        setattr(cls, "__rpow__", cls._create_arithmetic_method(ops.rpow))
+        setattr(cls, "__mod__", cls._create_arithmetic_method(operator.mod))
+        setattr(cls, "__rmod__", cls._create_arithmetic_method(ops.rmod))
+        setattr(cls, "__floordiv__", cls._create_arithmetic_method(operator.floordiv))
+        setattr(cls, "__rfloordiv__", cls._create_arithmetic_method(ops.rfloordiv))
+        setattr(cls, "__truediv__", cls._create_arithmetic_method(operator.truediv))
+        setattr(cls, "__rtruediv__", cls._create_arithmetic_method(ops.rtruediv))
+        setattr(cls, "__divmod__", cls._create_arithmetic_method(divmod))
+        setattr(cls, "__rdivmod__", cls._create_arithmetic_method(ops.rdivmod))

     @classmethod
     def _create_comparison_method(cls, op):
@@ -1199,12 +1199,12 @@ def _create_comparison_method(cls, op):

     @classmethod
     def _add_comparison_ops(cls):
-        cls.__eq__ = cls._create_comparison_method(operator.eq)
-        cls.__ne__ = cls._create_comparison_method(operator.ne)
-        cls.__lt__ = cls._create_comparison_method(operator.lt)
-        cls.__gt__ = cls._create_comparison_method(operator.gt)
-        cls.__le__ = cls._create_comparison_method(operator.le)
-        cls.__ge__ = cls._create_comparison_method(operator.ge)
+        setattr(cls, "__eq__", cls._create_comparison_method(operator.eq))
+        setattr(cls, "__ne__", cls._create_comparison_method(operator.ne))
+        setattr(cls, "__lt__", cls._create_comparison_method(operator.lt))
+        setattr(cls, "__gt__", cls._create_comparison_method(operator.gt))
+        setattr(cls, "__le__", cls._create_comparison_method(operator.le))
+        setattr(cls, "__ge__", cls._create_comparison_method(operator.ge))

     @classmethod
     def _create_logical_method(cls, op):
@@ -1212,12 +1212,12 @@ def _create_logical_method(cls, op):

     @classmethod
     def _add_logical_ops(cls):
-        cls.__and__ = cls._create_logical_method(operator.and_)
-        cls.__rand__ = cls._create_logical_method(ops.rand_)
-        cls.__or__ = cls._create_logical_method(operator.or_)
-        cls.__ror__ = cls._create_logical_method(ops.ror_)
-        cls.__xor__ = cls._create_logical_method(operator.xor)
-        cls.__rxor__ = cls._create_logical_method(ops.rxor)
+        setattr(cls, "__and__", cls._create_logical_method(operator.and_))
+        setattr(cls, "__rand__", cls._create_logical_method(ops.rand_))
+        setattr(cls, "__or__", cls._create_logical_method(operator.or_))
+        setattr(cls, "__ror__", cls._create_logical_method(ops.ror_))
+        setattr(cls, "__xor__", cls._create_logical_method(operator.xor))
+        setattr(cls, "__rxor__", cls._create_logical_method(ops.rxor))


 class ExtensionScalarOpsMixin(ExtensionOpsMixin):
diff --git a/setup.cfg b/setup.cfg
index 3279a485c9bf3..75722f2a7809f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -142,9 +142,6 @@ check_untyped_defs=False
 [mypy-pandas.core.apply]
 check_untyped_defs=False

-[mypy-pandas.core.arrays.base]
-check_untyped_defs=False
-
 [mypy-pandas.core.arrays.datetimelike]
 check_untyped_defs=False

From 9216b94aa82f0af892322c3c641c7f3ad59a3907 Mon Sep 17 00:00:00 2001
From: Simon Hawkins
Date: Mon, 5 Oct 2020 13:59:08 +0100
Subject: [PATCH 23/38] TYP: check_untyped_defs compat.pickle_compat (#36884)

---
 pandas/compat/pickle_compat.py | 4 ++--
 setup.cfg                      | 3 ---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py
index ef9f36705a7ee..80ee1f2e20154 100644
--- a/pandas/compat/pickle_compat.py
+++ b/pandas/compat/pickle_compat.py
@@ -274,7 +274,7 @@ def patch_pickle():
     """
     orig_loads = pkl.loads
     try:
-        pkl.loads = loads
+        setattr(pkl, "loads", loads)
         yield
     finally:
-        pkl.loads = orig_loads
+        setattr(pkl, "loads", orig_loads)
diff --git a/setup.cfg b/setup.cfg
index 75722f2a7809f..e125eea226b10 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -136,9 +136,6 @@ check_untyped_defs=False
 [mypy-pandas._version]
 check_untyped_defs=False

-[mypy-pandas.compat.pickle_compat]
-check_untyped_defs=False
-
 [mypy-pandas.core.apply]
 check_untyped_defs=False

From 125b3e4044408a1a18318b7e33cd9cd72568d87d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Mon, 5 Oct 2020 06:01:36 -0700
Subject: [PATCH 24/38] PERF: Improve RollingGroupby.count (#36872)

---
 doc/source/whatsnew/v1.2.0.rst | 1 +
 pandas/core/window/common.py   | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst
b/doc/source/whatsnew/v1.2.0.rst index 4c58b9923c89a..95628350ad998 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -284,6 +284,7 @@ Performance improvements - ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) - Performance improvement in :meth:`pd.to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`) - Performance improvement in setting values on a :class:`IntervalArray` (:issue:`36310`) +- Performance improvement in :meth:`RollingGroupby.count` (:issue:`35625`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 6452eb8c6b3a9..2e7e7cd47c336 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -58,7 +58,6 @@ def __init__(self, obj, *args, **kwargs): self._groupby.grouper.mutated = True super().__init__(obj, *args, **kwargs) - count = _dispatch("count") corr = _dispatch("corr", other=None, pairwise=None) cov = _dispatch("cov", other=None, pairwise=None) From fa0f21679b63ab69fc8bc7b0b3ed775df5fc7d39 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 5 Oct 2020 15:03:29 +0100 Subject: [PATCH 25/38] DOC: 1.1.3 release date (#36887) --- doc/source/whatsnew/v1.1.3.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index af714b1bb2ab1..2323afbe00e5d 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -1,7 +1,7 @@ .. _whatsnew_113: -What's new in 1.1.3 (??) ------------------------- +What's new in 1.1.3 (October 5, 2020) +------------------------------------- These are the changes in pandas 1.1.3. See :ref:`release` for a full changelog including other versions of pandas. 
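For context on the ``RollingGroupby.count`` change in [PATCH 24/38] above: with the ``count = _dispatch("count")`` line removed, a groupby-rolling ``count`` call falls through to the shared ``Rolling.count`` implementation instead of being re-dispatched through the groupby machinery. A minimal usage sketch, not taken from the patches — the frame, group labels, and window size below are made up for illustration:

import numpy as np
import pandas as pd

# Toy data; the column names "g" and "x" are illustrative only.
df = pd.DataFrame({"g": ["a", "a", "b", "b"], "x": [1.0, np.nan, 3.0, 4.0]})

# With the per-group dispatch removed, this call uses the common
# Rolling.count, counting non-NaN values in each window within each group.
result = df.groupby("g").rolling(2).count()
print(result)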
From 30cc26c56b43f708df806f944b2d8a7e027f7dff Mon Sep 17 00:00:00 2001 From: Avinash Pancham <44933366+avinashpancham@users.noreply.github.com> Date: Mon, 5 Oct 2020 19:41:29 +0200 Subject: [PATCH 26/38] Add asv benchmarks for select_dtypes (#36839) --- asv_bench/benchmarks/dtypes.py | 57 ++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index bd17b710b108d..a5ed5c389fee4 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -1,5 +1,9 @@ +import string + import numpy as np +from pandas import DataFrame +import pandas._testing as tm from pandas.api.types import pandas_dtype from .pandas_vb_common import ( @@ -62,4 +66,57 @@ def time_infer(self, dtype): lib.infer_dtype(self.data_dict[dtype], skipna=False) +class SelectDtypes: + + params = [ + tm.ALL_INT_DTYPES + + tm.ALL_EA_INT_DTYPES + + tm.FLOAT_DTYPES + + tm.COMPLEX_DTYPES + + tm.DATETIME64_DTYPES + + tm.TIMEDELTA64_DTYPES + + tm.BOOL_DTYPES + ] + param_names = ["dtype"] + + def setup(self, dtype): + N, K = 5000, 50 + self.index = tm.makeStringIndex(N) + self.columns = tm.makeStringIndex(K) + + def create_df(data): + return DataFrame(data, index=self.index, columns=self.columns) + + self.df_int = create_df(np.random.randint(low=100, size=(N, K))) + self.df_float = create_df(np.random.randn(N, K)) + self.df_bool = create_df(np.random.choice([True, False], size=(N, K))) + self.df_string = create_df( + np.random.choice(list(string.ascii_letters), size=(N, K)) + ) + + def time_select_dtype_int_include(self, dtype): + self.df_int.select_dtypes(include=dtype) + + def time_select_dtype_int_exclude(self, dtype): + self.df_int.select_dtypes(exclude=dtype) + + def time_select_dtype_float_include(self, dtype): + self.df_float.select_dtypes(include=dtype) + + def time_select_dtype_float_exclude(self, dtype): + self.df_float.select_dtypes(exclude=dtype) + + def time_select_dtype_bool_include(self, dtype): + self.df_bool.select_dtypes(include=dtype) + + def time_select_dtype_bool_exclude(self, dtype): + self.df_bool.select_dtypes(exclude=dtype) + + def time_select_dtype_string_include(self, dtype): + self.df_string.select_dtypes(include=dtype) + + def time_select_dtype_string_exclude(self, dtype): + self.df_string.select_dtypes(exclude=dtype) + + from .pandas_vb_common import setup # noqa: F401 isort:skip From c77ce8b52fb042721e5ef6212e3c5820dca89666 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Mon, 5 Oct 2020 15:05:44 -0500 Subject: [PATCH 27/38] DOC: Start v1.1.4 release notes (#36689) --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v1.1.3.rst | 2 +- doc/source/whatsnew/v1.1.4.rst | 42 ++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 doc/source/whatsnew/v1.1.4.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 933ed3cb8babf..848121f822383 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -24,6 +24,7 @@ Version 1.1 .. toctree:: :maxdepth: 2 + v1.1.4 v1.1.3 v1.1.2 v1.1.1 diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 2323afbe00e5d..e752eb54d0c15 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -75,4 +75,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v1.1.2..v1.1.3|HEAD +.. 
contributors:: v1.1.2..v1.1.3

diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst
new file mode 100644
index 0000000000000..e63912ebc8fee
--- /dev/null
+++ b/doc/source/whatsnew/v1.1.4.rst
@@ -0,0 +1,42 @@
+.. _whatsnew_114:
+
+What's new in 1.1.4 (??)
+------------------------
+
+These are the changes in pandas 1.1.4. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_114.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_114.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_114.other:
+
+Other
+~~~~~
+-
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_114.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v1.1.3..v1.1.4|HEAD

From a4036103a4c9025f760a5fdeca8db9fed1959029 Mon Sep 17 00:00:00 2001
From: OlivierLuG <59281854+OlivierLuG@users.noreply.github.com>
Date: Mon, 5 Oct 2020 22:52:51 +0200
Subject: [PATCH 28/38] TST: Period with Timestamp overflow (#34755)

* TST #13346 added tests

* TST #13346 taken review into account

* Added tests for #13346 - with review

---
 pandas/tests/scalar/period/test_period.py | 30 +++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py
index 795021a260028..5006e16b6a7e0 100644
--- a/pandas/tests/scalar/period/test_period.py
+++ b/pandas/tests/scalar/period/test_period.py
@@ -6,6 +6,7 @@
 from pandas._libs.tslibs import iNaT, period as libperiod
 from pandas._libs.tslibs.ccalendar import DAYS, MONTHS
+from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
 from pandas._libs.tslibs.parsing import DateParseError
 from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG, IncompatibleFrequency
 from pandas._libs.tslibs.timezones import dateutil_gettz, maybe_get_tz
@@ -776,6 +777,35 @@ def test_period_deprecated_freq(self):
         assert isinstance(p1, Period)
         assert isinstance(p2, Period)
 
+    def _period_constructor(bound, offset):
+        return Period(
+            year=bound.year,
+            month=bound.month,
+            day=bound.day,
+            hour=bound.hour,
+            minute=bound.minute,
+            second=bound.second + offset,
+            freq="us",
+        )
+
+    @pytest.mark.parametrize("bound, offset", [(Timestamp.min, -1), (Timestamp.max, 1)])
+    @pytest.mark.parametrize("period_property", ["start_time", "end_time"])
+    def test_outer_bounds_start_and_end_time(self, bound, offset, period_property):
+        # GH #13346
+        period = TestPeriodProperties._period_constructor(bound, offset)
+        with pytest.raises(OutOfBoundsDatetime, match="Out of bounds nanosecond"):
+            getattr(period, period_property)
+
+    @pytest.mark.parametrize("bound, offset", [(Timestamp.min, -1), (Timestamp.max, 1)])
+    @pytest.mark.parametrize("period_property", ["start_time", "end_time"])
+    def test_inner_bounds_start_and_end_time(self, bound, offset, period_property):
+        # GH #13346
+        period = TestPeriodProperties._period_constructor(bound, -offset)
+        expected = period.to_timestamp().round(freq="S")
+        assert getattr(period, period_property).round(freq="S") == expected
+        expected = (bound - offset * Timedelta(1, unit="S")).floor("S")
+        assert getattr(period, period_property).floor("S") == expected
+
     def test_start_time(self):
         freq_lst = ["A", "Q", "M", "D", "H",
"T", "S"] xp = datetime(2012, 1, 1) From fa8e06609048185ee06f245fcdc8c1f86e39937f Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Mon, 5 Oct 2020 19:06:58 -0500 Subject: [PATCH 29/38] CI: Show ipython directive errors (#36863) --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 149acef72db26..46de8d466dd11 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -125,7 +125,7 @@ jobs: # This can be removed when the ipython directive fails when there are errors, # including the `tee sphinx.log` in te previous step (https://github.com/ipython/ipython/issues/11547) - name: Check ipython directive errors - run: "! grep -B1 \"^<<<-------------------------------------------------------------------------$\" sphinx.log" + run: "! grep -B10 \"^<<<-------------------------------------------------------------------------$\" sphinx.log" - name: Install ssh key run: | From ec2b663039261f9999920ba6e6b37bfbe7d43963 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 5 Oct 2020 19:29:08 -0700 Subject: [PATCH 30/38] Standardize cast_str behavior in all datetimelike fill_value validators (#36746) * Standardize cast_str behavior in all datetimelike fill_value validators * CLN: remove cast_str kwarg --- pandas/core/arrays/datetimelike.py | 24 +++++++++--------------- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/timedeltas.py | 2 +- pandas/tests/arrays/test_datetimelike.py | 10 ++++++++++ pandas/tests/indexes/datetimelike.py | 21 +++++++++++++++++++++ 5 files changed, 42 insertions(+), 17 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 36fd487e92327..5e7c96ca52a91 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -752,9 +752,7 @@ def _validate_shift_value(self, fill_value): return self._unbox(fill_value) - def _validate_scalar( - self, value, msg: Optional[str] = None, cast_str: bool = False - ): + def _validate_scalar(self, value, msg: Optional[str] = None): """ Validate that the input value can be cast to our scalar_type. @@ -765,14 +763,12 @@ def _validate_scalar( Message to raise in TypeError on invalid input. If not provided, `value` is cast to a str and used as the message. - cast_str : bool, default False - Whether to try to parse string input to scalar_type. 
Returns ------- self._scalar_type or NaT """ - if cast_str and isinstance(value, str): + if isinstance(value, str): # NB: Careful about tzawareness try: value = self._scalar_from_string(value) @@ -794,9 +790,7 @@ def _validate_scalar( return value - def _validate_listlike( - self, value, opname: str, cast_str: bool = False, allow_object: bool = False - ): + def _validate_listlike(self, value, opname: str, allow_object: bool = False): if isinstance(value, type(self)): return value @@ -805,7 +799,7 @@ def _validate_listlike( value = array(value) value = extract_array(value, extract_numpy=True) - if cast_str and is_dtype_equal(value.dtype, "string"): + if is_dtype_equal(value.dtype, "string"): # We got a StringArray try: # TODO: Could use from_sequence_of_strings if implemented @@ -835,9 +829,9 @@ def _validate_listlike( def _validate_searchsorted_value(self, value): msg = "searchsorted requires compatible dtype or scalar" if not is_list_like(value): - value = self._validate_scalar(value, msg, cast_str=True) + value = self._validate_scalar(value, msg) else: - value = self._validate_listlike(value, "searchsorted", cast_str=True) + value = self._validate_listlike(value, "searchsorted") rv = self._unbox(value) return self._rebox_native(rv) @@ -848,15 +842,15 @@ def _validate_setitem_value(self, value): f"or array of those. Got '{type(value).__name__}' instead." ) if is_list_like(value): - value = self._validate_listlike(value, "setitem", cast_str=True) + value = self._validate_listlike(value, "setitem") else: - value = self._validate_scalar(value, msg, cast_str=True) + value = self._validate_scalar(value, msg) return self._unbox(value, setitem=True) def _validate_insert_value(self, value): msg = f"cannot insert {type(self).__name__} with incompatible label" - value = self._validate_scalar(value, msg, cast_str=False) + value = self._validate_scalar(value, msg) self._check_compatible_with(value, setitem=True) # TODO: if we dont have compat, should we raise or astype(object)? 
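# Context note (not part of [PATCH 30/38]): with string parsing now handled
# unconditionally inside _validate_scalar, datetime-like arrays and indexes
# accept ISO-format strings wherever a scalar is accepted. A minimal sketch
# using only public API; the dates below are made up for illustration.
import pandas as pd

dti = pd.date_range("2016-01-01", periods=3)

# The string is parsed via _scalar_from_string inside the validator, so this
# behaves the same as passing pd.Timestamp("2016-01-02").
assert dti.searchsorted("2016-01-02") == 1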
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 2169b9fedbb31..28b7303ff5218 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -648,7 +648,7 @@ def _wrap_joined_index(self, joined: np.ndarray, other): def _convert_arr_indexer(self, keyarr): try: return self._data._validate_listlike( - keyarr, "convert_arr_indexer", cast_str=True, allow_object=True + keyarr, "convert_arr_indexer", allow_object=True ) except (ValueError, TypeError): return com.asarray_tuplesafe(keyarr) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 858387f2e1600..2a7c624b430ed 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -217,7 +217,7 @@ def get_loc(self, key, method=None, tolerance=None): raise InvalidIndexError(key) try: - key = self._data._validate_scalar(key, cast_str=True) + key = self._data._validate_scalar(key) except TypeError as err: raise KeyError(key) from err diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 3f5ab5baa7d69..91bcdf32603f4 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -160,6 +160,16 @@ def test_take_fill(self): result = arr.take([-1, 1], allow_fill=True, fill_value=pd.NaT) assert result[0] is pd.NaT + def test_take_fill_str(self, arr1d): + # Cast str fill_value matching other fill_value-taking methods + result = arr1d.take([-1, 1], allow_fill=True, fill_value=str(arr1d[-1])) + expected = arr1d[[-1, 1]] + tm.assert_equal(result, expected) + + msg = r"'fill_value' should be a <.*>\. Got 'foo'" + with pytest.raises(ValueError, match=msg): + arr1d.take([-1, 1], allow_fill=True, fill_value="foo") + def test_concat_same_type(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 71ae1d6bda9c7..df857cce05bbb 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -115,3 +115,24 @@ def test_not_equals_numeric(self): assert not index.equals(pd.Index(index.asi8)) assert not index.equals(pd.Index(index.asi8.astype("u8"))) assert not index.equals(pd.Index(index.asi8).astype("f8")) + + def test_where_cast_str(self): + index = self.create_index() + + mask = np.ones(len(index), dtype=bool) + mask[-1] = False + + result = index.where(mask, str(index[0])) + expected = index.where(mask, index[0]) + tm.assert_index_equal(result, expected) + + result = index.where(mask, [str(index[0])]) + tm.assert_index_equal(result, expected) + + msg = "Where requires matching dtype, not foo" + with pytest.raises(TypeError, match=msg): + index.where(mask, "foo") + + msg = r"Where requires matching dtype, not \['foo'\]" + with pytest.raises(TypeError, match=msg): + index.where(mask, ["foo"]) From 6c606030a72e316a26b7577b871c7208fac6b2c6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 5 Oct 2020 20:08:07 -0700 Subject: [PATCH 31/38] CLN: value -> key (#36905) --- pandas/core/arrays/interval.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 413430942575d..94c6c5aed9c0d 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -544,10 +544,10 @@ def __iter__(self): def __len__(self) -> int: return len(self._left) - def __getitem__(self, value): - value = check_array_indexer(self, value) - 
left = self._left[value] - right = self._right[value] + def __getitem__(self, key): + key = check_array_indexer(self, key) + left = self._left[key] + right = self._right[key] if not isinstance(left, (np.ndarray, ExtensionArray)): # scalar From de35fe218b9b00d668c4b29024ce4a7a8a45e563 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 5 Oct 2020 20:33:26 -0700 Subject: [PATCH 32/38] CLN: standardize fixture usage in datetimelike array tests (#36902) --- pandas/tests/arrays/test_datetimelike.py | 132 +++++++++++------------ pandas/tests/arrays/test_timedeltas.py | 61 ++++++----- 2 files changed, 95 insertions(+), 98 deletions(-) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 91bcdf32603f4..f22d958dc88e3 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -17,7 +17,12 @@ # TODO: more freq variants @pytest.fixture(params=["D", "B", "W", "M", "Q", "Y"]) -def period_index(request): +def freqstr(request): + return request.param + + +@pytest.fixture +def period_index(freqstr): """ A fixture to provide PeriodIndex objects with different frequencies. @@ -25,14 +30,13 @@ def period_index(request): so here we just test that the PeriodArray behavior matches the PeriodIndex behavior. """ - freqstr = request.param # TODO: non-monotone indexes; NaTs, different start dates pi = pd.period_range(start=pd.Timestamp("2000-01-01"), periods=100, freq=freqstr) return pi -@pytest.fixture(params=["D", "B", "W", "M", "Q", "Y"]) -def datetime_index(request): +@pytest.fixture +def datetime_index(freqstr): """ A fixture to provide DatetimeIndex objects with different frequencies. @@ -40,14 +44,13 @@ def datetime_index(request): so here we just test that the DatetimeArray behavior matches the DatetimeIndex behavior. """ - freqstr = request.param # TODO: non-monotone indexes; NaTs, different start dates, timezones dti = pd.date_range(start=pd.Timestamp("2000-01-01"), periods=100, freq=freqstr) return dti @pytest.fixture -def timedelta_index(request): +def timedelta_index(): """ A fixture to provide TimedeltaIndex objects with different frequencies. 
Most TimedeltaArray behavior is already tested in TimedeltaIndex tests, @@ -448,16 +451,15 @@ class TestDatetimeArray(SharedTests): dtype = pd.Timestamp @pytest.fixture - def arr1d(self, tz_naive_fixture): + def arr1d(self, tz_naive_fixture, freqstr): tz = tz_naive_fixture - dti = pd.date_range("2016-01-01 01:01:00", periods=3, freq="H", tz=tz) + dti = pd.date_range("2016-01-01 01:01:00", periods=3, freq=freqstr, tz=tz) dta = dti._data return dta - def test_round(self, tz_naive_fixture): + def test_round(self, arr1d): # GH#24064 - tz = tz_naive_fixture - dti = pd.date_range("2016-01-01 01:01:00", periods=3, freq="H", tz=tz) + dti = self.index_cls(arr1d) result = dti.round(freq="2T") expected = dti - pd.Timedelta(minutes=1) @@ -511,11 +513,10 @@ def test_array_interface(self, datetime_index): expected = np.asarray(arr).astype(dtype) tm.assert_numpy_array_equal(result, expected) - def test_array_object_dtype(self, tz_naive_fixture): + def test_array_object_dtype(self, arr1d): # GH#23524 - tz = tz_naive_fixture - dti = pd.date_range("2016-01-01", periods=3, tz=tz) - arr = DatetimeArray(dti) + arr = arr1d + dti = self.index_cls(arr1d) expected = np.array(list(dti)) @@ -526,11 +527,10 @@ def test_array_object_dtype(self, tz_naive_fixture): result = np.array(dti, dtype=object) tm.assert_numpy_array_equal(result, expected) - def test_array_tz(self, tz_naive_fixture): + def test_array_tz(self, arr1d): # GH#23524 - tz = tz_naive_fixture - dti = pd.date_range("2016-01-01", periods=3, tz=tz) - arr = DatetimeArray(dti) + arr = arr1d + dti = self.index_cls(arr1d) expected = dti.asi8.view("M8[ns]") result = np.array(arr, dtype="M8[ns]") @@ -547,10 +547,9 @@ def test_array_tz(self, tz_naive_fixture): assert result.base is expected.base assert result.base is not None - def test_array_i8_dtype(self, tz_naive_fixture): - tz = tz_naive_fixture - dti = pd.date_range("2016-01-01", periods=3, tz=tz) - arr = DatetimeArray(dti) + def test_array_i8_dtype(self, arr1d): + arr = arr1d + dti = self.index_cls(arr1d) expected = dti.asi8 result = np.array(arr, dtype="i8") @@ -573,10 +572,9 @@ def test_from_array_keeps_base(self): dta = DatetimeArray(arr[:0]) assert dta._data.base is arr - def test_from_dti(self, tz_naive_fixture): - tz = tz_naive_fixture - dti = pd.date_range("2016-01-01", periods=3, tz=tz) - arr = DatetimeArray(dti) + def test_from_dti(self, arr1d): + arr = arr1d + dti = self.index_cls(arr1d) assert list(dti) == list(arr) # Check that Index.__new__ knows what to do with DatetimeArray @@ -584,16 +582,15 @@ def test_from_dti(self, tz_naive_fixture): assert isinstance(dti2, pd.DatetimeIndex) assert list(dti2) == list(arr) - def test_astype_object(self, tz_naive_fixture): - tz = tz_naive_fixture - dti = pd.date_range("2016-01-01", periods=3, tz=tz) - arr = DatetimeArray(dti) + def test_astype_object(self, arr1d): + arr = arr1d + dti = self.index_cls(arr1d) + asobj = arr.astype("O") assert isinstance(asobj, np.ndarray) assert asobj.dtype == "O" assert list(asobj) == list(dti) - @pytest.mark.parametrize("freqstr", ["D", "B", "W", "M", "Q", "Y"]) def test_to_perioddelta(self, datetime_index, freqstr): # GH#23113 dti = datetime_index @@ -612,7 +609,6 @@ def test_to_perioddelta(self, datetime_index, freqstr): # an EA-specific tm.assert_ function tm.assert_index_equal(pd.Index(result), pd.Index(expected)) - @pytest.mark.parametrize("freqstr", ["D", "B", "W", "M", "Q", "Y"]) def test_to_period(self, datetime_index, freqstr): dti = datetime_index arr = DatetimeArray(dti) @@ -626,10 +622,10 @@ def 
test_to_period(self, datetime_index, freqstr): tm.assert_index_equal(pd.Index(result), pd.Index(expected)) @pytest.mark.parametrize("propname", pd.DatetimeIndex._bool_ops) - def test_bool_properties(self, datetime_index, propname): + def test_bool_properties(self, arr1d, propname): # in this case _bool_ops is just `is_leap_year` - dti = datetime_index - arr = DatetimeArray(dti) + dti = self.index_cls(arr1d) + arr = arr1d assert dti.freq == arr.freq result = getattr(arr, propname) @@ -638,21 +634,21 @@ def test_bool_properties(self, datetime_index, propname): tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("propname", pd.DatetimeIndex._field_ops) - def test_int_properties(self, datetime_index, propname): + def test_int_properties(self, arr1d, propname): if propname in ["week", "weekofyear"]: # GH#33595 Deprecate week and weekofyear return - dti = datetime_index - arr = DatetimeArray(dti) + dti = self.index_cls(arr1d) + arr = arr1d result = getattr(arr, propname) expected = np.array(getattr(dti, propname), dtype=result.dtype) tm.assert_numpy_array_equal(result, expected) - def test_take_fill_valid(self, datetime_index, tz_naive_fixture): - dti = datetime_index.tz_localize(tz_naive_fixture) - arr = DatetimeArray(dti) + def test_take_fill_valid(self, arr1d): + arr = arr1d + dti = self.index_cls(arr1d) now = pd.Timestamp.now().tz_localize(dti.tz) result = arr.take([-1, 1], allow_fill=True, fill_value=now) @@ -687,10 +683,9 @@ def test_take_fill_valid(self, datetime_index, tz_naive_fixture): # require appropriate-dtype if we have a NA value arr.take([-1, 1], allow_fill=True, fill_value=value) - def test_concat_same_type_invalid(self, datetime_index): + def test_concat_same_type_invalid(self, arr1d): # different timezones - dti = datetime_index - arr = DatetimeArray(dti) + arr = arr1d if arr.tz is None: other = arr.tz_localize("UTC") @@ -718,8 +713,8 @@ def test_concat_same_type_different_freq(self): tm.assert_datetime_array_equal(result, expected) - def test_strftime(self, datetime_index): - arr = DatetimeArray(datetime_index) + def test_strftime(self, arr1d): + arr = arr1d result = arr.strftime("%Y %b") expected = np.array([ts.strftime("%Y %b") for ts in arr], dtype=object) @@ -864,9 +859,9 @@ class TestPeriodArray(SharedTests): def arr1d(self, period_index): return period_index._data - def test_from_pi(self, period_index): - pi = period_index - arr = PeriodArray(pi) + def test_from_pi(self, arr1d): + pi = self.index_cls(arr1d) + arr = arr1d assert list(arr) == list(pi) # Check that Index.__new__ knows what to do with PeriodArray @@ -874,17 +869,16 @@ def test_from_pi(self, period_index): assert isinstance(pi2, pd.PeriodIndex) assert list(pi2) == list(arr) - def test_astype_object(self, period_index): - pi = period_index - arr = PeriodArray(pi) + def test_astype_object(self, arr1d): + pi = self.index_cls(arr1d) + arr = arr1d asobj = arr.astype("O") assert isinstance(asobj, np.ndarray) assert asobj.dtype == "O" assert list(asobj) == list(pi) - def test_take_fill_valid(self, period_index): - pi = period_index - arr = PeriodArray(pi) + def test_take_fill_valid(self, arr1d): + arr = arr1d value = pd.NaT.value msg = f"'fill_value' should be a {self.dtype}. Got '{value}'." 
@@ -899,9 +893,9 @@ def test_take_fill_valid(self, period_index): arr.take([-1, 1], allow_fill=True, fill_value=value) @pytest.mark.parametrize("how", ["S", "E"]) - def test_to_timestamp(self, how, period_index): - pi = period_index - arr = PeriodArray(pi) + def test_to_timestamp(self, how, arr1d): + pi = self.index_cls(arr1d) + arr = arr1d expected = DatetimeArray(pi.to_timestamp(how=how)) result = arr.to_timestamp(how=how) @@ -922,10 +916,10 @@ def test_to_timestamp_out_of_bounds(self): pi._data.to_timestamp() @pytest.mark.parametrize("propname", PeriodArray._bool_ops) - def test_bool_properties(self, period_index, propname): + def test_bool_properties(self, arr1d, propname): # in this case _bool_ops is just `is_leap_year` - pi = period_index - arr = PeriodArray(pi) + pi = self.index_cls(arr1d) + arr = arr1d result = getattr(arr, propname) expected = np.array(getattr(pi, propname)) @@ -933,17 +927,17 @@ def test_bool_properties(self, period_index, propname): tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("propname", PeriodArray._field_ops) - def test_int_properties(self, period_index, propname): - pi = period_index - arr = PeriodArray(pi) + def test_int_properties(self, arr1d, propname): + pi = self.index_cls(arr1d) + arr = arr1d result = getattr(arr, propname) expected = np.array(getattr(pi, propname)) tm.assert_numpy_array_equal(result, expected) - def test_array_interface(self, period_index): - arr = PeriodArray(period_index) + def test_array_interface(self, arr1d): + arr = arr1d # default asarray gives objects result = np.asarray(arr) @@ -966,8 +960,8 @@ def test_array_interface(self, period_index): expected = np.asarray(arr).astype("S20") tm.assert_numpy_array_equal(result, expected) - def test_strftime(self, period_index): - arr = PeriodArray(period_index) + def test_strftime(self, arr1d): + arr = arr1d result = arr.strftime("%Y") expected = np.array([per.strftime("%Y") for per in arr], dtype=object) diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index a32529cb58ba3..b3b8f4d55e4de 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -61,6 +61,7 @@ def test_copy(self): class TestTimedeltaArray: + # TODO: de-duplicate with test_npsum below def test_np_sum(self): # GH#25282 vals = np.arange(5, dtype=np.int64).view("m8[h]").astype("m8[ns]") @@ -76,35 +77,6 @@ def test_from_sequence_dtype(self): with pytest.raises(ValueError, match=msg): TimedeltaArray._from_sequence([], dtype=object) - def test_abs(self): - vals = np.array([-3600 * 10 ** 9, "NaT", 7200 * 10 ** 9], dtype="m8[ns]") - arr = TimedeltaArray(vals) - - evals = np.array([3600 * 10 ** 9, "NaT", 7200 * 10 ** 9], dtype="m8[ns]") - expected = TimedeltaArray(evals) - - result = abs(arr) - tm.assert_timedelta_array_equal(result, expected) - - def test_neg(self): - vals = np.array([-3600 * 10 ** 9, "NaT", 7200 * 10 ** 9], dtype="m8[ns]") - arr = TimedeltaArray(vals) - - evals = np.array([3600 * 10 ** 9, "NaT", -7200 * 10 ** 9], dtype="m8[ns]") - expected = TimedeltaArray(evals) - - result = -arr - tm.assert_timedelta_array_equal(result, expected) - - def test_neg_freq(self): - tdi = pd.timedelta_range("2 Days", periods=4, freq="H") - arr = TimedeltaArray(tdi, freq=tdi.freq) - - expected = TimedeltaArray(-tdi._data, freq=-tdi.freq) - - result = -arr - tm.assert_timedelta_array_equal(result, expected) - @pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"]) def test_astype_int(self, dtype): arr 
= TimedeltaArray._from_sequence([pd.Timedelta("1H"), pd.Timedelta("2H")]) @@ -171,6 +143,37 @@ def test_searchsorted_invalid_types(self, other, index): arr.searchsorted(other) +class TestUnaryOps: + def test_abs(self): + vals = np.array([-3600 * 10 ** 9, "NaT", 7200 * 10 ** 9], dtype="m8[ns]") + arr = TimedeltaArray(vals) + + evals = np.array([3600 * 10 ** 9, "NaT", 7200 * 10 ** 9], dtype="m8[ns]") + expected = TimedeltaArray(evals) + + result = abs(arr) + tm.assert_timedelta_array_equal(result, expected) + + def test_neg(self): + vals = np.array([-3600 * 10 ** 9, "NaT", 7200 * 10 ** 9], dtype="m8[ns]") + arr = TimedeltaArray(vals) + + evals = np.array([3600 * 10 ** 9, "NaT", -7200 * 10 ** 9], dtype="m8[ns]") + expected = TimedeltaArray(evals) + + result = -arr + tm.assert_timedelta_array_equal(result, expected) + + def test_neg_freq(self): + tdi = pd.timedelta_range("2 Days", periods=4, freq="H") + arr = TimedeltaArray(tdi, freq=tdi.freq) + + expected = TimedeltaArray(-tdi._data, freq=-tdi.freq) + + result = -arr + tm.assert_timedelta_array_equal(result, expected) + + class TestReductions: @pytest.mark.parametrize("name", ["sum", "std", "min", "max", "median"]) @pytest.mark.parametrize("skipna", [True, False]) From 605cc0b8d08fb68fcf0d58fadb6aeaef10061340 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 5 Oct 2020 20:40:18 -0700 Subject: [PATCH 33/38] DEPR: Index.ravel returning an ndarray (#36900) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/indexes/base.py | 6 ++++++ pandas/io/formats/format.py | 3 ++- pandas/tests/indexes/test_common.py | 5 +++++ 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 95628350ad998..7dd3eb51bcaeb 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -268,6 +268,7 @@ Deprecations - Deprecated :meth:`Index.is_all_dates` (:issue:`27744`) - Deprecated automatic alignment on comparison operations between :class:`DataFrame` and :class:`Series`, do ``frame, ser = frame.align(ser, axis=1, copy=False)`` before e.g. ``frame == ser`` (:issue:`28759`) - :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`) +- :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ff3d8bf05f9a5..d603797370ce3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -659,6 +659,12 @@ def ravel(self, order="C"): -------- numpy.ndarray.ravel """ + warnings.warn( + "Index.ravel returning ndarray is deprecated; in a future version " + "this will return a view on self.", + FutureWarning, + stacklevel=2, + ) values = self._get_engine_target() return values.ravel(order=order) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b9d41f142c2b5..13010bb2ef147 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1677,7 +1677,8 @@ def is_dates_only( values: Union[np.ndarray, DatetimeArray, Index, DatetimeIndex] ) -> bool: # return a boolean if we are only dates (and don't have a timezone) - values = values.ravel() + if not isinstance(values, Index): + values = values.ravel() values = DatetimeIndex(values) if values.tz is not None: diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 675ae388a28a4..e2dea7828b3ad 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -399,6 +399,11 @@ def test_astype_preserves_name(self, index, dtype): else: assert result.name == index.name + def test_ravel_deprecation(self, index): + # GH#19956 ravel returning ndarray is deprecated + with tm.assert_produces_warning(FutureWarning): + index.ravel() + @pytest.mark.parametrize("na_position", [None, "middle"]) def test_sort_values_invalid_na_position(index_with_missing, na_position): From 6e34063e9aa244e5ab006f422d727b3445bad741 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 5 Oct 2020 20:44:14 -0700 Subject: [PATCH 34/38] REF: collect reduction tests (#36901) --- pandas/tests/series/test_analytics.py | 77 ----------------------- pandas/tests/series/test_reductions.py | 86 ++++++++++++++++++++++++++ pandas/tests/test_lib.py | 5 +- pandas/tests/test_sorting.py | 2 +- 4 files changed, 89 insertions(+), 81 deletions(-) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 6ba55ce3c74b9..1a469d3e3d88b 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -9,12 +9,6 @@ class TestSeriesAnalytics: - def test_prod_numpy16_bug(self): - s = Series([1.0, 1.0, 1.0], index=range(3)) - result = s.prod() - - assert not isinstance(result, Series) - def test_matmul(self): # matmul test is for GH #10259 a = Series(np.random.randn(4), index=["p", "q", "r", "s"]) @@ -125,74 +119,3 @@ def test_is_monotonic(self): s = Series(list(reversed(s.tolist()))) assert s.is_monotonic is False assert s.is_monotonic_decreasing is True - - @pytest.mark.parametrize("func", [np.any, np.all]) - @pytest.mark.parametrize("kwargs", [dict(keepdims=True), dict(out=object())]) - def test_validate_any_all_out_keepdims_raises(self, kwargs, func): - s = pd.Series([1, 2]) - param = list(kwargs)[0] - name = func.__name__ - - msg = ( - f"the '{param}' parameter is not " - "supported in the pandas " - fr"implementation of {name}\(\)" - ) - with pytest.raises(ValueError, match=msg): - func(s, **kwargs) - - def test_validate_sum_initial(self): - s = pd.Series([1, 2]) - msg = ( - r"the 'initial' parameter is not " - r"supported in the pandas " - r"implementation of sum\(\)" - ) - with pytest.raises(ValueError, match=msg): - np.sum(s, initial=10) - - def 
test_validate_median_initial(self): - s = pd.Series([1, 2]) - msg = ( - r"the 'overwrite_input' parameter is not " - r"supported in the pandas " - r"implementation of median\(\)" - ) - with pytest.raises(ValueError, match=msg): - # It seems like np.median doesn't dispatch, so we use the - # method instead of the ufunc. - s.median(overwrite_input=True) - - def test_validate_stat_keepdims(self): - s = pd.Series([1, 2]) - msg = ( - r"the 'keepdims' parameter is not " - r"supported in the pandas " - r"implementation of sum\(\)" - ) - with pytest.raises(ValueError, match=msg): - np.sum(s, keepdims=True) - - def test_td64_summation_overflow(self): - # GH 9442 - s = pd.Series(pd.date_range("20130101", periods=100000, freq="H")) - s[0] += pd.Timedelta("1s 1ms") - - # mean - result = (s - s.min()).mean() - expected = pd.Timedelta((pd.TimedeltaIndex(s - s.min()).asi8 / len(s)).sum()) - - # the computation is converted to float so - # might be some loss of precision - assert np.allclose(result.value / 1000, expected.value / 1000) - - # sum - msg = "overflow in timedelta operation" - with pytest.raises(ValueError, match=msg): - (s - s.min()).sum() - - s1 = s[0:10000] - with pytest.raises(ValueError, match=msg): - (s1 - s1.min()).sum() - s2 = s[0:1000] - (s2 - s2.min()).sum() diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index be9330a14f9c9..28d29c69f6526 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -1,3 +1,6 @@ +import numpy as np +import pytest + import pandas as pd from pandas import Series @@ -9,3 +12,86 @@ def test_reductions_td64_with_nat(): assert ser.median() == exp assert ser.min() == exp assert ser.max() == exp + + +def test_td64_summation_overflow(): + # GH#9442 + ser = Series(pd.date_range("20130101", periods=100000, freq="H")) + ser[0] += pd.Timedelta("1s 1ms") + + # mean + result = (ser - ser.min()).mean() + expected = pd.Timedelta((pd.TimedeltaIndex(ser - ser.min()).asi8 / len(ser)).sum()) + + # the computation is converted to float so + # might be some loss of precision + assert np.allclose(result.value / 1000, expected.value / 1000) + + # sum + msg = "overflow in timedelta operation" + with pytest.raises(ValueError, match=msg): + (ser - ser.min()).sum() + + s1 = ser[0:10000] + with pytest.raises(ValueError, match=msg): + (s1 - s1.min()).sum() + s2 = ser[0:1000] + (s2 - s2.min()).sum() + + +def test_prod_numpy16_bug(): + ser = Series([1.0, 1.0, 1.0], index=range(3)) + result = ser.prod() + + assert not isinstance(result, Series) + + +@pytest.mark.parametrize("func", [np.any, np.all]) +@pytest.mark.parametrize("kwargs", [dict(keepdims=True), dict(out=object())]) +def test_validate_any_all_out_keepdims_raises(kwargs, func): + ser = Series([1, 2]) + param = list(kwargs)[0] + name = func.__name__ + + msg = ( + f"the '{param}' parameter is not " + "supported in the pandas " + fr"implementation of {name}\(\)" + ) + with pytest.raises(ValueError, match=msg): + func(ser, **kwargs) + + +def test_validate_sum_initial(): + ser = Series([1, 2]) + msg = ( + r"the 'initial' parameter is not " + r"supported in the pandas " + r"implementation of sum\(\)" + ) + with pytest.raises(ValueError, match=msg): + np.sum(ser, initial=10) + + +def test_validate_median_initial(): + ser = Series([1, 2]) + msg = ( + r"the 'overwrite_input' parameter is not " + r"supported in the pandas " + r"implementation of median\(\)" + ) + with pytest.raises(ValueError, match=msg): + # It seems like np.median doesn't dispatch, 
so we use the
+    # method instead of the ufunc.
+    ser.median(overwrite_input=True)
+
+
+def test_validate_stat_keepdims():
+    ser = Series([1, 2])
+    msg = (
+        r"the 'keepdims' parameter is not "
+        r"supported in the pandas "
+        r"implementation of sum\(\)"
+    )
+    with pytest.raises(ValueError, match=msg):
+        np.sum(ser, keepdims=True)
diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py
index b6f59807eaa15..c9c34916be32b 100644
--- a/pandas/tests/test_lib.py
+++ b/pandas/tests/test_lib.py
@@ -1,9 +1,8 @@
 import numpy as np
 import pytest
 
-from pandas._libs import lib, writers as libwriters
+from pandas._libs import Timestamp, lib, writers as libwriters
 
-import pandas as pd
 from pandas import Index
 import pandas._testing as tm
 
@@ -41,7 +40,7 @@ def test_fast_unique_multiple_list_gen_sort(self):
         tm.assert_numpy_array_equal(np.array(out), expected)
 
     def test_fast_unique_multiple_unsortable_runtimewarning(self):
-        arr = [np.array(["foo", pd.Timestamp("2000")])]
+        arr = [np.array(["foo", Timestamp("2000")])]
 
         with tm.assert_produces_warning(RuntimeWarning):
             lib.fast_unique_multiple(arr, sort=None)
diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py
index deb7434694d01..1c9fd46ae451f 100644
--- a/pandas/tests/test_sorting.py
+++ b/pandas/tests/test_sorting.py
@@ -298,7 +298,7 @@ def verify_order(df):
             "outer": np.ones(len(out), dtype="bool"),
         }
 
-        for how in "left", "right", "outer", "inner":
+        for how in ["left", "right", "outer", "inner"]:
             mask = jmask[how]
             frame = align(out[mask].copy())
             assert mask.all() ^ mask.any() or how == "outer"

From 28473372c6617172f11d83e04add387192da6b20 Mon Sep 17 00:00:00 2001
From: Terji Petersen
Date: Tue, 6 Oct 2020 16:33:13 +0100
Subject: [PATCH 35/38] PERF: Index._shallow_copy shares _cache with copies of self (#36840)

* PERF: Index.equals when comparing to copies of self

* refactor _shallow_copy, add GH number

* PERF: share _cache, don't share _id

* rename tests

* fix memory usage test

Co-authored-by: Jeff Reback

---
 doc/source/whatsnew/v1.2.0.rst      |  2 ++
 pandas/core/indexes/base.py         | 10 +++++-----
 pandas/core/indexes/datetimelike.py | 14 ++++++--------
 pandas/core/indexes/interval.py     | 10 +++++-----
 pandas/core/indexes/period.py       | 10 +++++-----
 pandas/core/indexes/range.py        | 10 +++++-----
 pandas/tests/base/test_misc.py      |  3 ++-
 pandas/tests/indexes/common.py      | 26 ++++++++++----------------
 8 files changed, 40 insertions(+), 45 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 7dd3eb51bcaeb..9eea7f5737861 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -285,6 +285,8 @@ Performance improvements
 - ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`)
 - Performance improvement in :meth:`pd.to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`)
 - Performance improvement in setting values on a :class:`IntervalArray` (:issue:`36310`)
+- The internal index method :meth:`~Index._shallow_copy` now shares cached attributes between the new index and the original index,
+  so that they are not computed again if already present on either one. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`)
 - Performance improvement in :meth:`RollingGroupby.count` (:issue:`35625`)
 
 ..
--------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d603797370ce3..4967e13a9855a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -561,12 +561,12 @@ def _shallow_copy(self, values=None, name: Label = no_default): name : Label, defaults to self.name """ name = self.name if name is no_default else name - cache = self._cache.copy() if values is None else {} - if values is None: - values = self._values - result = self._simple_new(values, name=name) - result._cache = cache + if values is not None: + return self._simple_new(values, name=name) + + result = self._simple_new(self._values, name=name) + result._cache = self._cache return result def is_(self, other) -> bool: diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 28b7303ff5218..b5dc20479071c 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -673,17 +673,15 @@ def _with_freq(self, freq): def _shallow_copy(self, values=None, name: Label = lib.no_default): name = self.name if name is lib.no_default else name - cache = self._cache.copy() if values is None else {} - if values is None: - values = self._data - - if isinstance(values, np.ndarray): + if values is not None: # TODO: We would rather not get here - values = type(self._data)(values, dtype=self.dtype) + if isinstance(values, np.ndarray): + values = type(self._data)(values, dtype=self.dtype) + return self._simple_new(values, name=name) - result = type(self)._simple_new(values, name=name) - result._cache = cache + result = self._simple_new(self._data, name=name) + result._cache = self._cache return result # -------------------------------------------------------------------- diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index a56f6a5bb0340..4a877621a94c2 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -335,12 +335,12 @@ def _shallow_copy( self, values: Optional[IntervalArray] = None, name: Label = lib.no_default ): name = self.name if name is lib.no_default else name - cache = self._cache.copy() if values is None else {} - if values is None: - values = self._data - result = self._simple_new(values, name=name) - result._cache = cache + if values is not None: + return self._simple_new(values, name=name) + + result = self._simple_new(self._data, name=name) + result._cache = self._cache return result @cache_readonly diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 27b60747015de..adf7a75b33b38 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -260,12 +260,12 @@ def _has_complex_internals(self) -> bool: def _shallow_copy(self, values=None, name: Label = no_default): name = name if name is not no_default else self.name - cache = self._cache.copy() if values is None else {} - if values is None: - values = self._data - result = self._simple_new(values, name=name) - result._cache = cache + if values is not None: + return self._simple_new(values, name=name) + + result = self._simple_new(self._data, name=name) + result._cache = self._cache return result def _maybe_convert_timedelta(self, other): diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 4dffda2605ef7..d5d9a9b5bc0a3 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -397,13 +397,13 @@ def __iter__(self): def _shallow_copy(self, values=None, 
name: Label = no_default): name = self.name if name is no_default else name - if values is None: - result = self._simple_new(self._range, name=name) - result._cache = self._cache.copy() - return result - else: + if values is not None: return Int64Index._simple_new(values, name=name) + result = self._simple_new(self._range, name=name) + result._cache = self._cache + return result + @doc(Int64Index.copy) def copy(self, name=None, deep=False, dtype=None, names=None): name = self._validate_names(name=name, names=names, deep=deep)[0] diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index b8468a5acf277..2dc2fe6d2ad07 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -128,7 +128,8 @@ def test_memory_usage(index_or_series_obj): ) if len(obj) == 0: - assert res_deep == res == 0 + expected = 0 if isinstance(obj, Index) else 80 + assert res_deep == res == expected elif is_object or is_categorical: # only deep will pick them up assert res_deep > res diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index c40f7b1bc2120..73d2e99d3ff5e 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -935,28 +935,22 @@ def test_contains_requires_hashable_raises(self): with pytest.raises(TypeError, match=msg): {} in idx._engine - def test_copy_copies_cache(self): - # GH32898 + def test_copy_shares_cache(self): + # GH32898, GH36840 idx = self.create_index() idx.get_loc(idx[0]) # populates the _cache. copy = idx.copy() - # check that the copied cache is a copy of the original - assert idx._cache == copy._cache - assert idx._cache is not copy._cache - # cache values should reference the same object - for key, val in idx._cache.items(): - assert copy._cache[key] is val, key + assert copy._cache is idx._cache - def test_shallow_copy_copies_cache(self): - # GH32669 + def test_shallow_copy_shares_cache(self): + # GH32669, GH36840 idx = self.create_index() idx.get_loc(idx[0]) # populates the _cache. 
shallow_copy = idx._shallow_copy() - # check that the shallow_copied cache is a copy of the original - assert idx._cache == shallow_copy._cache - assert idx._cache is not shallow_copy._cache - # cache values should reference the same object - for key, val in idx._cache.items(): - assert shallow_copy._cache[key] is val, key + assert shallow_copy._cache is idx._cache + + shallow_copy = idx._shallow_copy(idx._data) + assert shallow_copy._cache is not idx._cache + assert shallow_copy._cache == {} From 265ffb052df6efc9f89be10d8515864c34fdf2a5 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 7 Oct 2020 10:12:25 -0700 Subject: [PATCH 36/38] checkout upstream versions --- .github/CODE_OF_CONDUCT.md | 1 - ci/azure/posix.yml | 32 +++---- ci/deps/azure-37-slow.yaml | 1 + ci/deps/azure-37.yaml | 28 ++++++ ci/deps/azure-38.yaml | 20 +++++ ci/deps/travis-37-locale.yaml | 22 +++-- ci/deps/travis-38-slow.yaml | 37 ++++++++ ci/travis_process_gbq_encryption.sh | 1 - doc/data/iris.data | 2 +- doc/source/development/contributing.rst | 3 + doc/source/development/developer.rst | 2 +- .../06_calculate_statistics.rst | 5 +- .../getting_started/intro_tutorials/index.rst | 1 - doc/source/getting_started/overview.rst | 1 - .../reference/general_utility_functions.rst | 1 - doc/source/user_guide/advanced.rst | 11 ++- doc/source/user_guide/basics.rst | 36 ++++++-- doc/source/user_guide/categorical.rst | 20 ++++- doc/source/user_guide/computation.rst | 6 +- doc/source/user_guide/cookbook.rst | 41 +++++++-- doc/source/user_guide/groupby.rst | 9 +- doc/source/user_guide/io.rst | 68 ++++++++++++--- doc/source/user_guide/merging.rst | 24 ++++- doc/source/user_guide/missing_data.rst | 5 +- doc/source/user_guide/reshaping.rst | 52 +++++++++-- doc/source/user_guide/text.rst | 28 ++++-- doc/source/user_guide/timedeltas.rst | 5 +- doc/source/user_guide/timeseries.rst | 57 +++++++++--- doc/source/user_guide/visualization.rst | 6 +- doc/source/whatsnew/v0.10.1.rst | 4 +- doc/source/whatsnew/v0.13.1.rst | 8 +- doc/source/whatsnew/v0.14.0.rst | 2 +- doc/source/whatsnew/v0.14.1.rst | 4 +- doc/source/whatsnew/v0.17.0.rst | 4 +- doc/source/whatsnew/v0.19.0.rst | 4 +- doc/source/whatsnew/v0.25.0.rst | 22 +++-- doc/source/whatsnew/v0.25.1.rst | 10 +-- doc/source/whatsnew/v0.25.2.rst | 6 +- doc/source/whatsnew/v0.25.3.rst | 2 +- doc/source/whatsnew/v0.8.0.rst | 4 +- doc/source/whatsnew/v1.0.0.rst | 12 +-- doc/source/whatsnew/v1.0.2.rst | 2 +- doc/source/whatsnew/v1.1.0.rst | 12 +-- doc/source/whatsnew/v1.2.0.rst | 30 ++++++- pandas/_libs/src/klib/khash_python.h | 2 +- pandas/_libs/testing.pyx | 27 +----- pandas/_libs/tslibs/timezones.pyx | 10 ++- pandas/_libs/tslibs/util.pxd | 4 + pandas/_libs/util.pxd | 1 - pandas/tests/arithmetic/common.py | 16 +++- pandas/tests/arithmetic/conftest.py | 15 ++-- pandas/tests/arithmetic/test_datetime64.py | 18 ++-- pandas/tests/arithmetic/test_numeric.py | 50 ++++++++--- pandas/tests/arrays/integer/test_function.py | 14 +++ pandas/tests/arrays/test_datetimes.py | 77 +++++++++++++++- pandas/tests/base/test_value_counts.py | 14 +++ .../tests/frame/apply/test_frame_transform.py | 39 +++++++-- pandas/tests/frame/methods/test_diff.py | 23 +++-- pandas/tests/frame/test_constructors.py | 4 +- pandas/tests/indexes/datetimes/test_setops.py | 2 +- pandas/tests/indexes/multi/test_join.py | 2 +- pandas/tests/indexes/multi/test_setops.py | 6 +- pandas/tests/indexes/ranges/test_setops.py | 48 ++++++++++ pandas/tests/indexes/test_common.py | 87 +++++++++++++++++++ pandas/tests/io/formats/test_info.py | 60 ++++++++++++- 
.../tests/io/json/data/tsframe_iso_v012.json | 2 +- pandas/tests/io/json/data/tsframe_v012.json | 2 +- pandas/tests/io/pytables/test_store.py | 8 ++ pandas/tests/io/test_parquet.py | 46 +++++++++- .../tests/reductions/test_stat_reductions.py | 2 +- pandas/tests/reshape/test_concat.py | 4 +- .../series/apply/test_series_transform.py | 37 +++++++- pandas/tests/series/indexing/test_boolean.py | 13 +-- .../tests/series/indexing/test_multiindex.py | 29 +++++++ pandas/tests/series/test_arithmetic.py | 42 +++++++++ pandas/tests/tslibs/test_timezones.py | 24 ++++- pandas/tests/window/test_grouper.py | 29 +++++++ scripts/generate_pip_deps_from_conda.py | 2 +- scripts/validate_rst_title_capitalization.py | 8 ++ 79 files changed, 1165 insertions(+), 253 deletions(-) create mode 100644 ci/deps/azure-37.yaml create mode 100644 ci/deps/azure-38.yaml create mode 100644 ci/deps/travis-38-slow.yaml diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md index 7dd2e04249492..87a5b7905fc6d 100644 --- a/.github/CODE_OF_CONDUCT.md +++ b/.github/CODE_OF_CONDUCT.md @@ -60,4 +60,3 @@ and the [Swift Code of Conduct][swift]. [homepage]: https://www.contributor-covenant.org [version]: https://www.contributor-covenant.org/version/1/3/0/ [swift]: https://swift.org/community/#code-of-conduct - diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index 9f8174b4fa678..3a9bb14470692 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -20,39 +20,35 @@ jobs: CONDA_PY: "37" PATTERN: "not slow and not network and not clipboard" + py37: + ENV_FILE: ci/deps/azure-37.yaml + CONDA_PY: "37" + PATTERN: "not slow and not network and not clipboard" + py37_locale_slow: ENV_FILE: ci/deps/azure-37-locale_slow.yaml CONDA_PY: "37" PATTERN: "slow" - # pandas does not use the language (zh_CN), but should support different encodings (utf8) - # we should test with encodings different than utf8, but doesn't seem like Ubuntu supports any - LANG: "zh_CN.utf8" - LC_ALL: "zh_CN.utf8" - EXTRA_APT: "language-pack-zh-hans" + LANG: "it_IT.utf8" + LC_ALL: "it_IT.utf8" + EXTRA_APT: "language-pack-it xsel" py37_slow: ENV_FILE: ci/deps/azure-37-slow.yaml CONDA_PY: "37" PATTERN: "slow" - py37_locale: - ENV_FILE: ci/deps/azure-37-locale.yaml - CONDA_PY: "37" - PATTERN: "not slow and not network" - LANG: "it_IT.utf8" - LC_ALL: "it_IT.utf8" - EXTRA_APT: "language-pack-it xsel" - -# py37_32bit: -# ENV_FILE: ci/deps/azure-37-32bit.yaml -# CONDA_PY: "37" -# PATTERN: "not slow and not network and not clipboard" -# BITS32: "yes" + py38: + ENV_FILE: ci/deps/azure-38.yaml + CONDA_PY: "38" + PATTERN: "not slow and not network and not clipboard" py38_locale: ENV_FILE: ci/deps/azure-38-locale.yaml CONDA_PY: "38" PATTERN: "not slow and not network" + # pandas does not use the language (zh_CN), but should support different encodings (utf8) + # we should test with encodings different than utf8, but doesn't seem like Ubuntu supports any LANG: "zh_CN.utf8" LC_ALL: "zh_CN.utf8" EXTRA_APT: "language-pack-zh-hans xsel" diff --git a/ci/deps/azure-37-slow.yaml b/ci/deps/azure-37-slow.yaml index 13a0d442bcae7..50fccf86b6340 100644 --- a/ci/deps/azure-37-slow.yaml +++ b/ci/deps/azure-37-slow.yaml @@ -10,6 +10,7 @@ dependencies: - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 + - pytest-azurepipelines # pandas dependencies - beautifulsoup4 diff --git a/ci/deps/azure-37.yaml b/ci/deps/azure-37.yaml new file mode 100644 index 0000000000000..82cb6760b6d1e --- /dev/null +++ b/ci/deps/azure-37.yaml @@ -0,0 +1,28 @@ +name: pandas-dev +channels: + - defaults 
+ - conda-forge +dependencies: + - python=3.7.* + + # tools + - cython>=0.29.21 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies + - botocore>=1.11 + - fsspec>=0.7.4 + - numpy + - python-dateutil + - nomkl + - pyarrow + - pytz + - s3fs>=0.4.0 + - moto>=1.3.14 + - flask + - tabulate + - pyreadstat + - pip diff --git a/ci/deps/azure-38.yaml b/ci/deps/azure-38.yaml new file mode 100644 index 0000000000000..954e9710f79b9 --- /dev/null +++ b/ci/deps/azure-38.yaml @@ -0,0 +1,20 @@ +name: pandas-dev +channels: + - defaults + - conda-forge +dependencies: + - python=3.8.* + + # tools + - cython>=0.29.21 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies + - numpy + - python-dateutil + - nomkl + - pytz + - tabulate==0.8.3 diff --git a/ci/deps/travis-37-locale.yaml b/ci/deps/travis-37-locale.yaml index ddaf0bea097c7..e93a86910bf34 100644 --- a/ci/deps/travis-37-locale.yaml +++ b/ci/deps/travis-37-locale.yaml @@ -11,7 +11,12 @@ dependencies: - pytest-xdist>=1.21 - hypothesis>=3.58.0 - # pandas dependencies + # required + - numpy + - python-dateutil + - pytz + + # optional - beautifulsoup4 - blosc=1.15.0 - python-blosc @@ -20,22 +25,23 @@ dependencies: - ipython - jinja2 - lxml=4.3.0 - - matplotlib=3.0.* + - matplotlib - nomkl - numexpr - - numpy - openpyxl - pandas-gbq - google-cloud-bigquery>=1.27.2 # GH 36436 - pyarrow>=0.17 - - psycopg2=2.7 - - pymysql=0.7.11 - pytables>=3.5.1 - - python-dateutil - - pytz - scipy - - sqlalchemy=1.3.0 - xarray=0.12.0 - xlrd - xlsxwriter - xlwt + - moto + - flask + + # sql + - psycopg2=2.7 + - pymysql=0.7.11 + - sqlalchemy=1.3.0 diff --git a/ci/deps/travis-38-slow.yaml b/ci/deps/travis-38-slow.yaml new file mode 100644 index 0000000000000..e4b719006a11e --- /dev/null +++ b/ci/deps/travis-38-slow.yaml @@ -0,0 +1,37 @@ +name: pandas-dev +channels: + - defaults + - conda-forge +dependencies: + - python=3.8.* + + # tools + - cython>=0.29.21 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + + # pandas dependencies + - beautifulsoup4 + - fsspec>=0.7.4 + - html5lib + - lxml + - matplotlib + - numexpr + - numpy + - openpyxl + - patsy + - psycopg2 + - pymysql + - pytables + - python-dateutil + - pytz + - s3fs>=0.4.0 + - moto>=1.3.14 + - scipy + - sqlalchemy + - xlrd + - xlsxwriter + - xlwt + - moto + - flask diff --git a/ci/travis_process_gbq_encryption.sh b/ci/travis_process_gbq_encryption.sh index fccf8e1e8deff..b5118ad5defc6 100755 --- a/ci/travis_process_gbq_encryption.sh +++ b/ci/travis_process_gbq_encryption.sh @@ -10,4 +10,3 @@ elif [[ -n ${!TRAVIS_IV_ENV} ]]; then export GBQ_PROJECT_ID='pandas-gbq-tests'; echo 'Successfully decrypted gbq credentials' fi - diff --git a/doc/data/iris.data b/doc/data/iris.data index c19b9c3688515..026e214e5f754 100644 --- a/doc/data/iris.data +++ b/doc/data/iris.data @@ -148,4 +148,4 @@ SepalLength,SepalWidth,PetalLength,PetalWidth,Name 6.3,2.5,5.0,1.9,Iris-virginica 6.5,3.0,5.2,2.0,Iris-virginica 6.2,3.4,5.4,2.3,Iris-virginica -5.9,3.0,5.1,1.8,Iris-virginica \ No newline at end of file +5.9,3.0,5.1,1.8,Iris-virginica diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 17eba825d1c29..ba5530dcbd3ba 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -837,6 +837,9 @@ to run its checks by running:: without having to have done ``pre-commit install`` beforehand. 
+Note that if you have conflicting installations of ``virtualenv``, then you may get an +error - see `here `_. + Backwards compatibility ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst index bdbcf5ca337b8..d701208792a4c 100644 --- a/doc/source/development/developer.rst +++ b/doc/source/development/developer.rst @@ -184,4 +184,4 @@ As an example of fully-formed metadata: 'creator': { 'library': 'pyarrow', 'version': '0.13.0' - }} \ No newline at end of file + }} diff --git a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst index 7e919777fdf03..6ce98ba5dbd1b 100644 --- a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst +++ b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst @@ -123,7 +123,10 @@ aggregating statistics for given columns can be defined using the .. ipython:: python titanic.agg( - {"Age": ["min", "max", "median", "skew"], "Fare": ["min", "max", "median", "mean"]} + { + "Age": ["min", "max", "median", "skew"], + "Fare": ["min", "max", "median", "mean"], + } ) .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/index.rst b/doc/source/getting_started/intro_tutorials/index.rst index 28e7610866461..c67e18043c175 100644 --- a/doc/source/getting_started/intro_tutorials/index.rst +++ b/doc/source/getting_started/intro_tutorials/index.rst @@ -19,4 +19,3 @@ Getting started tutorials 08_combine_dataframes 09_timeseries 10_text_data - diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst index 3043cf25c5312..3d8108d78ac89 100644 --- a/doc/source/getting_started/overview.rst +++ b/doc/source/getting_started/overview.rst @@ -174,4 +174,3 @@ License ------- .. literalinclude:: ../../../LICENSE - diff --git a/doc/source/reference/general_utility_functions.rst b/doc/source/reference/general_utility_functions.rst index 3cba0a81a7011..37fe980dbf68c 100644 --- a/doc/source/reference/general_utility_functions.rst +++ b/doc/source/reference/general_utility_functions.rst @@ -122,4 +122,3 @@ Bug report function :toctree: api/ show_versions - diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index cec777e0f021e..2cd48ac7adb0e 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -304,7 +304,8 @@ whereas a tuple of lists refer to several values within a level: .. ipython:: python s = pd.Series( - [1, 2, 3, 4, 5, 6], index=pd.MultiIndex.from_product([["A", "B"], ["c", "d", "e"]]) + [1, 2, 3, 4, 5, 6], + index=pd.MultiIndex.from_product([["A", "B"], ["c", "d", "e"]]), ) s.loc[[("A", "c"), ("B", "d")]] # list of tuples s.loc[(["A", "B"], ["c", "d"])] # tuple of lists @@ -819,7 +820,9 @@ values **not** in the categories, similarly to how you can reindex **any** panda .. ipython:: python - df3 = pd.DataFrame({"A": np.arange(3), "B": pd.Series(list("abc")).astype("category")}) + df3 = pd.DataFrame( + {"A": np.arange(3), "B": pd.Series(list("abc")).astype("category")} + ) df3 = df3.set_index("B") df3 @@ -934,7 +937,9 @@ example, be millisecond offsets. 
np.random.randn(5, 2), index=np.arange(5) * 250.0, columns=list("AB") ), pd.DataFrame( - np.random.randn(6, 2), index=np.arange(4, 10) * 250.1, columns=list("AB") + np.random.randn(6, 2), + index=np.arange(4, 10) * 250.1, + columns=list("AB"), ), ] ) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 8c01913e55318..53fabf94e24e0 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -464,7 +464,10 @@ which we illustrate: {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]} ) df2 = pd.DataFrame( - {"A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0], "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0]} + { + "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0], + "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0], + } ) df1 df2 @@ -712,7 +715,10 @@ Similarly, you can get the most frequently occurring value(s), i.e. the mode, of s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7]) s5.mode() df5 = pd.DataFrame( - {"A": np.random.randint(0, 7, size=50), "B": np.random.randint(-10, 15, size=50)} + { + "A": np.random.randint(0, 7, size=50), + "B": np.random.randint(-10, 15, size=50), + } ) df5.mode() @@ -1192,7 +1198,9 @@ to :ref:`merging/joining functionality `: .. ipython:: python - s = pd.Series(["six", "seven", "six", "seven", "six"], index=["a", "b", "c", "d", "e"]) + s = pd.Series( + ["six", "seven", "six", "seven", "six"], index=["a", "b", "c", "d", "e"] + ) t = pd.Series({"six": 6.0, "seven": 7.0}) s s.map(t) @@ -1494,7 +1502,9 @@ labels). df = pd.DataFrame( {"x": [1, 2, 3, 4, 5, 6], "y": [10, 20, 30, 40, 50, 60]}, - index=pd.MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["let", "num"]), + index=pd.MultiIndex.from_product( + [["a", "b", "c"], [1, 2]], names=["let", "num"] + ), ) df df.rename_axis(index={"let": "abc"}) @@ -1803,7 +1813,9 @@ used to sort a pandas object by its index levels. } ) - unsorted_df = df.reindex(index=["a", "d", "c", "b"], columns=["three", "two", "one"]) + unsorted_df = df.reindex( + index=["a", "d", "c", "b"], columns=["three", "two", "one"] + ) unsorted_df # DataFrame @@ -1849,7 +1861,9 @@ to use to determine the sorted order. .. ipython:: python - df1 = pd.DataFrame({"one": [2, 1, 1, 1], "two": [1, 3, 2, 4], "three": [5, 4, 3, 2]}) + df1 = pd.DataFrame( + {"one": [2, 1, 1, 1], "two": [1, 3, 2, 4], "three": [5, 4, 3, 2]} + ) df1.sort_values(by="two") The ``by`` parameter can take a list of column names, e.g.: @@ -1994,7 +2008,9 @@ all levels to ``by``. .. ipython:: python - df1.columns = pd.MultiIndex.from_tuples([("a", "one"), ("a", "two"), ("b", "three")]) + df1.columns = pd.MultiIndex.from_tuples( + [("a", "one"), ("a", "two"), ("b", "three")] + ) df1.sort_values(by=("a", "two")) @@ -2245,7 +2261,11 @@ to the correct type. 
import datetime df = pd.DataFrame( - [[1, 2], ["a", "b"], [datetime.datetime(2016, 3, 2), datetime.datetime(2016, 3, 2)]] + [ + [1, 2], + ["a", "b"], + [datetime.datetime(2016, 3, 2), datetime.datetime(2016, 3, 2)], + ] ) df = df.T df diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 67f11bbb45b02..5c43de05fb5b9 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -513,7 +513,11 @@ The ordering of the categorical is determined by the ``categories`` of that colu dfs = pd.DataFrame( { - "A": pd.Categorical(list("bbeebbaa"), categories=["e", "a", "b"], ordered=True), + "A": pd.Categorical( + list("bbeebbaa"), + categories=["e", "a", "b"], + ordered=True, + ), "B": [1, 2, 1, 2, 2, 1, 2, 1], } ) @@ -642,7 +646,13 @@ Groupby will also show "unused" categories: df.groupby("cats").mean() cats2 = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"]) - df2 = pd.DataFrame({"cats": cats2, "B": ["c", "d", "c", "d"], "values": [1, 2, 3, 4]}) + df2 = pd.DataFrame( + { + "cats": cats2, + "B": ["c", "d", "c", "d"], + "values": [1, 2, 3, 4], + } + ) df2.groupby(["cats", "B"]).mean() @@ -1115,7 +1125,11 @@ You can use ``fillna`` to handle missing values before applying a function. .. ipython:: python df = pd.DataFrame( - {"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"], "cats": pd.Categorical([1, 2, 3, 2])} + { + "a": [1, 2, 3, 4], + "b": ["a", "b", "c", "d"], + "cats": pd.Categorical([1, 2, 3, 2]), + } ) df.apply(lambda row: type(row["cats"]), axis=1) df.apply(lambda col: col.dtype, axis=0) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 2f6ac6b06d85e..75fb3380821d8 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -787,7 +787,11 @@ can even be omitted: .. ipython:: python - covs = df[["B", "C", "D"]].rolling(window=50).cov(df[["A", "B", "C"]], pairwise=True) + covs = ( + df[["B", "C", "D"]] + .rolling(window=50) + .cov(df[["A", "B", "C"]], pairwise=True) + ) covs.loc["2002-09-22":] .. ipython:: python diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 214b8a680fa7e..939acf10d6c0b 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -266,7 +266,9 @@ New columns .. ipython:: python - df = pd.DataFrame({"AAA": [1, 1, 1, 2, 2, 2, 3, 3], "BBB": [2, 1, 3, 4, 5, 1, 2, 3]}) + df = pd.DataFrame( + {"AAA": [1, 1, 1, 2, 2, 2, 3, 3], "BBB": [2, 1, 3, 4, 5, 1, 2, 3]} + ) df Method 1 : idxmin() to get the index of the minimums @@ -327,7 +329,9 @@ Arithmetic .. ipython:: python - cols = pd.MultiIndex.from_tuples([(x, y) for x in ["A", "B", "C"] for y in ["O", "I"]]) + cols = pd.MultiIndex.from_tuples( + [(x, y) for x in ["A", "B", "C"] for y in ["O", "I"]] + ) df = pd.DataFrame(np.random.randn(2, 6), index=["n", "m"], columns=cols) df df = df.div(df["C"], level=1) @@ -566,7 +570,9 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. 
ipython:: python - df = pd.DataFrame({"Color": "Red Red Red Blue".split(), "Value": [100, 150, 50, 50]}) + df = pd.DataFrame( + {"Color": "Red Red Red Blue".split(), "Value": [100, 150, 50, 50]} + ) df df["Counts"] = df.groupby(["Color"]).transform(len) df @@ -648,7 +654,10 @@ Create a list of dataframes, split using a delineation based on logic included i dfs = list( zip( *df.groupby( - (1 * (df["Case"] == "B")).cumsum().rolling(window=3, min_periods=1).median() + (1 * (df["Case"] == "B")) + .cumsum() + .rolling(window=3, min_periods=1) + .median() ) ) )[-1] @@ -740,7 +749,18 @@ The :ref:`Pivot ` docs. "yes", ], "Passed": ["yes" if x > 50 else "no" for x in grades], - "Employed": [True, True, True, False, False, False, False, True, True, False], + "Employed": [ + True, + True, + True, + False, + False, + False, + False, + True, + True, + False, + ], "Grade": grades, } ) @@ -791,7 +811,9 @@ Apply return pd.Series(aList) - df_orgz = pd.concat({ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()}) + df_orgz = pd.concat( + {ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()} + ) df_orgz `Rolling apply with a DataFrame returning a Series @@ -1162,7 +1184,12 @@ Option 1: pass rows explicitly to skip rows from io import StringIO pd.read_csv( - StringIO(data), sep=";", skiprows=[11, 12], index_col=0, parse_dates=True, header=10 + StringIO(data), + sep=";", + skiprows=[11, 12], + index_col=0, + parse_dates=True, + header=10, ) Option 2: read column names and then data diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 6427cea6fa510..e8866daa9d99f 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -267,7 +267,9 @@ the length of the ``groups`` dict, so it is largely just a convenience: height = np.random.normal(60, 10, size=n) time = pd.date_range("1/1/2000", periods=n) gender = np.random.choice(["male", "female"], size=n) - df = pd.DataFrame({"height": height, "weight": weight, "gender": gender}, index=time) + df = pd.DataFrame( + {"height": height, "weight": weight, "gender": gender}, index=time + ) .. ipython:: python @@ -767,7 +769,10 @@ For example, suppose we wished to standardize the data within each group: ts.head() ts.tail() - transformed = ts.groupby(lambda x: x.year).transform(lambda x: (x - x.mean()) / x.std()) + transformed = ts.groupby(lambda x: x.year).transform( + lambda x: (x - x.mean()) / x.std() + ) + We would expect the result to now have mean 0 and standard deviation 1 within each group, which we can easily check: diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index ae22ee836cd8c..0b24ff61d87b8 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -986,7 +986,12 @@ Note that ``infer_datetime_format`` is sensitive to ``dayfirst``. With .. ipython:: python # Try to infer the format for the index column - df = pd.read_csv("foo.csv", index_col=0, parse_dates=True, infer_datetime_format=True) + df = pd.read_csv( + "foo.csv", + index_col=0, + parse_dates=True, + infer_datetime_format=True, + ) df .. ipython:: python @@ -1046,9 +1051,19 @@ writing to a file). 
For example: val = "0.3066101993807095471566981359501369297504425048828125" data = "a,b,c\n1,2,{0}".format(val) - abs(pd.read_csv(StringIO(data), engine="c", float_precision=None)["c"][0] - float(val)) abs( - pd.read_csv(StringIO(data), engine="c", float_precision="high")["c"][0] - float(val) + pd.read_csv( + StringIO(data), + engine="c", + float_precision=None, + )["c"][0] - float(val) + ) + abs( + pd.read_csv( + StringIO(data), + engine="c", + float_precision="high", + )["c"][0] - float(val) ) abs( pd.read_csv(StringIO(data), engine="c", float_precision="round_trip")["c"][0] @@ -2517,7 +2532,12 @@ columns to strings. .. code-block:: python url_mcc = "https://en.wikipedia.org/wiki/Mobile_country_code" - dfs = pd.read_html(url_mcc, match="Telekom Albania", header=0, converters={"MNC": str}) + dfs = pd.read_html( + url_mcc, + match="Telekom Albania", + header=0, + converters={"MNC": str}, + ) Use some combination of the above: @@ -3570,7 +3590,12 @@ HDFStore will by default not drop rows that are all missing. This behavior can b .. ipython:: python - df_with_missing = pd.DataFrame({"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]}) + df_with_missing = pd.DataFrame( + { + "col1": [0, np.nan, 2], + "col2": [1, np.nan, np.nan], + } + ) df_with_missing df_with_missing.to_hdf("file.h5", "df_with_missing", format="table", mode="w") @@ -3944,7 +3969,8 @@ specified in the format: ``()``, where float may be signed (and fra { "A": pd.Timestamp("20130101"), "B": [ - pd.Timestamp("20130101") + timedelta(days=i, seconds=10) for i in range(10) + pd.Timestamp("20130101") + timedelta(days=i, seconds=10) + for i in range(10) ], } ) @@ -4241,7 +4267,11 @@ results. store.select("df2_mt") # as a multiple - store.select_as_multiple(["df1_mt", "df2_mt"], where=["A>0", "B>0"], selector="df1_mt") + store.select_as_multiple( + ["df1_mt", "df2_mt"], + where=["A>0", "B>0"], + selector="df1_mt", + ) Delete from a table @@ -4797,8 +4827,16 @@ Read only certain columns of a parquet file. .. ipython:: python - result = pd.read_parquet("example_fp.parquet", engine="fastparquet", columns=["a", "b"]) - result = pd.read_parquet("example_pa.parquet", engine="pyarrow", columns=["a", "b"]) + result = pd.read_parquet( + "example_fp.parquet", + engine="fastparquet", + columns=["a", "b"], + ) + result = pd.read_parquet( + "example_pa.parquet", + engine="pyarrow", + columns=["a", "b"], + ) result.dtypes @@ -5176,7 +5214,11 @@ to pass to :func:`pandas.to_datetime`: .. code-block:: python pd.read_sql_table("data", engine, parse_dates={"Date": "%Y-%m-%d"}) - pd.read_sql_table("data", engine, parse_dates={"Date": {"format": "%Y-%m-%d %H:%M:%S"}}) + pd.read_sql_table( + "data", + engine, + parse_dates={"Date": {"format": "%Y-%m-%d %H:%M:%S"}}, + ) You can check if a table exists using :func:`~pandas.io.sql.has_table` @@ -5593,7 +5635,11 @@ avoid converting categorical columns into ``pd.Categorical``: .. code-block:: python - df = pd.read_spss("spss_data.sav", usecols=["foo", "bar"], convert_categoricals=False) + df = pd.read_spss( + "spss_data.sav", + usecols=["foo", "bar"], + convert_categoricals=False, + ) More information about the SAV and ZSAV file formats is available here_. diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 8dbfc261e6fa8..eeac0ed4837dd 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -154,6 +154,14 @@ functionality below. frames = [ process_your_file(f) for f in files ] result = pd.concat(frames) +.. 
note:: + + When concatenating DataFrames with named axes, pandas will attempt to preserve + these index/column names whenever possible. In the case where all inputs share a + common name, this name will be assigned to the result. When the input names do + not all agree, the result will be unnamed. The same is true for :class:`MultiIndex`, + but the logic is applied separately on a level-by-level basis. + Set logic on the other axes ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1065,7 +1073,9 @@ join key), using ``join`` may be more convenient. Here is a simple example: .. ipython:: python - result = pd.merge(left, right, left_on="key", right_index=True, how="left", sort=False) + result = pd.merge( + left, right, left_on="key", right_index=True, how="left", sort=False + ) .. ipython:: python :suppress: @@ -1196,7 +1206,9 @@ the left argument, as in this example: left = pd.DataFrame({"v1": range(12)}, index=leftindex) left - rightindex = pd.MultiIndex.from_product([list("abc"), list("xy")], names=["abc", "xy"]) + rightindex = pd.MultiIndex.from_product( + [list("abc"), list("xy")], names=["abc", "xy"] + ) right = pd.DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex) right @@ -1210,7 +1222,9 @@ done using the following code. leftindex = pd.MultiIndex.from_tuples( [("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"] ) - left = pd.DataFrame({"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=leftindex) + left = pd.DataFrame( + {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=leftindex + ) rightindex = pd.MultiIndex.from_tuples( [("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"] @@ -1376,7 +1390,9 @@ one object from values for matching indices in the other. Here is an example: .. ipython:: python - df1 = pd.DataFrame([[np.nan, 3.0, 5.0], [-4.6, np.nan, np.nan], [np.nan, 7.0, np.nan]]) + df1 = pd.DataFrame( + [[np.nan, 3.0, 5.0], [-4.6, np.nan, np.nan], [np.nan, 7.0, np.nan]] + ) df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5.0, 1.6, 4]], index=[1, 2]) For this, use the :meth:`~DataFrame.combine_first` method: diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 7eb377694910b..e6d06aa6bd1a0 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -400,7 +400,10 @@ You can also interpolate with a DataFrame: .. ipython:: python df = pd.DataFrame( - {"A": [1, 2.1, np.nan, 4.7, 5.6, 6.8], "B": [0.25, np.nan, np.nan, 4, 12.2, 14.4]} + { + "A": [1, 2.1, np.nan, 4.7, 5.6, 6.8], + "B": [0.25, np.nan, np.nan, 4, 12.2, 14.4], + } ) df df.interpolate() diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 2061185b25416..77cf43b2e2b19 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -238,7 +238,13 @@ calling ``sort_index``, of course). Here is a more complex example: .. 
ipython:: python columns = pd.MultiIndex.from_tuples( - [("A", "cat"), ("B", "dog"), ("B", "cat"), ("A", "dog")], names=["exp", "animal"] + [ + ("A", "cat"), + ("B", "dog"), + ("B", "cat"), + ("A", "dog"), + ], + names=["exp", "animal"], ) index = pd.MultiIndex.from_product( [("bar", "baz", "foo", "qux"), ("one", "two")], names=["first", "second"] @@ -426,7 +432,12 @@ We can produce pivot tables from this data very easily: pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"]) pd.pivot_table(df, values="D", index=["B"], columns=["A", "C"], aggfunc=np.sum) - pd.pivot_table(df, values=["D", "E"], index=["B"], columns=["A", "C"], aggfunc=np.sum) + pd.pivot_table( + df, values=["D", "E"], + index=["B"], + columns=["A", "C"], + aggfunc=np.sum, + ) The result object is a ``DataFrame`` having potentially hierarchical indexes on the rows and columns. If the ``values`` column name is not given, the pivot table @@ -800,14 +811,26 @@ parameter. .. ipython:: python - df.pivot_table(values="val0", index="row", columns="col", aggfunc="mean", fill_value=0) + df.pivot_table( + values="val0", + index="row", + columns="col", + aggfunc="mean", + fill_value=0, + ) Also note that we can pass in other aggregation functions as well. For example, we can also pass in ``sum``. .. ipython:: python - df.pivot_table(values="val0", index="row", columns="col", aggfunc="sum", fill_value=0) + df.pivot_table( + values="val0", + index="row", + columns="col", + aggfunc="sum", + fill_value=0, + ) Another aggregation we can do is calculate the frequency in which the columns and rows occur together a.k.a. "cross tabulation". To do this, we can pass @@ -825,21 +848,36 @@ We can also perform multiple aggregations. For example, to perform both a .. ipython:: python - df.pivot_table(values="val0", index="row", columns="col", aggfunc=["mean", "sum"]) + df.pivot_table( + values="val0", + index="row", + columns="col", + aggfunc=["mean", "sum"], + ) Note to aggregate over multiple value columns, we can pass in a list to the ``values`` parameter. .. ipython:: python - df.pivot_table(values=["val0", "val1"], index="row", columns="col", aggfunc=["mean"]) + df.pivot_table( + values=["val0", "val1"], + index="row", + columns="col", + aggfunc=["mean"], + ) Note to subdivide over multiple columns we can pass in a list to the ``columns`` parameter. .. ipython:: python - df.pivot_table(values=["val0"], index="row", columns=["item", "col"], aggfunc=["mean"]) + df.pivot_table( + values=["val0"], + index="row", + columns=["item", "col"], + aggfunc=["mean"], + ) .. _reshaping.explode: diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 2ada09117273d..2b27d37904599 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -261,7 +261,8 @@ i.e., from the end of the string to the beginning of the string: .. ipython:: python s3 = pd.Series( - ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], dtype="string" + ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], + dtype="string", ) s3 s3.str.replace("^.a|dog", "XX-XX ", case=False) @@ -515,7 +516,10 @@ DataFrame with one column per group. .. ipython:: python - pd.Series(["a1", "b2", "c3"], dtype="string").str.extract(r"([ab])(\d)", expand=False) + pd.Series( + ["a1", "b2", "c3"], + dtype="string", + ).str.extract(r"([ab])(\d)", expand=False) Elements that do not match return a row filled with ``NaN``. 
Thus, a Series of messy strings can be "converted" into a like-indexed Series @@ -536,7 +540,10 @@ and optional groups like .. ipython:: python - pd.Series(["a1", "b2", "3"], dtype="string").str.extract(r"([ab])?(\d)", expand=False) + pd.Series( + ["a1", "b2", "3"], + dtype="string", + ).str.extract(r"([ab])?(\d)", expand=False) can also be used. Note that any capture group names in the regular expression will be used for column names; otherwise capture group @@ -655,19 +662,28 @@ You can check whether elements contain a pattern: .. ipython:: python pattern = r"[0-9][a-z]" - pd.Series(["1", "2", "3a", "3b", "03c", "4dx"], dtype="string").str.contains(pattern) + pd.Series( + ["1", "2", "3a", "3b", "03c", "4dx"], + dtype="string", + ).str.contains(pattern) Or whether elements match a pattern: .. ipython:: python - pd.Series(["1", "2", "3a", "3b", "03c", "4dx"], dtype="string").str.match(pattern) + pd.Series( + ["1", "2", "3a", "3b", "03c", "4dx"], + dtype="string", + ).str.match(pattern) .. versionadded:: 1.1.0 .. ipython:: python - pd.Series(["1", "2", "3a", "3b", "03c", "4dx"], dtype="string").str.fullmatch(pattern) + pd.Series( + ["1", "2", "3a", "3b", "03c", "4dx"], + dtype="string", + ).str.fullmatch(pattern) .. note:: diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst index cb265d34229dd..0b4ddaaa8a42a 100644 --- a/doc/source/user_guide/timedeltas.rst +++ b/doc/source/user_guide/timedeltas.rst @@ -409,7 +409,10 @@ Similarly to other of the datetime-like indices, ``DatetimeIndex`` and ``PeriodI .. ipython:: python - s = pd.Series(np.arange(100), index=pd.timedelta_range("1 days", periods=100, freq="h")) + s = pd.Series( + np.arange(100), + index=pd.timedelta_range("1 days", periods=100, freq="h"), + ) s Selections work similarly, with coercion on string-likes and slices: diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index be2c67521dc5d..9fbd02df50d10 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -317,7 +317,9 @@ which can be specified. These are computed from the starting point specified by .. ipython:: python - pd.to_datetime([1349720105, 1349806505, 1349892905, 1349979305, 1350065705], unit="s") + pd.to_datetime( + [1349720105, 1349806505, 1349892905, 1349979305, 1350065705], unit="s" + ) pd.to_datetime( [1349720105100, 1349720105200, 1349720105300, 1349720105400, 1349720105500], @@ -707,7 +709,9 @@ If the timestamp string is treated as a slice, it can be used to index ``DataFra .. ipython:: python :okwarning: - dft_minute = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=series_minute.index) + dft_minute = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6]}, index=series_minute.index + ) dft_minute["2011-12-31 23"] @@ -748,10 +752,11 @@ With no defaults. .. 
ipython:: python dft[ - datetime.datetime(2013, 1, 1, 10, 12, 0): datetime.datetime(2013, 2, 28, 10, 12, 0) + datetime.datetime(2013, 1, 1, 10, 12, 0): datetime.datetime( + 2013, 2, 28, 10, 12, 0 + ) ] - Truncating & fancy indexing ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1036,8 +1041,15 @@ As an interesting example, let's look at Egypt where a Friday-Saturday weekend i # They also observe International Workers' Day so let's # add that for a couple of years - holidays = ["2012-05-01", datetime.datetime(2013, 5, 1), np.datetime64("2014-05-01")] - bday_egypt = pd.offsets.CustomBusinessDay(holidays=holidays, weekmask=weekmask_egypt) + holidays = [ + "2012-05-01", + datetime.datetime(2013, 5, 1), + np.datetime64("2014-05-01"), + ] + bday_egypt = pd.offsets.CustomBusinessDay( + holidays=holidays, + weekmask=weekmask_egypt, + ) dt = datetime.datetime(2013, 4, 30) dt + 2 * bday_egypt @@ -1417,7 +1429,12 @@ An example of how holidays and holiday calendars are defined: rules = [ USMemorialDay, Holiday("July 4th", month=7, day=4, observance=nearest_workday), - Holiday("Columbus Day", month=10, day=1, offset=pd.DateOffset(weekday=MO(2))), + Holiday( + "Columbus Day", + month=10, + day=1, + offset=pd.DateOffset(weekday=MO(2)), + ), ] @@ -2279,7 +2296,12 @@ To return ``dateutil`` time zone objects, append ``dateutil/`` before the string rng_dateutil.tz # dateutil - utc special case - rng_utc = pd.date_range("3/6/2012 00:00", periods=3, freq="D", tz=dateutil.tz.tzutc()) + rng_utc = pd.date_range( + "3/6/2012 00:00", + periods=3, + freq="D", + tz=dateutil.tz.tzutc(), + ) rng_utc.tz .. versionadded:: 0.25.0 @@ -2287,7 +2309,12 @@ To return ``dateutil`` time zone objects, append ``dateutil/`` before the string .. ipython:: python # datetime.timezone - rng_utc = pd.date_range("3/6/2012 00:00", periods=3, freq="D", tz=datetime.timezone.utc) + rng_utc = pd.date_range( + "3/6/2012 00:00", + periods=3, + freq="D", + tz=datetime.timezone.utc, + ) rng_utc.tz Note that the ``UTC`` time zone is a special case in ``dateutil`` and should be constructed explicitly @@ -2440,10 +2467,18 @@ control over how they are handled. .. ipython:: python pd.Timestamp( - datetime.datetime(2019, 10, 27, 1, 30, 0, 0), tz="dateutil/Europe/London", fold=0 + datetime.datetime(2019, 10, 27, 1, 30, 0, 0), + tz="dateutil/Europe/London", + fold=0, ) pd.Timestamp( - year=2019, month=10, day=27, hour=1, minute=30, tz="dateutil/Europe/London", fold=1 + year=2019, + month=10, + day=27, + hour=1, + minute=30, + tz="dateutil/Europe/London", + fold=1, ) .. _timeseries.timezone_ambiguous: diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index a6c3d9814b03d..6ad7ad9657e30 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1453,7 +1453,11 @@ Here is an example of one way to easily plot group means with standard deviation ) df3 = pd.DataFrame( - {"data1": [3, 2, 4, 3, 2, 4, 3, 2], "data2": [6, 5, 7, 5, 4, 5, 6, 5]}, index=ix3 + { + "data1": [3, 2, 4, 3, 2, 4, 3, 2], + "data2": [6, 5, 7, 5, 4, 5, 6, 5], + }, + index=ix3, ) # Group by index labels and take the means and standard deviations diff --git a/doc/source/whatsnew/v0.10.1.rst b/doc/source/whatsnew/v0.10.1.rst index d71a0d5ca68cd..611ac2021fcec 100644 --- a/doc/source/whatsnew/v0.10.1.rst +++ b/doc/source/whatsnew/v0.10.1.rst @@ -180,7 +180,9 @@ combined result, by using ``where`` on a selector table. 
store.select("df2_mt") # as a multiple - store.select_as_multiple(["df1_mt", "df2_mt"], where=["A>0", "B>0"], selector="df1_mt") + store.select_as_multiple( + ["df1_mt", "df2_mt"], where=["A>0", "B>0"], selector="df1_mt" + ) .. ipython:: python :suppress: diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst index 1215786b4cccc..249b9555b7fd4 100644 --- a/doc/source/whatsnew/v0.13.1.rst +++ b/doc/source/whatsnew/v0.13.1.rst @@ -101,7 +101,9 @@ Output formatting enhancements .. ipython:: python - df = pd.DataFrame([pd.Timestamp("20010101"), pd.Timestamp("20040601")], columns=["age"]) + df = pd.DataFrame( + [pd.Timestamp("20010101"), pd.Timestamp("20040601")], columns=["age"] + ) df["today"] = pd.Timestamp("20130419") df["diff"] = df["today"] - df["age"] df @@ -206,7 +208,9 @@ Enhancements .. code-block:: python # Try to infer the format for the index column - df = pd.read_csv("foo.csv", index_col=0, parse_dates=True, infer_datetime_format=True) + df = pd.read_csv( + "foo.csv", index_col=0, parse_dates=True, infer_datetime_format=True + ) - ``date_format`` and ``datetime_format`` keywords can now be specified when writing to ``excel`` files (:issue:`4133`) diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index 421ef81427210..f2401c812a979 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -1084,4 +1084,4 @@ Bug fixes Contributors ~~~~~~~~~~~~ -.. contributors:: v0.13.1..v0.14.0 \ No newline at end of file +.. contributors:: v0.13.1..v0.14.0 diff --git a/doc/source/whatsnew/v0.14.1.rst b/doc/source/whatsnew/v0.14.1.rst index 78fd182ea86c3..a8f8955c3c1b9 100644 --- a/doc/source/whatsnew/v0.14.1.rst +++ b/doc/source/whatsnew/v0.14.1.rst @@ -124,7 +124,9 @@ Enhancements .. ipython:: python - rng = pd.date_range("3/6/2012 00:00", periods=10, freq="D", tz="dateutil/Europe/London") + rng = pd.date_range( + "3/6/2012 00:00", periods=10, freq="D", tz="dateutil/Europe/London" + ) rng.tz See :ref:`the docs `. diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst index 1658f877f5523..d8f39a7d6e3c0 100644 --- a/doc/source/whatsnew/v0.17.0.rst +++ b/doc/source/whatsnew/v0.17.0.rst @@ -786,7 +786,9 @@ Previous behavior: .. ipython:: python - df_with_missing = pd.DataFrame({"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]}) + df_with_missing = pd.DataFrame( + {"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]} + ) df_with_missing diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 08ccc1565125f..2ac7b0f54361b 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -1091,7 +1091,9 @@ Previously, most ``Index`` classes returned ``np.ndarray``, and ``DatetimeIndex` .. ipython:: python pd.Index([1, 2, 3]).unique() - pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz="Asia/Tokyo").unique() + pd.DatetimeIndex( + ["2011-01-01", "2011-01-02", "2011-01-03"], tz="Asia/Tokyo" + ).unique() .. _whatsnew_0190.api.multiindex: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 43b42c5cb5648..37b661b87068d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -33,7 +33,7 @@ Enhancements .. 
_whatsnew_0250.enhancements.agg_relabel: -Groupby aggregation with relabeling +GroupBy aggregation with relabeling ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ pandas has added special groupby behavior, known as "named aggregation", for naming the @@ -85,7 +85,7 @@ See :ref:`groupby.aggregate.named` for more. .. _whatsnew_0250.enhancements.multiple_lambdas: -Groupby aggregation with multiple lambdas +GroupBy aggregation with multiple lambdas ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You can now provide multiple lambda functions to a list-like aggregation in @@ -161,7 +161,7 @@ To restore the previous behaviour of a single threshold, set .. _whatsnew_0250.enhancements.json_normalize_with_max_level: -Json normalize with max_level param support +JSON normalize with max_level param support ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :func:`json_normalize` normalizes the provided input dict to all @@ -308,7 +308,7 @@ would be reassigned as -1. (:issue:`19387`) .. _whatsnew_0250.api_breaking.groupby_apply_first_group_once: -``Groupby.apply`` on ``DataFrame`` evaluates first group only once +``GroupBy.apply`` on ``DataFrame`` evaluates first group only once ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The implementation of :meth:`DataFrameGroupBy.apply() ` @@ -422,7 +422,7 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t .. _whatsnew_0250.api_breaking.groupby_categorical: -Categorical dtypes are preserved during groupby +Categorical dtypes are preserved during GroupBy ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Previously, columns that were categorical, but not the groupby key(s) would be converted to ``object`` dtype during groupby operations. pandas now will preserve these dtypes. (:issue:`18502`) @@ -483,7 +483,7 @@ values are coerced to floating point, which may result in loss of precision. See :ref:`indexing.set_ops` for more. -``DataFrame`` groupby ffill/bfill no longer return group labels +``DataFrame`` GroupBy ffill/bfill no longer return group labels ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The methods ``ffill``, ``bfill``, ``pad`` and ``backfill`` of @@ -513,7 +513,7 @@ are returned. (:issue:`21521`) df.groupby("a").ffill() -``DataFrame`` describe on an empty categorical / object column will return top and freq +``DataFrame`` describe on an empty Categorical / object column will return top and freq ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ When calling :meth:`DataFrame.describe` with an empty categorical / object @@ -1085,7 +1085,6 @@ Conversion - Bug in :func:`DataFrame.astype()` when passing a dict of columns and types the ``errors`` parameter was ignored. 
(:issue:`25905`) - -- Strings ^^^^^^^ @@ -1139,8 +1138,8 @@ MultiIndex - Bug in which incorrect exception raised by :class:`Timedelta` when testing the membership of :class:`MultiIndex` (:issue:`24570`) - -I/O -^^^ +IO +^^ - Bug in :func:`DataFrame.to_html()` where values were truncated using display options instead of outputting the full content (:issue:`17004`) - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) @@ -1182,9 +1181,8 @@ Plotting - Fixed bug causing plots of :class:`PeriodIndex` timeseries to fail if the frequency is a multiple of the frequency rule code (:issue:`14763`) - Fixed bug when plotting a :class:`DatetimeIndex` with ``datetime.timezone.utc`` timezone (:issue:`17173`) - -- -Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`pandas.core.resample.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 8a16bab63f1bf..cc24ba5d6557c 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -6,8 +6,8 @@ What's new in 0.25.1 (August 21, 2019) These are the changes in pandas 0.25.1. See :ref:`release` for a full changelog including other versions of pandas. -I/O and LZMA -~~~~~~~~~~~~ +IO and LZMA +~~~~~~~~~~~ Some users may unknowingly have an incomplete Python installation lacking the ``lzma`` module from the standard library. In this case, ``import pandas`` failed due to an ``ImportError`` (:issue:`27575`). pandas will now warn, rather than raising an ``ImportError`` if the ``lzma`` module is not present. Any subsequent attempt to use ``lzma`` methods will raise a ``RuntimeError``. @@ -67,8 +67,8 @@ Missing - Bug in :func:`pandas.isnull` or :func:`pandas.isna` when the input is a type e.g. ``type(pandas.Series())`` (:issue:`27482`) -I/O -^^^ +IO +^^ - Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`) - Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`) @@ -82,7 +82,7 @@ Plotting :meth:`pandas.plotting.deregister_matplotlib_converters` (:issue:`27481`). - Fix compatibility issue with matplotlib when passing a pandas ``Index`` to a plot call (:issue:`27775`). -Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Fixed regression in :meth:`pands.core.groupby.DataFrameGroupBy.quantile` raising when multiple quantiles are given (:issue:`27526`) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index a5ea8933762ab..ab6aaebe4ed06 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -21,14 +21,14 @@ Indexing - Fix regression in :meth:`DataFrame.reindex` not following the ``limit`` argument (:issue:`28631`). - Fix regression in :meth:`RangeIndex.get_indexer` for decreasing :class:`RangeIndex` where target values may be improperly identified as missing/present (:issue:`28678`) -I/O -^^^ +IO +^^ - Fix regression in notebook display where ``
`` tags were missing for :attr:`DataFrame.index` values (:issue:`28204`). - Regression in :meth:`~DataFrame.to_csv` where writing a :class:`Series` or :class:`DataFrame` indexed by an :class:`IntervalIndex` would incorrectly raise a ``TypeError`` (:issue:`28210`) - Fix :meth:`~DataFrame.to_csv` with ``ExtensionArray`` with list-like values (:issue:`28840`). -Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`). diff --git a/doc/source/whatsnew/v0.25.3.rst b/doc/source/whatsnew/v0.25.3.rst index f7f54198a0f82..e028c08e1e85c 100644 --- a/doc/source/whatsnew/v0.25.3.rst +++ b/doc/source/whatsnew/v0.25.3.rst @@ -11,7 +11,7 @@ including other versions of pandas. Bug fixes ~~~~~~~~~ -Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`DataFrameGroupBy.quantile` where NA values in the grouping could cause segfaults or incorrect results (:issue:`28882`) diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst index 8a84630a28b34..b34c2a5c6a07c 100644 --- a/doc/source/whatsnew/v0.8.0.rst +++ b/doc/source/whatsnew/v0.8.0.rst @@ -178,7 +178,9 @@ types. For example, ``'kde'`` is a new option: .. ipython:: python - s = pd.Series(np.concatenate((np.random.randn(1000), np.random.randn(1000) * 0.5 + 3))) + s = pd.Series( + np.concatenate((np.random.randn(1000), np.random.randn(1000) * 0.5 + 3)) + ) plt.figure() s.hist(density=True, alpha=0.2) s.plot(kind="kde") diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ddc40d6d40594..8f9ceb30a947a 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -196,7 +196,7 @@ You can use the alias ``"boolean"`` as well. .. _whatsnew_100.convert_dtypes: -``convert_dtypes`` method to ease use of supported extension dtypes +Method ``convert_dtypes`` to ease use of supported extension dtypes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In order to encourage use of the extension dtypes ``StringDtype``, @@ -1082,13 +1082,11 @@ Timedelta ^^^^^^^^^ - Bug in subtracting a :class:`TimedeltaIndex` or :class:`TimedeltaArray` from a ``np.datetime64`` object (:issue:`29558`) - -- Timezones ^^^^^^^^^ - -- Numeric @@ -1113,7 +1111,6 @@ Numeric Conversion ^^^^^^^^^^ -- - Strings @@ -1152,7 +1149,6 @@ Indexing Missing ^^^^^^^ -- - MultiIndex @@ -1162,8 +1158,8 @@ MultiIndex - Series and MultiIndex ``.drop`` with ``MultiIndex`` raise exception if labels not in given in level (:issue:`8594`) - -I/O -^^^ +IO +^^ - :meth:`read_csv` now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`) - Bug in :meth:`DataFrame.to_json` where using a Tuple as a column or index value and using ``orient="columns"`` or ``orient="index"`` would produce invalid JSON (:issue:`20500`) @@ -1203,7 +1199,7 @@ Plotting - Allow :meth:`DataFrame.plot.scatter` to plot ``objects`` and ``datetime`` type data (:issue:`18755`, :issue:`30391`) - Bug in :meth:`DataFrame.hist`, ``xrot=0`` does not work with ``by`` and subplots (:issue:`30288`). 
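The ``xrot`` fix noted just above is easiest to see with a grouped histogram,
where ``by=`` produces one subplot per group and the x-tick labels should now
stay horizontal on every subplot. A minimal sketch, assuming ``matplotlib`` is
installed; the column and group names are illustrative only:

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame(
       {
           "length": np.random.randn(100),
           "species": np.random.choice(["cat", "dog"], size=100),
       }
   )

   # One subplot per species; with the fix, xrot=0 is applied to each
   # of those subplots rather than being ignored when ``by`` is given.
   df.hist(column="length", by="species", xrot=0)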
-Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`core.groupby.DataFrameGroupBy.apply` only showing output from a single group when function returns an :class:`Index` (:issue:`28652`) diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index c3f144e2f0cb3..3f7c6e85e14ca 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -47,7 +47,7 @@ Fixed regressions .. --------------------------------------------------------------------------- -Indexing with Nullable Boolean Arrays +Indexing with nullable boolean arrays ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Previously indexing with a nullable Boolean array containing ``NA`` would raise a ``ValueError``, however this is now permitted with ``NA`` being treated as ``False``. (:issue:`31503`) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 54ed407ed0a0a..e054ac830ce41 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -42,7 +42,7 @@ For example, the below now works: .. _whatsnew_110.period_index_partial_string_slicing: -Non-monotonic PeriodIndex Partial String Slicing +Non-monotonic PeriodIndex partial string slicing ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :class:`PeriodIndex` now supports partial string slicing for non-monotonic indexes, mirroring :class:`DatetimeIndex` behavior (:issue:`31096`) @@ -413,7 +413,7 @@ And the differences in reindexing ``df`` with ``mi_2`` and using ``method='pad'` .. _whatsnew_110.notable_bug_fixes.indexing_raises_key_errors: -Failed Label-Based Lookups Always Raise KeyError +Failed label-based lookups always raise KeyError ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Label lookups ``series[key]``, ``series.loc[key]`` and ``frame.loc[key]`` @@ -786,7 +786,7 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. -Development Changes +Development changes ^^^^^^^^^^^^^^^^^^^ - The minimum version of Cython is now the most recent bug-fix version (0.29.16) (:issue:`33334`). @@ -1051,8 +1051,8 @@ MultiIndex - Bug when joining two :class:`MultiIndex` without specifying level with different columns. Return-indexers parameter was ignored. (:issue:`34074`) -I/O -^^^ +IO +^^ - Passing a ``set`` as ``names`` argument to :func:`pandas.read_csv`, :func:`pandas.read_table`, or :func:`pandas.read_fwf` will raise ``ValueError: Names should be an ordered collection.`` (:issue:`34946`) - Bug in print-out when ``display.precision`` is zero. (:issue:`20359`) - Bug in :func:`read_json` where integer overflow was occurring when json contains big number strings. 
(:issue:`30320`) @@ -1108,7 +1108,7 @@ Plotting - Bug in :meth:`pandas.plotting.bootstrap_plot` was causing cluttered axes and overlapping labels (:issue:`34905`) - Bug in :meth:`DataFrame.plot.scatter` caused an error when plotting variable marker sizes (:issue:`32904`) -Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Using a :class:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew``, ``cov``, ``corr`` will now return correct results for any monotonic :class:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 9eea7f5737861..5be9155b3ff0b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -157,6 +157,26 @@ Alternatively, you can also use the dtype object: behaviour or API may still change without warning. Expecially the behaviour regarding NaN (distinct from NA missing values) is subject to change. +.. _whatsnew_120.index_name_preservation: + +Index/column name preservation when aggregating +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When aggregating using :meth:`concat` or the :class:`DataFrame` constructor, Pandas +will attempt to preserve index (and column) names whenever possible (:issue:`35847`). +In the case where all inputs share a common name, this name will be assigned to the +result. When the input names do not all agree, the result will be unnamed. Here is an +example where the index name is preserved: + +.. ipython:: python + + idx = pd.Index(range(5), name='abc') + ser = pd.Series(range(5, 10), index=idx) + pd.concat({'x': ser[1:], 'y': ser[:-1]}, axis=1) + +The same is true for :class:`MultiIndex`, but the logic is applied separately on a +level-by-level basis. + .. _whatsnew_120.enhancements.other: Other enhancements @@ -169,6 +189,7 @@ Other enhancements - :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) - :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`) +- Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) .. 
_whatsnew_120.api_breaking.python: @@ -336,6 +357,8 @@ Numeric - Bug in :class:`Series` where two :class:`Series` each have a :class:`DatetimeIndex` with different timezones having those indexes incorrectly changed when performing arithmetic operations (:issue:`33671`) - Bug in :meth:`pd._testing.assert_almost_equal` was incorrect for complex numeric types (:issue:`28235`) - Bug in :meth:`DataFrame.__rmatmul__` error handling reporting transposed shapes (:issue:`21581`) +- Bug in :class:`Series` flex arithmetic methods where the result when operating with a ``list``, ``tuple`` or ``np.ndarray`` would have an incorrect name (:issue:`36760`) +- Bug in :class:`IntegerArray` multiplication with ``timedelta`` and ``np.timedelta64`` objects (:issue:`36870`) Conversion ^^^^^^^^^^ @@ -363,7 +386,6 @@ Indexing - Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) - Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where int64 arrays are returned instead of intp. (:issue:`36359`) - Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result. (:issue:`32334`) -- Bug in indexing with boolean masks on datetime-like values sometimes returning a view instead of a copy (:issue:`36210`) Missing ^^^^^^^ @@ -375,6 +397,7 @@ MultiIndex ^^^^^^^^^^ - Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message ``"Expected label or tuple of labels"`` (:issue:`35301`) +- Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`) - I/O @@ -392,6 +415,7 @@ I/O - Bug in :meth:`read_csv` with ``engine='python'`` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`) - Removed ``private_key`` and ``verbose`` from :func:`read_gbq` as they are no longer supported in ``pandas-gbq`` (:issue:`34654`, :issue:`30200`) - Bumped minimum pytables version to 3.5.1 to avoid a ``ValueError`` in :meth:`read_hdf` (:issue:`24839`) +- Bug in :meth:`read_parquet` with fixed offset timezones. String representation of timezones was not recognized (:issue:`35997`, :issue:`36004`) Plotting ^^^^^^^^ @@ -424,6 +448,7 @@ Reshaping - Bug in :func:`union_indexes` where input index names are not preserved in some cases. 
Affects :func:`concat` and :class:`DataFrame` constructor (:issue:`13475`) - Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`) - Bug in :meth:`DataFrame.agg` with ``func={'name':}`` incorrectly raising ``TypeError`` when ``DataFrame.columns==['Name']`` (:issue:`36212`) +- Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was dictionary (:issue:`35811`) - Sparse @@ -437,7 +462,7 @@ ExtensionArray - Fixed Bug where :class:`DataFrame` column set to scalar extension type via a dict instantion was considered an object type rather than the extension type (:issue:`35965`) - Fixed bug where ``astype()`` with equal dtype and ``copy=False`` would return a new object (:issue:`284881`) -- +- Fixed bug when applying a NumPy ufunc with multiple outputs to a :class:`pandas.arrays.IntegerArray` returning None (:issue:`36913`) Other @@ -446,7 +471,6 @@ Other - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`) - Fixed metadata propagation in the :class:`Series.dt` accessor (:issue:`28283`) -- Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was dictionary (:issue:`35811`) - Bug in :meth:`Index.union` behaving differently depending on whether operand is a :class:`Index` or other list-like (:issue:`36384`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 82251744915a5..2b46d30c3adb6 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -121,4 +121,4 @@ void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) { void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khint_t val) { kh_resize_str(table->table, val); -} \ No newline at end of file +} diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index b2f19fcf5f5da..7a2fa471b9ba8 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -7,36 +7,13 @@ from numpy cimport import_array import_array() -from pandas._libs.util cimport is_array from pandas._libs.lib import is_complex +from pandas._libs.util cimport is_array, is_real_number_object from pandas.core.dtypes.common import is_dtype_equal from pandas.core.dtypes.missing import array_equivalent, isna -cdef NUMERIC_TYPES = ( - bool, - int, - float, - np.bool_, - np.int8, - np.int16, - np.int32, - np.int64, - np.uint8, - np.uint16, - np.uint32, - np.uint64, - np.float16, - np.float32, - np.float64, -) - - -cdef bint is_comparable_as_number(obj): - return isinstance(obj, NUMERIC_TYPES) - - cdef bint isiterable(obj): return hasattr(obj, '__iter__') @@ -200,7 +177,7 @@ cpdef assert_almost_equal(a, b, # object comparison return True - if is_comparable_as_number(a) and is_comparable_as_number(b): + if is_real_number_object(a) and is_real_number_object(b): if array_equivalent(a, b, strict_nan=True): # inf comparison return True diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index b82291a71057e..3deabc57ec522 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -1,4 +1,4 @@ -from datetime import timezone +from 
datetime import timedelta, timezone from cpython.datetime cimport datetime, timedelta, tzinfo @@ -102,6 +102,14 @@ cpdef inline tzinfo maybe_get_tz(object tz): # On Python 3 on Windows, the filename is not always set correctly. if isinstance(tz, _dateutil_tzfile) and '.tar.gz' in tz._filename: tz._filename = zone + elif tz[0] in {'-', '+'}: + hours = int(tz[0:3]) + minutes = int(tz[0] + tz[4:6]) + tz = timezone(timedelta(hours=hours, minutes=minutes)) + elif tz[0:4] in {'UTC-', 'UTC+'}: + hours = int(tz[3:6]) + minutes = int(tz[3] + tz[7:9]) + tz = timezone(timedelta(hours=hours, minutes=minutes)) else: tz = pytz.timezone(tz) elif is_integer_object(tz): diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index e280609bb17a7..1f79a1ea7b6d1 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -121,6 +121,10 @@ cdef inline bint is_bool_object(object obj) nogil: PyObject_TypeCheck(obj, &PyBoolArrType_Type)) +cdef inline bint is_real_number_object(object obj) nogil: + return is_bool_object(obj) or is_integer_object(obj) or is_float_object(obj) + + cdef inline bint is_timedelta64_object(object obj) nogil: """ Cython equivalent of `isinstance(val, np.timedelta64)` diff --git a/pandas/_libs/util.pxd b/pandas/_libs/util.pxd index 828bccf7d5641..5f234910deede 100644 --- a/pandas/_libs/util.pxd +++ b/pandas/_libs/util.pxd @@ -48,4 +48,3 @@ cdef inline void set_array_not_contiguous(ndarray ao) nogil: # ao->flags &= ~(NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS); PyArray_CLEARFLAGS(ao, (NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS)) - diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py index a663c2f3a0175..e26bb513838a5 100644 --- a/pandas/tests/arithmetic/common.py +++ b/pandas/tests/arithmetic/common.py @@ -6,6 +6,7 @@ from pandas import DataFrame, Index, Series, array as pd_array import pandas._testing as tm +from pandas.core.arrays import PandasArray def assert_invalid_addsub_type(left, right, msg=None): @@ -56,18 +57,25 @@ def assert_invalid_comparison(left, right, box): # Note: not quite the same as how we do this for tm.box_expected xbox = box if box not in [Index, pd_array] else np.array - result = left == right + def xbox2(x): + # Eventually we'd like this to be tighter, but for now we'll + # just exclude PandasArray[bool] + if isinstance(x, PandasArray): + return x._ndarray + return x + + result = xbox2(left == right) expected = xbox(np.zeros(result.shape, dtype=np.bool_)) tm.assert_equal(result, expected) - result = right == left + result = xbox2(right == left) tm.assert_equal(result, expected) - result = left != right + result = xbox2(left != right) tm.assert_equal(result, ~expected) - result = right != left + result = xbox2(right != left) tm.assert_equal(result, ~expected) msg = "|".join( diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index 6286711ac6113..47baf4e76f8c3 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -2,6 +2,7 @@ import pytest import pandas as pd +import pandas._testing as tm # ------------------------------------------------------------------ # Helper Functions @@ -221,19 +222,19 @@ def mismatched_freq(request): # ------------------------------------------------------------------ -@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame], ids=id_func) -def box(request): +@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame, pd.array], ids=id_func) +def box_with_array(request): """ - Several 
array-like containers that should have effectively identical - behavior with respect to arithmetic operations. + Fixture to test behavior for Index, Series, DataFrame, and pandas Array + classes """ return request.param -@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame, pd.array], ids=id_func) -def box_with_array(request): +@pytest.fixture(params=[pd.Index, pd.Series, tm.to_array, np.array, list], ids=id_func) +def box_1d_array(request): """ - Fixture to test behavior for Index, Series, DataFrame, and pandas Array + Fixture to test behavior for Index, Series, tm.to_array, numpy Array and list classes """ return request.param diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index e9dc83d106651..c0ae36017f47a 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -205,8 +205,6 @@ def test_nat_comparisons(self, dtype, index_or_series, reverse, pair): def test_comparison_invalid(self, tz_naive_fixture, box_with_array): # GH#4968 # invalid date/int comparisons - if box_with_array is pd.array: - pytest.xfail("assert_invalid_comparison doesnt handle BooleanArray yet") tz = tz_naive_fixture ser = Series(range(5)) ser2 = Series(pd.date_range("20010101", periods=5, tz=tz)) @@ -226,32 +224,36 @@ def test_comparison_invalid(self, tz_naive_fixture, box_with_array): ) @pytest.mark.parametrize("dtype", [None, object]) def test_nat_comparisons_scalar(self, dtype, data, box_with_array): + box = box_with_array if box_with_array is tm.to_array and dtype is object: # dont bother testing ndarray comparison methods as this fails # on older numpys (since they check object identity) return - if box_with_array is pd.array and dtype is object: - pytest.xfail("reversed comparisons give BooleanArray, not ndarray") - xbox = ( - box_with_array if box_with_array not in [pd.Index, pd.array] else np.ndarray - ) + xbox = box if box not in [pd.Index, pd.array] else np.ndarray left = Series(data, dtype=dtype) - left = tm.box_expected(left, box_with_array) + left = tm.box_expected(left, box) expected = [False, False, False] expected = tm.box_expected(expected, xbox) + if box is pd.array and dtype is object: + expected = pd.array(expected, dtype="bool") + tm.assert_equal(left == NaT, expected) tm.assert_equal(NaT == left, expected) expected = [True, True, True] expected = tm.box_expected(expected, xbox) + if box is pd.array and dtype is object: + expected = pd.array(expected, dtype="bool") tm.assert_equal(left != NaT, expected) tm.assert_equal(NaT != left, expected) expected = [False, False, False] expected = tm.box_expected(expected, xbox) + if box is pd.array and dtype is object: + expected = pd.array(expected, dtype="bool") tm.assert_equal(left < NaT, expected) tm.assert_equal(NaT > left, expected) tm.assert_equal(left <= NaT, expected) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index df98b43e11f4a..04ba41307d0ef 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -11,11 +11,19 @@ import pytest import pandas as pd -from pandas import Index, Series, Timedelta, TimedeltaIndex +from pandas import Index, Int64Index, Series, Timedelta, TimedeltaIndex, array import pandas._testing as tm from pandas.core import ops +@pytest.fixture(params=[Index, Series, tm.to_array]) +def box_pandas_1d_array(request): + """ + Fixture to test behavior for Index, Series and tm.to_array classes + """ + return request.param + + def 
adjust_negative_zero(zero, expected): """ Helper to adjust the expected result if we are dividing by -0.0 @@ -187,10 +195,6 @@ def test_ops_series(self): def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box_with_array): # GH#19333 box = box_with_array - if box is pd.array: - pytest.xfail( - "we get a PandasArray[timedelta64[ns]] instead of TimedeltaArray" - ) index = numeric_idx expected = pd.TimedeltaIndex([pd.Timedelta(days=n) for n in range(5)]) @@ -214,8 +218,6 @@ def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box_with_array): ) def test_numeric_arr_mul_tdscalar_numexpr_path(self, scalar_td, box_with_array): box = box_with_array - if box is pd.array: - pytest.xfail("IntegerArray.__mul__ doesnt handle timedeltas") arr = np.arange(2 * 10 ** 4).astype(np.int64) obj = tm.box_expected(arr, box, transpose=False) @@ -231,8 +233,6 @@ def test_numeric_arr_mul_tdscalar_numexpr_path(self, scalar_td, box_with_array): def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box_with_array): box = box_with_array - if box is pd.array: - pytest.xfail("We get PandasArray[td64] instead of TimedeltaArray") index = numeric_idx[1:3] @@ -263,8 +263,6 @@ def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box_with_array ) def test_add_sub_timedeltalike_invalid(self, numeric_idx, other, box_with_array): box = box_with_array - if box is pd.array: - pytest.xfail("PandasArray[int].__add__ doesnt raise on td64") left = tm.box_expected(numeric_idx, box) msg = ( @@ -1340,3 +1338,33 @@ def test_dataframe_div_silenced(): ) with tm.assert_produces_warning(None): pdf1.div(pdf2, fill_value=0) + + +@pytest.mark.parametrize( + "data, expected_data", + [([0, 1, 2], [0, 2, 4])], +) +def test_integer_array_add_list_like( + box_pandas_1d_array, box_1d_array, data, expected_data +): + # GH22606 Verify operators with IntegerArray and list-likes + arr = array(data, dtype="Int64") + container = box_pandas_1d_array(arr) + left = container + box_1d_array(data) + right = box_1d_array(data) + container + + if Series == box_pandas_1d_array: + assert_function = tm.assert_series_equal + expected = Series(expected_data, dtype="Int64") + elif Series == box_1d_array: + assert_function = tm.assert_series_equal + expected = Series(expected_data, dtype="object") + elif Index in (box_pandas_1d_array, box_1d_array): + assert_function = tm.assert_index_equal + expected = Int64Index(expected_data) + else: + assert_function = tm.assert_numpy_array_equal + expected = np.array(expected_data, dtype="object") + + assert_function(left, expected) + assert_function(right, expected) diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 8f64c9c0900f1..9cdea1c71f109 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -64,6 +64,20 @@ def test_ufuncs_binary_int(ufunc): tm.assert_extension_array_equal(result, expected) +def test_ufunc_binary_output(): + a = integer_array([1, 2, np.nan]) + result = np.modf(a) + expected = np.modf(a.to_numpy(na_value=np.nan, dtype="float")) + + assert isinstance(result, tuple) + assert len(result) == 2 + + for x, y in zip(result, expected): + # TODO(FloatArray): This will return an extension array. 
+ # y = integer_array(y) + tm.assert_numpy_array_equal(x, y) + + @pytest.mark.parametrize("values", [[0, 1], [0, None]]) def test_ufunc_reduce_raises(values): a = integer_array(values) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index e7605125e7420..9f136b4979bb7 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -454,8 +454,9 @@ def test_tz_dtype_matches(self): class TestReductions: - @pytest.mark.parametrize("tz", [None, "US/Central"]) - def test_min_max(self, tz): + @pytest.fixture + def arr1d(self, tz_naive_fixture): + tz = tz_naive_fixture dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]") arr = DatetimeArray._from_sequence( [ @@ -468,6 +469,11 @@ def test_min_max(self, tz): ], dtype=dtype, ) + return arr + + def test_min_max(self, arr1d): + arr = arr1d + tz = arr.tz result = arr.min() expected = pd.Timestamp("2000-01-02", tz=tz) @@ -493,3 +499,70 @@ def test_min_max_empty(self, skipna, tz): result = arr.max(skipna=skipna) assert result is pd.NaT + + @pytest.mark.parametrize("tz", [None, "US/Central"]) + @pytest.mark.parametrize("skipna", [True, False]) + def test_median_empty(self, skipna, tz): + dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]") + arr = DatetimeArray._from_sequence([], dtype=dtype) + result = arr.median(skipna=skipna) + assert result is pd.NaT + + arr = arr.reshape(0, 3) + result = arr.median(axis=0, skipna=skipna) + expected = type(arr)._from_sequence([pd.NaT, pd.NaT, pd.NaT], dtype=arr.dtype) + tm.assert_equal(result, expected) + + result = arr.median(axis=1, skipna=skipna) + expected = type(arr)._from_sequence([pd.NaT], dtype=arr.dtype) + tm.assert_equal(result, expected) + + def test_median(self, arr1d): + arr = arr1d + + result = arr.median() + assert result == arr[0] + result = arr.median(skipna=False) + assert result is pd.NaT + + result = arr.dropna().median(skipna=False) + assert result == arr[0] + + result = arr.median(axis=0) + assert result == arr[0] + + def test_median_axis(self, arr1d): + arr = arr1d + assert arr.median(axis=0) == arr.median() + assert arr.median(axis=0, skipna=False) is pd.NaT + + msg = r"abs\(axis\) must be less than ndim" + with pytest.raises(ValueError, match=msg): + arr.median(axis=1) + + @pytest.mark.filterwarnings("ignore:All-NaN slice encountered:RuntimeWarning") + def test_median_2d(self, arr1d): + arr = arr1d.reshape(1, -1) + + # axis = None + assert arr.median() == arr1d.median() + assert arr.median(skipna=False) is pd.NaT + + # axis = 0 + result = arr.median(axis=0) + expected = arr1d + tm.assert_equal(result, expected) + + # Since column 3 is all-NaT, we get NaT there with or without skipna + result = arr.median(axis=0, skipna=False) + expected = arr1d + tm.assert_equal(result, expected) + + # axis = 1 + result = arr.median(axis=1) + expected = type(arr)._from_sequence([arr1d.median()]) + tm.assert_equal(result, expected) + + result = arr.median(axis=1, skipna=False) + expected = type(arr)._from_sequence([pd.NaT], dtype=arr.dtype) + tm.assert_equal(result, expected) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index de04c30432e6f..73a41e7010c5f 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -274,3 +274,17 @@ def test_value_counts_datetime64(index_or_series): td2 = klass(td2, name="dt") result2 = td2.value_counts() tm.assert_series_equal(result2, expected_s) + + 
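
# A minimal sketch of the dropna semantics that the new test below pins down
# (an illustration assuming this branch, not a line of the diff itself): with
# dropna=False, np.nan and pd.NA are counted together in one missing bucket.
import numpy as np
import pandas as pd

ser = pd.Series([True, pd.NA, np.nan])
print(ser.value_counts(dropna=True))   # only non-missing values: True -> 1
print(ser.value_counts(dropna=False))  # missing bucket included: <NA> -> 2, True -> 1
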
+@pytest.mark.parametrize("dropna", [True, False]) +def test_value_counts_with_nan(dropna, index_or_series): + # GH31944 + klass = index_or_series + values = [True, pd.NA, np.nan] + s = klass(values) + res = s.value_counts(dropna=dropna) + if dropna is True: + expected = Series([1], index=[True]) + else: + expected = Series([2, 1], index=[pd.NA, True]) + tm.assert_series_equal(res, expected) diff --git a/pandas/tests/frame/apply/test_frame_transform.py b/pandas/tests/frame/apply/test_frame_transform.py index 346e60954fc13..01c6fd4ec08f0 100644 --- a/pandas/tests/frame/apply/test_frame_transform.py +++ b/pandas/tests/frame/apply/test_frame_transform.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from pandas import DataFrame, MultiIndex +from pandas import DataFrame, MultiIndex, Series import pandas._testing as tm from pandas.core.base import SpecificationError from pandas.core.groupby.base import transformation_kernels @@ -41,9 +41,15 @@ def test_transform_groupby_kernel(axis, float_frame, op): @pytest.mark.parametrize( - "ops, names", [([np.sqrt], ["sqrt"]), ([np.abs, np.sqrt], ["absolute", "sqrt"])] + "ops, names", + [ + ([np.sqrt], ["sqrt"]), + ([np.abs, np.sqrt], ["absolute", "sqrt"]), + (np.array([np.sqrt]), ["sqrt"]), + (np.array([np.abs, np.sqrt]), ["absolute", "sqrt"]), + ], ) -def test_transform_list(axis, float_frame, ops, names): +def test_transform_listlike(axis, float_frame, ops, names): # GH 35964 other_axis = 1 if axis in {0, "index"} else 0 with np.errstate(all="ignore"): @@ -56,7 +62,14 @@ def test_transform_list(axis, float_frame, ops, names): tm.assert_frame_equal(result, expected) -def test_transform_dict(axis, float_frame): +@pytest.mark.parametrize("ops", [[], np.array([])]) +def test_transform_empty_listlike(float_frame, ops): + with pytest.raises(ValueError, match="No transform functions were provided"): + float_frame.transform(ops) + + +@pytest.mark.parametrize("box", [dict, Series]) +def test_transform_dictlike(axis, float_frame, box): # GH 35964 if axis == 0 or axis == "index": e = float_frame.columns[0] @@ -64,10 +77,26 @@ def test_transform_dict(axis, float_frame): else: e = float_frame.index[0] expected = float_frame.iloc[[0]].transform(np.abs) - result = float_frame.transform({e: np.abs}, axis=axis) + result = float_frame.transform(box({e: np.abs}), axis=axis) tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "ops", + [ + {}, + {"A": []}, + {"A": [], "B": "cumsum"}, + {"A": "cumsum", "B": []}, + {"A": [], "B": ["cumsum"]}, + {"A": ["cumsum"], "B": []}, + ], +) +def test_transform_empty_dictlike(float_frame, ops): + with pytest.raises(ValueError, match="No transform functions were provided"): + float_frame.transform(ops) + + @pytest.mark.parametrize("use_apply", [True, False]) def test_transform_udf(axis, float_frame, use_apply): # GH 35964 diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index 0486fb2d588b6..42586c14092f2 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -7,6 +7,11 @@ class TestDataFrameDiff: + def test_diff_requires_integer(self): + df = pd.DataFrame(np.random.randn(2, 2)) + with pytest.raises(ValueError, match="periods must be an integer"): + df.diff(1.5) + def test_diff(self, datetime_frame): the_diff = datetime_frame.diff(1) @@ -31,9 +36,7 @@ def test_diff(self, datetime_frame): df = pd.DataFrame({"y": pd.Series([2]), "z": pd.Series([3])}) df.insert(0, "x", 1) result = df.diff(axis=1) - expected = pd.DataFrame( - {"x": 
np.nan, "y": pd.Series(1), "z": pd.Series(1)} - ).astype("float64") + expected = pd.DataFrame({"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)}) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("tz", [None, "UTC"]) @@ -116,19 +119,13 @@ def test_diff_axis(self): df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]]) ) - @pytest.mark.xfail( - reason="GH#32995 needs to operate column-wise or do inference", - raises=AssertionError, - ) def test_diff_period(self): # GH#32995 Don't pass an incorrect axis - # TODO(EA2D): this bug wouldn't have happened with 2D EA pi = pd.date_range("2016-01-01", periods=3).to_period("D") df = pd.DataFrame({"A": pi}) result = df.diff(1, axis=1) - # TODO: should we make Block.diff do type inference? or maybe algos.diff? expected = (df - pd.NaT).astype(object) tm.assert_frame_equal(result, expected) @@ -141,6 +138,14 @@ def test_diff_axis1_mixed_dtypes(self): result = df.diff(axis=1) tm.assert_frame_equal(result, expected) + # GH#21437 mixed-float-dtypes + df = pd.DataFrame( + {"a": np.arange(3, dtype="float32"), "b": np.arange(3, dtype="float64")} + ) + result = df.diff(axis=1) + expected = pd.DataFrame({"a": df["a"] * np.nan, "b": df["b"] * 0}) + tm.assert_frame_equal(result, expected) + def test_diff_axis1_mixed_dtypes_large_periods(self): # GH#32995 operate column-wise when we have mixed dtypes and axis=1 df = pd.DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index b5e211895672a..8ec11d14cd606 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1637,8 +1637,8 @@ def test_constructor_Series_differently_indexed(self): "name_in1,name_in2,name_in3,name_out", [ ("idx", "idx", "idx", "idx"), - ("idx", "idx", None, "idx"), - ("idx", None, None, "idx"), + ("idx", "idx", None, None), + ("idx", None, None, None), ("idx1", "idx2", None, None), ("idx1", "idx1", "idx2", None), ("idx1", "idx2", "idx3", None), diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 102c8f97a8a6b..a8baf67273490 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -473,7 +473,7 @@ def test_intersection_list(self): values = [pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")] idx = pd.DatetimeIndex(values, name="a") res = idx.intersection(values) - tm.assert_index_equal(res, idx) + tm.assert_index_equal(res, idx.rename(None)) def test_month_range_union_tz_pytz(self, sort): from pytz import timezone diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py index 6be9ec463ce36..562d07d283293 100644 --- a/pandas/tests/indexes/multi/test_join.py +++ b/pandas/tests/indexes/multi/test_join.py @@ -46,7 +46,7 @@ def test_join_level_corner_case(idx): def test_join_self(idx, join_type): joined = idx.join(idx, how=join_type) - assert idx is joined + tm.assert_index_equal(joined, idx) def test_join_multi(): diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 6d4928547cad1..0b17c1c4c9679 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -243,10 +243,10 @@ def test_union(idx, sort): # corner case, pass self or empty thing: the_union = idx.union(idx, sort=sort) - assert the_union is idx + tm.assert_index_equal(the_union, idx) the_union = idx.union(idx[:0], 
sort=sort) - assert the_union is idx + tm.assert_index_equal(the_union, idx) # FIXME: dont leave commented-out # won't work in python 3 @@ -278,7 +278,7 @@ def test_intersection(idx, sort): # corner case, pass self the_int = idx.intersection(idx, sort=sort) - assert the_int is idx + tm.assert_index_equal(the_int, idx) # empty intersection: disjoint empty = idx[:2].intersection(idx[2:], sort=sort) diff --git a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py index 5b565310cfb9c..9c9f5dbdf7e7f 100644 --- a/pandas/tests/indexes/ranges/test_setops.py +++ b/pandas/tests/indexes/ranges/test_setops.py @@ -239,3 +239,51 @@ def test_union_sorted(self, unions): res3 = idx1._int64index.union(idx2, sort=None) tm.assert_index_equal(res2, expected_sorted, exact=True) tm.assert_index_equal(res3, expected_sorted) + + def test_difference(self): + # GH#12034 Cases where we operate against another RangeIndex and may + # get back another RangeIndex + obj = RangeIndex.from_range(range(1, 10), name="foo") + + result = obj.difference(obj) + expected = RangeIndex.from_range(range(0), name="foo") + tm.assert_index_equal(result, expected) + + result = obj.difference(expected.rename("bar")) + tm.assert_index_equal(result, obj.rename(None)) + + result = obj.difference(obj[:3]) + tm.assert_index_equal(result, obj[3:]) + + result = obj.difference(obj[-3:]) + tm.assert_index_equal(result, obj[:-3]) + + result = obj.difference(obj[2:6]) + expected = Int64Index([1, 2, 7, 8, 9], name="foo") + tm.assert_index_equal(result, expected) + + def test_symmetric_difference(self): + # GH#12034 Cases where we operate against another RangeIndex and may + # get back another RangeIndex + left = RangeIndex.from_range(range(1, 10), name="foo") + + result = left.symmetric_difference(left) + expected = RangeIndex.from_range(range(0), name="foo") + tm.assert_index_equal(result, expected) + + result = left.symmetric_difference(expected.rename("bar")) + tm.assert_index_equal(result, left.rename(None)) + + result = left[:-2].symmetric_difference(left[2:]) + expected = Int64Index([1, 2, 8, 9], name="foo") + tm.assert_index_equal(result, expected) + + right = RangeIndex.from_range(range(10, 15)) + + result = left.symmetric_difference(right) + expected = RangeIndex.from_range(range(1, 15)) + tm.assert_index_equal(result, expected) + + result = left.symmetric_difference(right[1:]) + expected = Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14]) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index e2dea7828b3ad..94b10572fb5e1 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -124,6 +124,93 @@ def test_corner_union(self, index, fname, sname, expected_name): expected = index.drop(index).set_names(expected_name) tm.assert_index_equal(union, expected) + @pytest.mark.parametrize( + "fname, sname, expected_name", + [ + ("A", "A", "A"), + ("A", "B", None), + ("A", None, None), + (None, "B", None), + (None, None, None), + ], + ) + def test_union_unequal(self, index, fname, sname, expected_name): + if isinstance(index, MultiIndex) or not index.is_unique: + pytest.skip("Not for MultiIndex or repeated indices") + + # test copy.union(subset) - need sort for unicode and string + first = index.copy().set_names(fname) + second = index[1:].set_names(sname) + union = first.union(second).sort_values() + expected = index.set_names(expected_name).sort_values() + tm.assert_index_equal(union, expected) + 
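
# A small sketch (illustration only, not diff content) of the name-propagation
# rule the union/intersection tests in this hunk encode: a set operation keeps
# the name only when both operands carry the same name; otherwise it is dropped.
import pandas as pd

idx = pd.Index([1, 2, 3], name="A")
assert idx.union(pd.Index([3, 4], name="A")).name == "A"    # names agree -> kept
assert idx.union(pd.Index([3, 4], name="B")).name is None   # names differ -> dropped
assert idx.intersection(pd.Index([2, 5], name="B")).name is None
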
+ @pytest.mark.parametrize( + "fname, sname, expected_name", + [ + ("A", "A", "A"), + ("A", "B", None), + ("A", None, None), + (None, "B", None), + (None, None, None), + ], + ) + def test_corner_intersect(self, index, fname, sname, expected_name): + # GH35847 + # Test intersections with various name combinations + + if isinstance(index, MultiIndex) or not index.is_unique: + pytest.skip("Not for MultiIndex or repeated indices") + + # Test copy.intersection(copy) + first = index.copy().set_names(fname) + second = index.copy().set_names(sname) + intersect = first.intersection(second) + expected = index.copy().set_names(expected_name) + tm.assert_index_equal(intersect, expected) + + # Test copy.intersection(empty) + first = index.copy().set_names(fname) + second = index.drop(index).set_names(sname) + intersect = first.intersection(second) + expected = index.drop(index).set_names(expected_name) + tm.assert_index_equal(intersect, expected) + + # Test empty.intersection(copy) + first = index.drop(index).set_names(fname) + second = index.copy().set_names(sname) + intersect = first.intersection(second) + expected = index.drop(index).set_names(expected_name) + tm.assert_index_equal(intersect, expected) + + # Test empty.intersection(empty) + first = index.drop(index).set_names(fname) + second = index.drop(index).set_names(sname) + intersect = first.intersection(second) + expected = index.drop(index).set_names(expected_name) + tm.assert_index_equal(intersect, expected) + + @pytest.mark.parametrize( + "fname, sname, expected_name", + [ + ("A", "A", "A"), + ("A", "B", None), + ("A", None, None), + (None, "B", None), + (None, None, None), + ], + ) + def test_intersect_unequal(self, index, fname, sname, expected_name): + if isinstance(index, MultiIndex) or not index.is_unique: + pytest.skip("Not for MultiIndex or repeated indices") + + # test copy.intersection(subset) - need sort for unicode and string + first = index.copy().set_names(fname) + second = index[1:].set_names(sname) + intersect = first.intersection(second).sort_values() + expected = index[1:].set_names(expected_name).sort_values() + tm.assert_index_equal(intersect, expected) + def test_to_flat_index(self, index): # 22866 if isinstance(index, MultiIndex): diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 7000daeb9b575..d98530b5435e7 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -87,7 +87,7 @@ def test_info_verbose(): frame.info(verbose=True, buf=buf) res = buf.getvalue() - header = " # Column Dtype \n--- ------ ----- " + header = " # Column Dtype \n--- ------ ----- " assert header in res frame.info(verbose=True, buf=buf) @@ -101,6 +101,64 @@ def test_info_verbose(): assert line.startswith(line_nr) +@pytest.mark.parametrize( + "size, header_exp, separator_exp, first_line_exp, last_line_exp", + [ + ( + 4, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 3 3 3 non-null float64", + ), + ( + 11, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 10 10 3 non-null float64", + ), + ( + 101, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 100 100 3 non-null float64", + ), + ( + 1001, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 1000 1000 3 non-null float64", + ), + ( + 10001, + " # Column Non-Null Count Dtype ", + "--- 
------ -------------- ----- ", + " 0 0 3 non-null float64", + " 10000 10000 3 non-null float64", + ), + ], +) +def test_info_verbose_with_counts_spacing( + size, header_exp, separator_exp, first_line_exp, last_line_exp +): + """Test header column, spacer, first line and last line in verbose mode.""" + frame = DataFrame(np.random.randn(3, size)) + buf = StringIO() + frame.info(verbose=True, null_counts=True, buf=buf) + all_lines = buf.getvalue().splitlines() + # Here table would contain only header, separator and table lines + # dframe repr, index summary, memory usage and dtypes are excluded + table = all_lines[3:-2] + header, separator, first_line, *rest, last_line = table + assert header == header_exp + assert separator == separator_exp + assert first_line == first_line_exp + assert last_line == last_line_exp + + def test_info_memory(): # https://github.com/pandas-dev/pandas/issues/21056 df = DataFrame({"a": Series([1, 2], dtype="i8")}) diff --git a/pandas/tests/io/json/data/tsframe_iso_v012.json b/pandas/tests/io/json/data/tsframe_iso_v012.json index bd9ff885ad23a..5fa01d5cd902d 100644 --- a/pandas/tests/io/json/data/tsframe_iso_v012.json +++ b/pandas/tests/io/json/data/tsframe_iso_v012.json @@ -1 +1 @@ -{"A":{"2000-01-03T00:00:00":1.56808523,"2000-01-04T00:00:00":-0.2550111,"2000-01-05T00:00:00":1.51493992,"2000-01-06T00:00:00":-0.02765498,"2000-01-07T00:00:00":0.05951614},"B":{"2000-01-03T00:00:00":0.65727391,"2000-01-04T00:00:00":-0.08072427,"2000-01-05T00:00:00":0.11805825,"2000-01-06T00:00:00":0.44679743,"2000-01-07T00:00:00":-2.69652057},"C":{"2000-01-03T00:00:00":1.81021139,"2000-01-04T00:00:00":-0.03202878,"2000-01-05T00:00:00":1.629455,"2000-01-06T00:00:00":0.33192641,"2000-01-07T00:00:00":1.28163262},"D":{"2000-01-03T00:00:00":-0.17251653,"2000-01-04T00:00:00":-0.17581665,"2000-01-05T00:00:00":-1.31506612,"2000-01-06T00:00:00":-0.27885413,"2000-01-07T00:00:00":0.34703478},"date":{"2000-01-03T00:00:00":"1992-01-06T18:21:32.120000","2000-01-04T00:00:00":"1992-01-06T18:21:32.120000","2000-01-05T00:00:00":"1992-01-06T18:21:32.120000","2000-01-06T00:00:00":"2013-01-01T00:00:00","2000-01-07T00:00:00":"1992-01-06T18:21:32.120000"}} \ No newline at end of file +{"A":{"2000-01-03T00:00:00":1.56808523,"2000-01-04T00:00:00":-0.2550111,"2000-01-05T00:00:00":1.51493992,"2000-01-06T00:00:00":-0.02765498,"2000-01-07T00:00:00":0.05951614},"B":{"2000-01-03T00:00:00":0.65727391,"2000-01-04T00:00:00":-0.08072427,"2000-01-05T00:00:00":0.11805825,"2000-01-06T00:00:00":0.44679743,"2000-01-07T00:00:00":-2.69652057},"C":{"2000-01-03T00:00:00":1.81021139,"2000-01-04T00:00:00":-0.03202878,"2000-01-05T00:00:00":1.629455,"2000-01-06T00:00:00":0.33192641,"2000-01-07T00:00:00":1.28163262},"D":{"2000-01-03T00:00:00":-0.17251653,"2000-01-04T00:00:00":-0.17581665,"2000-01-05T00:00:00":-1.31506612,"2000-01-06T00:00:00":-0.27885413,"2000-01-07T00:00:00":0.34703478},"date":{"2000-01-03T00:00:00":"1992-01-06T18:21:32.120000","2000-01-04T00:00:00":"1992-01-06T18:21:32.120000","2000-01-05T00:00:00":"1992-01-06T18:21:32.120000","2000-01-06T00:00:00":"2013-01-01T00:00:00","2000-01-07T00:00:00":"1992-01-06T18:21:32.120000"}} diff --git a/pandas/tests/io/json/data/tsframe_v012.json b/pandas/tests/io/json/data/tsframe_v012.json index d4474c767855c..1d6a0a45c028e 100644 --- a/pandas/tests/io/json/data/tsframe_v012.json +++ b/pandas/tests/io/json/data/tsframe_v012.json @@ -1 +1 @@ 
-{"A":{"946857600000000000":1.56808523,"946944000000000000":-0.2550111,"947030400000000000":1.51493992,"947116800000000000":-0.02765498,"947203200000000000":0.05951614},"B":{"946857600000000000":0.65727391,"946944000000000000":-0.08072427,"947030400000000000":0.11805825,"947116800000000000":0.44679743,"947203200000000000":-2.69652057},"C":{"946857600000000000":1.81021139,"946944000000000000":-0.03202878,"947030400000000000":1.629455,"947116800000000000":0.33192641,"947203200000000000":1.28163262},"D":{"946857600000000000":-0.17251653,"946944000000000000":-0.17581665,"947030400000000000":-1.31506612,"947116800000000000":-0.27885413,"947203200000000000":0.34703478},"date":{"946857600000000000":694722092120000000,"946944000000000000":694722092120000000,"947030400000000000":694722092120000000,"947116800000000000":1356998400000000000,"947203200000000000":694722092120000000},"modified":{"946857600000000000":694722092120000000,"946944000000000000":null,"947030400000000000":694722092120000000,"947116800000000000":1356998400000000000,"947203200000000000":694722092120000000}} \ No newline at end of file +{"A":{"946857600000000000":1.56808523,"946944000000000000":-0.2550111,"947030400000000000":1.51493992,"947116800000000000":-0.02765498,"947203200000000000":0.05951614},"B":{"946857600000000000":0.65727391,"946944000000000000":-0.08072427,"947030400000000000":0.11805825,"947116800000000000":0.44679743,"947203200000000000":-2.69652057},"C":{"946857600000000000":1.81021139,"946944000000000000":-0.03202878,"947030400000000000":1.629455,"947116800000000000":0.33192641,"947203200000000000":1.28163262},"D":{"946857600000000000":-0.17251653,"946944000000000000":-0.17581665,"947030400000000000":-1.31506612,"947116800000000000":-0.27885413,"947203200000000000":0.34703478},"date":{"946857600000000000":694722092120000000,"946944000000000000":694722092120000000,"947030400000000000":694722092120000000,"947116800000000000":1356998400000000000,"947203200000000000":694722092120000000},"modified":{"946857600000000000":694722092120000000,"946944000000000000":null,"947030400000000000":694722092120000000,"947116800000000000":1356998400000000000,"947203200000000000":694722092120000000}} diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index c1938db12a0bc..1e1c9e91faa4b 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -49,6 +49,7 @@ HDFStore, PossibleDataLossError, Term, + _maybe_adjust_name, read_hdf, ) @@ -4921,3 +4922,10 @@ def test_unsuppored_hdf_file_error(self, datapath): with pytest.raises(ValueError, match=message): pd.read_hdf(data_path) + + +@pytest.mark.parametrize("bad_version", [(1, 2), (1,), [], "12", "123"]) +def test_maybe_adjust_name_bad_version_raises(bad_version): + msg = "Version is incorrect, expected sequence of 3 integers" + with pytest.raises(ValueError, match=msg): + _maybe_adjust_name("values_block_0", version=bad_version) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index f7b25f8c0eeac..9114edc19315f 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -125,6 +125,21 @@ def df_full(): ) +@pytest.fixture( + params=[ + datetime.datetime.now(datetime.timezone.utc), + datetime.datetime.now(datetime.timezone.min), + datetime.datetime.now(datetime.timezone.max), + datetime.datetime.strptime("2019-01-04T16:41:24+0200", "%Y-%m-%dT%H:%M:%S%z"), + datetime.datetime.strptime("2019-01-04T16:41:24+0215", "%Y-%m-%dT%H:%M:%S%z"), + 
datetime.datetime.strptime("2019-01-04T16:41:24-0200", "%Y-%m-%dT%H:%M:%S%z"), + datetime.datetime.strptime("2019-01-04T16:41:24-0215", "%Y-%m-%dT%H:%M:%S%z"), + ] +) +def timezone_aware_date_list(request): + return request.param + + def check_round_trip( df, engine=None, @@ -134,6 +149,7 @@ def check_round_trip( expected=None, check_names=True, check_like=False, + check_dtype=True, repeat=2, ): """Verify parquet serializer and deserializer produce the same results. @@ -175,7 +191,11 @@ def compare(repeat): actual = read_parquet(path, **read_kwargs) tm.assert_frame_equal( - expected, actual, check_names=check_names, check_like=check_like + expected, + actual, + check_names=check_names, + check_like=check_like, + check_dtype=check_dtype, ) if path is None: @@ -739,6 +759,21 @@ def test_timestamp_nanoseconds(self, pa): df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1n", periods=10)}) check_round_trip(df, pa, write_kwargs={"version": "2.0"}) + def test_timezone_aware_index(self, pa, timezone_aware_date_list): + idx = 5 * [timezone_aware_date_list] + df = pd.DataFrame(index=idx, data={"index_as_col": idx}) + + # see gh-36004 + # compare time(zone) values only, skip their class: + # pyarrow always creates fixed offset timezones using pytz.FixedOffset() + # even if it was datetime.timezone() originally + # + # technically they are the same: + # they both implement datetime.tzinfo + # they both wrap datetime.timedelta() + # this use-case sets the resolution to 1 minute + check_round_trip(df, pa, check_dtype=False) + @td.skip_if_no("pyarrow", min_version="0.17") def test_filter_row_groups(self, pa): # https://github.com/pandas-dev/pandas/issues/26551 @@ -877,3 +912,12 @@ def test_empty_dataframe(self, fp): expected = df.copy() expected.index.name = "index" check_round_trip(df, fp, expected=expected) + + def test_timezone_aware_index(self, fp, timezone_aware_date_list): + idx = 5 * [timezone_aware_date_list] + + df = pd.DataFrame(index=idx, data={"index_as_col": idx}) + + expected = df.copy() + expected.index.name = "index" + check_round_trip(df, fp, expected=expected) diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 59dbcb9ab9fa0..fd2746672a0eb 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -96,7 +96,7 @@ def _check_stat_op( string_series_[5:15] = np.NaN # mean, idxmax, idxmin, min, and max are valid for dates - if name not in ["max", "min", "mean"]: + if name not in ["max", "min", "mean", "median"]: ds = Series(pd.date_range("1/1/2001", periods=10)) with pytest.raises(TypeError): f(ds) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index b0f6a8ef0c517..f0eb745041a66 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1300,8 +1300,8 @@ def test_concat_ignore_index(self, sort): "name_in1,name_in2,name_in3,name_out", [ ("idx", "idx", "idx", "idx"), - ("idx", "idx", None, "idx"), - ("idx", None, None, "idx"), + ("idx", "idx", None, None), + ("idx", None, None, None), ("idx1", "idx2", None, None), ("idx1", "idx1", "idx2", None), ("idx1", "idx2", "idx3", None), diff --git a/pandas/tests/series/apply/test_series_transform.py b/pandas/tests/series/apply/test_series_transform.py index 0e200709f60cf..67b271f757cfb 100644 --- a/pandas/tests/series/apply/test_series_transform.py +++ b/pandas/tests/series/apply/test_series_transform.py @@ -34,9 +34,15 @@ def 
test_transform_groupby_kernel(string_series, op): @pytest.mark.parametrize( - "ops, names", [([np.sqrt], ["sqrt"]), ([np.abs, np.sqrt], ["absolute", "sqrt"])] + "ops, names", + [ + ([np.sqrt], ["sqrt"]), + ([np.abs, np.sqrt], ["absolute", "sqrt"]), + (np.array([np.sqrt]), ["sqrt"]), + (np.array([np.abs, np.sqrt]), ["absolute", "sqrt"]), + ], ) -def test_transform_list(string_series, ops, names): +def test_transform_listlike(string_series, ops, names): # GH 35964 with np.errstate(all="ignore"): expected = concat([op(string_series) for op in ops], axis=1) @@ -45,15 +51,38 @@ def test_transform_list(string_series, ops, names): tm.assert_frame_equal(result, expected) -def test_transform_dict(string_series): +@pytest.mark.parametrize("ops", [[], np.array([])]) +def test_transform_empty_listlike(string_series, ops): + with pytest.raises(ValueError, match="No transform functions were provided"): + string_series.transform(ops) + + +@pytest.mark.parametrize("box", [dict, Series]) +def test_transform_dictlike(string_series, box): # GH 35964 with np.errstate(all="ignore"): expected = concat([np.sqrt(string_series), np.abs(string_series)], axis=1) expected.columns = ["foo", "bar"] - result = string_series.transform({"foo": np.sqrt, "bar": np.abs}) + result = string_series.transform(box({"foo": np.sqrt, "bar": np.abs})) tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "ops", + [ + {}, + {"A": []}, + {"A": [], "B": ["cumsum"]}, + {"A": ["cumsum"], "B": []}, + {"A": [], "B": "cumsum"}, + {"A": "cumsum", "B": []}, + ], +) +def test_transform_empty_dictlike(string_series, ops): + with pytest.raises(ValueError, match="No transform functions were provided"): + string_series.transform(ops) + + def test_transform_udf(axis, string_series): # GH 35964 # via apply diff --git a/pandas/tests/series/indexing/test_boolean.py b/pandas/tests/series/indexing/test_boolean.py index 28bebd764a866..e2b71b1f2f412 100644 --- a/pandas/tests/series/indexing/test_boolean.py +++ b/pandas/tests/series/indexing/test_boolean.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas import Index, Series, date_range +from pandas import Index, Series import pandas._testing as tm from pandas.core.indexing import IndexingError @@ -128,14 +128,3 @@ def test_get_set_boolean_different_order(string_series): sel = string_series[ordered > 0] exp = string_series[string_series > 0] tm.assert_series_equal(sel, exp) - - -def test_getitem_boolean_dt64_copies(): - # GH#36210 - dti = date_range("2016-01-01", periods=4, tz="US/Pacific") - key = np.array([True, True, False, False]) - - ser = Series(dti._data) - - res = ser[key] - assert res._values._data.base is None diff --git a/pandas/tests/series/indexing/test_multiindex.py b/pandas/tests/series/indexing/test_multiindex.py index e98a32d62b767..0420d76b5e8a8 100644 --- a/pandas/tests/series/indexing/test_multiindex.py +++ b/pandas/tests/series/indexing/test_multiindex.py @@ -1,8 +1,10 @@ """ test get/set & misc """ +import pytest import pandas as pd from pandas import MultiIndex, Series +import pandas._testing as tm def test_access_none_value_in_multiindex(): @@ -20,3 +22,30 @@ def test_access_none_value_in_multiindex(): s = Series([1] * len(midx), dtype=object, index=midx) result = s.loc[("Level1", "Level2_a")] assert result == 1 + + +@pytest.mark.parametrize( + "ix_data, exp_data", + [ + ( + [(pd.NaT, 1), (pd.NaT, 2)], + {"a": [pd.NaT, pd.NaT], "b": [1, 2], "x": [11, 12]}, + ), + ( + [(pd.NaT, 1), (pd.Timestamp("2020-01-01"), 2)], + {"a": [pd.NaT, 
pd.Timestamp("2020-01-01")], "b": [1, 2], "x": [11, 12]}, + ), + ( + [(pd.NaT, 1), (pd.Timedelta(123, "d"), 2)], + {"a": [pd.NaT, pd.Timedelta(123, "d")], "b": [1, 2], "x": [11, 12]}, + ), + ], +) +def test_nat_multi_index(ix_data, exp_data): + # GH36541: that reset_index() does not raise ValueError + ix = pd.MultiIndex.from_tuples(ix_data, names=["a", "b"]) + result = pd.DataFrame({"x": [11, 12]}, index=ix) + result = result.reset_index() + + expected = pd.DataFrame(exp_data) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index f30246ff12fac..09181201beee4 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -699,3 +699,45 @@ def test_datetime_understood(self): result = series - offset expected = pd.Series(pd.to_datetime(["2011-12-26", "2011-12-27", "2011-12-28"])) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "names", + [ + ("foo", None, None), + ("Egon", "Venkman", None), + ("NCC1701D", "NCC1701D", "NCC1701D"), + ], +) +@pytest.mark.parametrize("box", [list, tuple, np.array, pd.Index, pd.Series, pd.array]) +@pytest.mark.parametrize("flex", [True, False]) +def test_series_ops_name_retention(flex, box, names, all_binary_operators): + # GH#33930 consistent name renteiton + op = all_binary_operators + + if op is ops.rfloordiv and box in [list, tuple]: + pytest.xfail("op fails because of inconsistent ndarray-wrapping GH#28759") + + left = pd.Series(range(10), name=names[0]) + right = pd.Series(range(10), name=names[1]) + + right = box(right) + if flex: + name = op.__name__.strip("_") + if name in ["and", "rand", "xor", "rxor", "or", "ror"]: + # Series doesn't have these as flex methods + return + result = getattr(left, name)(right) + else: + result = op(left, right) + + if box is pd.Index and op.__name__.strip("_") in ["rxor", "ror", "rand"]: + # Index treats these as set operators, so does not defer + assert isinstance(result, pd.Index) + return + + assert isinstance(result, Series) + if box in [pd.Index, pd.Series]: + assert result.name == names[2] + else: + assert result.name == names[0] diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py index 81b41f567976d..e49f511fe3cc4 100644 --- a/pandas/tests/tslibs/test_timezones.py +++ b/pandas/tests/tslibs/test_timezones.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timedelta, timezone import dateutil.tz import pytest @@ -118,3 +118,25 @@ def test_maybe_get_tz_invalid_types(): msg = "" with pytest.raises(TypeError, match=msg): timezones.maybe_get_tz(Timestamp.now("UTC")) + + +def test_maybe_get_tz_offset_only(): + # see gh-36004 + + # timezone.utc + tz = timezones.maybe_get_tz(timezone.utc) + assert tz == timezone(timedelta(hours=0, minutes=0)) + + # without UTC+- prefix + tz = timezones.maybe_get_tz("+01:15") + assert tz == timezone(timedelta(hours=1, minutes=15)) + + tz = timezones.maybe_get_tz("-01:15") + assert tz == timezone(-timedelta(hours=1, minutes=15)) + + # with UTC+- prefix + tz = timezones.maybe_get_tz("UTC+02:45") + assert tz == timezone(timedelta(hours=2, minutes=45)) + + tz = timezones.maybe_get_tz("UTC-02:45") + assert tz == timezone(-timedelta(hours=2, minutes=45)) diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 7cfac7c6a752a..f0e8b39464a9f 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -428,3 +428,32 @@ 
         result = expected.groupby(["s1", "s2"]).rolling(window=1).sum()
         expected.index = pd.MultiIndex.from_tuples([], names=["s1", "s2", None])
         tm.assert_frame_equal(result, expected)
+
+    def test_groupby_rolling_string_index(self):
+        # GH: 36727
+        df = pd.DataFrame(
+            [
+                ["A", "group_1", pd.Timestamp(2019, 1, 1, 9)],
+                ["B", "group_1", pd.Timestamp(2019, 1, 2, 9)],
+                ["Z", "group_2", pd.Timestamp(2019, 1, 3, 9)],
+                ["H", "group_1", pd.Timestamp(2019, 1, 6, 9)],
+                ["E", "group_2", pd.Timestamp(2019, 1, 20, 9)],
+            ],
+            columns=["index", "group", "eventTime"],
+        ).set_index("index")
+
+        groups = df.groupby("group")
+        df["count_to_date"] = groups.cumcount()
+        rolling_groups = groups.rolling("10d", on="eventTime")
+        result = rolling_groups.apply(lambda df: df.shape[0])
+        expected = pd.DataFrame(
+            [
+                ["A", "group_1", pd.Timestamp(2019, 1, 1, 9), 1.0],
+                ["B", "group_1", pd.Timestamp(2019, 1, 2, 9), 2.0],
+                ["H", "group_1", pd.Timestamp(2019, 1, 6, 9), 3.0],
+                ["Z", "group_2", pd.Timestamp(2019, 1, 3, 9), 1.0],
+                ["E", "group_2", pd.Timestamp(2019, 1, 20, 9), 1.0],
+            ],
+            columns=["index", "group", "eventTime", "count_to_date"],
+        ).set_index(["group", "index"])
+        tm.assert_frame_equal(result, expected)
diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py
index c417f58f6bf1b..c6d00eb58a969 100755
--- a/scripts/generate_pip_deps_from_conda.py
+++ b/scripts/generate_pip_deps_from_conda.py
@@ -94,7 +94,7 @@ def main(conda_fname, pip_fname, compare=False):
         f"# This file is auto-generated from {fname}, do not modify.\n"
         "# See that file for comments about the need/usage of each dependency.\n\n"
     )
-    pip_content = header + "\n".join(pip_deps)
+    pip_content = header + "\n".join(pip_deps) + "\n"
 
     if compare:
         with open(pip_fname) as pip_fd:
diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py
index c5f3701cc3c3f..b8839c83d00b9 100755
--- a/scripts/validate_rst_title_capitalization.py
+++ b/scripts/validate_rst_title_capitalization.py
@@ -30,6 +30,7 @@
     "BigQuery",
     "STATA",
     "Interval",
+    "IntervalArray",
     "PEP8",
     "Period",
     "Series",
@@ -141,6 +142,13 @@
     "False",
     "Styler",
     "os",
+    "UTC",
+    "str",
+    "msgpack",
+    "ExtensionArray",
+    "LZMA",
+    "Numba",
+    "Timestamp",
 }
 
 CAP_EXCEPTIONS_DICT = {word.lower(): word for word in CAPITALIZATION_EXCEPTIONS}

From 5b80f2460d835fa20585aa847c5330df0ef284f2 Mon Sep 17 00:00:00 2001
From: Brock
Date: Tue, 13 Oct 2020 11:35:27 -0700
Subject: [PATCH 37/38] CLN: remove unnecessary validate_ methods

---
 pandas/core/arrays/_mixins.py      | 12 +++---------
 pandas/core/arrays/datetimelike.py |  4 ++--
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
index 95a003efbe1d0..948ffdc1f7c01 100644
--- a/pandas/core/arrays/_mixins.py
+++ b/pandas/core/arrays/_mixins.py
@@ -180,13 +180,10 @@ def _validate_shift_value(self, fill_value):
         return self._validate_fill_value(fill_value)
 
     def __setitem__(self, key, value):
-        key = self._validate_setitem_key(key)
+        key = check_array_indexer(self, key)
         value = self._validate_setitem_value(value)
         self._ndarray[key] = value
 
-    def _validate_setitem_key(self, key):
-        return check_array_indexer(self, key)
-
     def _validate_setitem_value(self, value):
         return value
 
@@ -198,7 +195,8 @@ def __getitem__(self, key):
             return self._box_func(result)
         return self._from_backing_data(result)
 
-        key = self._validate_getitem_key(key)
+        key = extract_array(key, 
extract_numpy=True) + key = check_array_indexer(self, key) result = self._ndarray[key] if lib.is_scalar(result): return self._box_func(result) @@ -206,10 +204,6 @@ def __getitem__(self, key): result = self._from_backing_data(result) return result - def _validate_getitem_key(self, key): - key = extract_array(key, extract_numpy=True) - return check_array_indexer(self, key) - @doc(ExtensionArray.fillna) def fillna(self: _T, value=None, method=None, limit=None) -> _T: value, method = validate_fillna_kwargs(value, method) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index eb5e5b03fe243..f1455aa864197 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -65,7 +65,7 @@ from pandas.core.arrays._mixins import NDArrayBackedExtensionArray import pandas.core.common as com from pandas.core.construction import array, extract_array -from pandas.core.indexers import check_setitem_lengths +from pandas.core.indexers import check_array_indexer, check_setitem_lengths from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import invalid_comparison, make_invalid_op @@ -294,7 +294,7 @@ def _get_getitem_freq(self, key): elif self.ndim != 1: freq = None else: - key = self._validate_getitem_key(key) # maybe ndarray[bool] -> slice + key = check_array_indexer(self, key) # maybe ndarray[bool] -> slice freq = None if isinstance(key, slice): if self.freq is not None and key.step is not None: From 80561bcb038454779a7d7afe2c3f3ac6ea4fcd6f Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 13 Oct 2020 15:54:48 -0700 Subject: [PATCH 38/38] dummy commit to force CI
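
A closing usage sketch, assuming a pandas build with this branch applied (an
illustration of the behavior the patches above target, not an excerpt from
them): slice-like selections on datetime-like indexes keep the freq, whether
they arrive as a contiguous boolean mask, a slice, or slice-like take indices.

import numpy as np
import pandas as pd

dti = pd.date_range("2016-01-01", periods=5, freq="D")

# A contiguous boolean mask is reduced to a slice internally
# (lib.maybe_booleans_to_slice), so the result keeps freq="D".
assert dti[np.array([True, True, True, False, False])].freq == dti.freq

# Monotonic step-1 indices are likewise recognized as a slice
# (lib.maybe_indices_to_slice), so take() restores the freq too.
assert dti.take([1, 2, 3]).freq == dti.freq

# A non-contiguous selection cannot carry a freq.
assert dti[np.array([True, False, True, False, True])].freq is None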