STY: Fix doctest and docstring formatting errors (#56408)

mroeschke · web-flow · commit 04307e717d3d · 2023-12-08T17:43:34.000-08:00
* STY: Fix doctest and docstring formatting errors

* Ensure stderr is output too

* Fix more failures

* Don't add redirects for single page, fix example

* Fix a few more failures

* Remove `-e` flag from `set` options in ci/code_checks.sh
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -14,6 +14,8 @@
 #   $ ./ci/code_checks.sh single-docs   # check single-page docs build warning-free
 #   $ ./ci/code_checks.sh notebooks     # check execution of documentation notebooks
 
+set -uo pipefail
+
 [[ -z "$1" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" || "$1" == "notebooks" ]] || \
     { echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; }
 
diff --git a/doc/make.py b/doc/make.py
@@ -236,8 +236,9 @@ def html(self):
             os.remove(zip_fname)
 
         if ret_code == 0:
-            if self.single_doc_html is not None and not self.no_browser:
-                self._open_browser(self.single_doc_html)
+            if self.single_doc_html is not None:
+                if not self.no_browser:
+                    self._open_browser(self.single_doc_html)
             else:
                 self._add_redirects()
                 if self.whatsnew and not self.no_browser:
diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
@@ -270,7 +270,7 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
         Examples
         --------
         >>> import scipy.sparse
-        >>> mat = scipy.sparse.eye(3)
+        >>> mat = scipy.sparse.eye(3, dtype=float)
         >>> pd.DataFrame.sparse.from_spmatrix(mat)
              0    1    2
         0  1.0  0.0  0.0
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3955,7 +3955,7 @@ def to_csv(
         >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'],
         ...                    'mask': ['red', 'purple'],
         ...                    'weapon': ['sai', 'bo staff']}})
-        >>> df.to_csv('out.csv', index=False) # doctest: +SKIP
+        >>> df.to_csv('out.csv', index=False)  # doctest: +SKIP
 
         Create 'out.zip' containing 'out.csv'
 
@@ -8972,7 +8972,7 @@ def clip(
 
         Clips using specific lower and upper thresholds per column:
 
-        >>> df.clip([-2, -1], [4,5])
+        >>> df.clip([-2, -1], [4, 5])
             col_0  col_1
         0      4     -1
         1     -2     -1
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -470,10 +470,9 @@ def _aggregate_named(self, func, *args, **kwargs):
 
     __examples_series_doc = dedent(
         """
-    >>> ser = pd.Series(
-    ...    [390.0, 350.0, 30.0, 20.0],
-    ...    index=["Falcon", "Falcon", "Parrot", "Parrot"],
-    ...    name="Max Speed")
+    >>> ser = pd.Series([390.0, 350.0, 30.0, 20.0],
+    ...                 index=["Falcon", "Falcon", "Parrot", "Parrot"],
+    ...                 name="Max Speed")
     >>> grouped = ser.groupby([1, 1, 2, 2])
     >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
         Falcon    0.707107
@@ -1331,14 +1330,10 @@ class DataFrameGroupBy(GroupBy[DataFrame]):
         """
     Examples
     --------
-    >>> df = pd.DataFrame(
-    ...     {
-    ...         "A": [1, 1, 2, 2],
+    >>> data = {"A": [1, 1, 2, 2],
     ...         "B": [1, 2, 3, 4],
-    ...         "C": [0.362838, 0.227877, 1.267767, -0.562860],
-    ...     }
-    ... )
-
+    ...         "C": [0.362838, 0.227877, 1.267767, -0.562860]}
+    >>> df = pd.DataFrame(data)
     >>> df
        A  B         C
     0  1  1  0.362838
@@ -1393,7 +1388,8 @@ class DataFrameGroupBy(GroupBy[DataFrame]):
 
     >>> df.groupby("A").agg(
     ...     b_min=pd.NamedAgg(column="B", aggfunc="min"),
-    ...     c_sum=pd.NamedAgg(column="C", aggfunc="sum"))
+    ...     c_sum=pd.NamedAgg(column="C", aggfunc="sum")
+    ... )
        b_min     c_sum
     A
     1      1  0.590715
@@ -2154,7 +2150,7 @@ def idxmax(
 
         >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
         ...                    'co2_emissions': [37.2, 19.66, 1712]},
-        ...                    index=['Pork', 'Wheat Products', 'Beef'])
+        ...                   index=['Pork', 'Wheat Products', 'Beef'])
 
         >>> df
                         consumption  co2_emissions
@@ -2236,7 +2232,7 @@ def idxmin(
 
         >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
         ...                    'co2_emissions': [37.2, 19.66, 1712]},
-        ...                    index=['Pork', 'Wheat Products', 'Beef'])
+        ...                   index=['Pork', 'Wheat Products', 'Beef'])
 
         >>> df
                         consumption  co2_emissions
@@ -2319,9 +2315,9 @@ def value_counts(
         Examples
         --------
         >>> df = pd.DataFrame({
-        ...    'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
-        ...    'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
-        ...    'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
+        ...     'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
+        ...     'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
+        ...     'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
         ... })
 
         >>> df
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -232,8 +232,8 @@ class providing the base-class of operations.
     """,
     "dataframe_examples": """
     >>> df = pd.DataFrame({'A': 'a a b'.split(),
-    ...                    'B': [1,2,3],
-    ...                    'C': [4,6,5]})
+    ...                    'B': [1, 2, 3],
+    ...                    'C': [4, 6, 5]})
     >>> g1 = df.groupby('A', group_keys=False)
     >>> g2 = df.groupby('A', group_keys=True)
 
@@ -313,7 +313,7 @@ class providing the base-class of operations.
 
         The resulting dtype will reflect the return value of the passed ``func``.
 
-    >>> g1.apply(lambda x: x*2 if x.name == 'a' else x/2)
+    >>> g1.apply(lambda x: x * 2 if x.name == 'a' else x / 2)
     a    0.0
     a    2.0
     b    1.0
@@ -322,7 +322,7 @@ class providing the base-class of operations.
     In the above, the groups are not part of the index. We can have them included
     by using ``g2`` where ``group_keys=True``:
 
-    >>> g2.apply(lambda x: x*2 if x.name == 'a' else x/2)
+    >>> g2.apply(lambda x: x * 2 if x.name == 'a' else x / 2)
     a  a    0.0
        a    2.0
     b  b    1.0
@@ -421,14 +421,18 @@ class providing the base-class of operations.
 functions that expect Series, DataFrames, GroupBy or Resampler objects.
 Instead of writing
 
->>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c)  # doctest: +SKIP
+>>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3
+>>> g = lambda x, arg1: x * 5 / arg1
+>>> f = lambda x: x ** 4
+>>> df = pd.DataFrame([["a", 4], ["b", 5]], columns=["group", "value"])
+>>> h(g(f(df.groupby('group')), arg1=1), arg2=2, arg3=3)  # doctest: +SKIP
 
 You can write
 
 >>> (df.groupby('group')
 ...    .pipe(f)
-...    .pipe(g, arg1=a)
-...    .pipe(h, arg2=b, arg3=c))  # doctest: +SKIP
+...    .pipe(g, arg1=1)
+...    .pipe(h, arg2=2, arg3=3))  # doctest: +SKIP
 
 which is much more readable.
 
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -862,7 +862,8 @@ def levels(self) -> FrozenList:
         Examples
         --------
         >>> index = pd.MultiIndex.from_product([['mammal'],
-        ... ('goat', 'human', 'cat', 'dog')], names=['Category', 'Animals'])
+        ...                                     ('goat', 'human', 'cat', 'dog')],
+        ...                                    names=['Category', 'Animals'])
         >>> leg_num = pd.DataFrame(data=(4, 2, 4, 4), index=index, columns=['Legs'])
         >>> leg_num
                           Legs
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
@@ -859,7 +859,7 @@ def fillna(self, method, limit: int | None = None):
         Missing values present before the upsampling are not affected.
 
         >>> sm = pd.Series([1, None, 3],
-        ...               index=pd.date_range('20180101', periods=3, freq='h'))
+        ...                index=pd.date_range('20180101', periods=3, freq='h'))
         >>> sm
         2018-01-01 00:00:00    1.0
         2018-01-01 01:00:00    NaN
@@ -1028,21 +1028,16 @@ def interpolate(
         Examples
         --------
 
-        >>> import datetime as dt
-        >>> timesteps = [
-        ...    dt.datetime(2023, 3, 1, 7, 0, 0),
-        ...    dt.datetime(2023, 3, 1, 7, 0, 1),
-        ...    dt.datetime(2023, 3, 1, 7, 0, 2),
-        ...    dt.datetime(2023, 3, 1, 7, 0, 3),
-        ...    dt.datetime(2023, 3, 1, 7, 0, 4)]
+        >>> start = "2023-03-01T07:00:00"
+        >>> timesteps = pd.date_range(start, periods=5, freq="s")
         >>> series = pd.Series(data=[1, -1, 2, 1, 3], index=timesteps)
         >>> series
         2023-03-01 07:00:00    1
         2023-03-01 07:00:01   -1
         2023-03-01 07:00:02    2
         2023-03-01 07:00:03    1
         2023-03-01 07:00:04    3
-        dtype: int64
+        Freq: s, dtype: int64
 
         Upsample the dataframe to 0.5Hz by providing the period time of 2s.
 
diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py
@@ -797,7 +797,7 @@
     ...                    'B': ['a', 'b', 'c', 'd', 'e'],
     ...                    'C': ['f', 'g', 'h', 'i', 'j']}})
 
-    >>> df.replace(to_replace='^[a-g]', value = 'e', regex=True)
+    >>> df.replace(to_replace='^[a-g]', value='e', regex=True)
         A  B  C
     0  0  e  e
     1  1  e  e
@@ -808,7 +808,7 @@
     If ``value`` is not ``None`` and `to_replace` is a dictionary, the dictionary
     keys will be the DataFrame columns that the replacement will be applied.
 
-    >>> df.replace(to_replace={{'B': '^[a-c]', 'C': '^[h-j]'}}, value = 'e', regex=True)
+    >>> df.replace(to_replace={{'B': '^[a-c]', 'C': '^[h-j]'}}, value='e', regex=True)
         A  B  C
     0  0  e  f
     1  1  e  g
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
@@ -2439,14 +2439,14 @@ def var(
         create_section_header("Examples"),
         dedent(
             """\
-        >>> ser = pd.Series([1, 5, 2, 7, 12, 6])
+        >>> ser = pd.Series([1, 5, 2, 7, 15, 6])
         >>> ser.rolling(3).skew().round(6)
         0         NaN
         1         NaN
         2    1.293343
         3   -0.585583
-        4    0.000000
-        5    1.545393
+        4    0.670284
+        5    1.652317
         dtype: float64
         """
         ),
@@ -2794,12 +2794,12 @@ def cov(
 
         >>> v1 = [3, 3, 3, 5, 8]
         >>> v2 = [3, 4, 4, 4, 8]
-        >>> # numpy returns a 2X2 array, the correlation coefficient
-        >>> # is the number at entry [0][1]
-        >>> print(f"{{np.corrcoef(v1[:-1], v2[:-1])[0][1]:.6f}}")
-        0.333333
-        >>> print(f"{{np.corrcoef(v1[1:], v2[1:])[0][1]:.6f}}")
-        0.916949
+        >>> np.corrcoef(v1[:-1], v2[:-1])
+        array([[1.        , 0.33333333],
+               [0.33333333, 1.        ]])
+        >>> np.corrcoef(v1[1:], v2[1:])
+        array([[1.       , 0.9169493],
+               [0.9169493, 1.       ]])
         >>> s1 = pd.Series(v1)
         >>> s2 = pd.Series(v2)
         >>> s1.rolling(4).corr(s2)
@@ -2813,15 +2813,18 @@ def cov(
         The below example shows a similar rolling calculation on a
         DataFrame using the pairwise option.
 
-        >>> matrix = np.array([[51., 35.], [49., 30.], [47., 32.],\
-        [46., 31.], [50., 36.]])
-        >>> print(np.corrcoef(matrix[:-1,0], matrix[:-1,1]).round(7))
-        [[1.         0.6263001]
-         [0.6263001  1.       ]]
-        >>> print(np.corrcoef(matrix[1:,0], matrix[1:,1]).round(7))
-        [[1.         0.5553681]
-         [0.5553681  1.        ]]
-        >>> df = pd.DataFrame(matrix, columns=['X','Y'])
+        >>> matrix = np.array([[51., 35.],
+        ...                    [49., 30.],
+        ...                    [47., 32.],
+        ...                    [46., 31.],
+        ...                    [50., 36.]])
+        >>> np.corrcoef(matrix[:-1, 0], matrix[:-1, 1])
+        array([[1.       , 0.6263001],
+               [0.6263001, 1.       ]])
+        >>> np.corrcoef(matrix[1:, 0], matrix[1:, 1])
+        array([[1.        , 0.55536811],
+               [0.55536811, 1.        ]])
+        >>> df = pd.DataFrame(matrix, columns=['X', 'Y'])
         >>> df
               X     Y
         0  51.0  35.0
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
@@ -680,7 +680,7 @@ def read_sql(
 
        pandas now supports reading via ADBC drivers
 
-    >>> from adbc_driver_postgresql import dbapi
+    >>> from adbc_driver_postgresql import dbapi  # doctest:+SKIP
     >>> with dbapi.connect('postgres:///db_name') as conn:  # doctest:+SKIP
     ...     pd.read_sql('SELECT int_column FROM test_data', conn)
        int_column
diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
@@ -241,10 +241,10 @@ def hist_frame(
     .. plot::
         :context: close-figs
 
-        >>> df = pd.DataFrame({
-        ...     'length': [1.5, 0.5, 1.2, 0.9, 3],
-        ...     'width': [0.7, 0.2, 0.15, 0.2, 1.1]
-        ...     }, index=['pig', 'rabbit', 'duck', 'chicken', 'horse'])
+        >>> data = {'length': [1.5, 0.5, 1.2, 0.9, 3],
+        ...         'width': [0.7, 0.2, 0.15, 0.2, 1.1]}
+        >>> index = ['pig', 'rabbit', 'duck', 'chicken', 'horse']
+        >>> df = pd.DataFrame(data, index=index)
         >>> hist = df.hist(bins=3)
     """
     plot_backend = _get_plot_backend(backend)
@@ -607,10 +607,10 @@ def boxplot_frame_groupby(
         >>> import itertools
         >>> tuples = [t for t in itertools.product(range(1000), range(4))]
         >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1'])
-        >>> data = np.random.randn(len(index),4)
+        >>> data = np.random.randn(len(index), 4)
         >>> df = pd.DataFrame(data, columns=list('ABCD'), index=index)
         >>> grouped = df.groupby(level='lvl1')
-        >>> grouped.boxplot(rot=45, fontsize=12, figsize=(8,10))  # doctest: +SKIP
+        >>> grouped.boxplot(rot=45, fontsize=12, figsize=(8, 10))  # doctest: +SKIP
 
     The ``subplots=False`` option shows the boxplots in a single figure.
 
@@ -1400,9 +1400,7 @@ def hist(
         .. plot::
             :context: close-figs
 
-            >>> df = pd.DataFrame(
-            ...     np.random.randint(1, 7, 6000),
-            ...     columns = ['one'])
+            >>> df = pd.DataFrame(np.random.randint(1, 7, 6000), columns=['one'])
             >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000)
             >>> ax = df.plot.hist(bins=12, alpha=0.5)
 
diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py
@@ -439,7 +439,7 @@ def bootstrap_plot(
         :context: close-figs
 
         >>> s = pd.Series(np.random.uniform(size=100))
-        >>> pd.plotting.bootstrap_plot(s)
+        >>> pd.plotting.bootstrap_plot(s)  # doctest: +SKIP
         <Figure size 640x480 with 6 Axes>
     """
     plot_backend = _get_plot_backend("matplotlib")
diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py
@@ -228,11 +228,12 @@ def validate_pep8(self):
                 file.name,
             ]
             response = subprocess.run(cmd, capture_output=True, check=False, text=True)
-            stdout = response.stdout
-            stdout = stdout.replace(file.name, "")
-            messages = stdout.strip("\n").splitlines()
-            if messages:
-                error_messages.extend(messages)
+            for output in ("stdout", "stderr"):
+                out = getattr(response, output)
+                out = out.replace(file.name, "")
+                messages = out.strip("\n").splitlines()
+                if messages:
+                    error_messages.extend(messages)
         finally:
             file.close()
             os.unlink(file.name)