Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 2 additions & 117 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@

from pandas.io.common import get_handle
from pandas.io.formats import console, format as fmt
from pandas.io.formats.info import BaseInfo, DataFrameInfo
from pandas.io.formats.info import DataFrameInfo
import pandas.plotting

if TYPE_CHECKING:
Expand Down Expand Up @@ -2596,122 +2596,7 @@ def to_html(
)

# ----------------------------------------------------------------------
@Substitution(
klass="DataFrame",
type_sub=" and columns",
max_cols_sub=dedent(
"""\
max_cols : int, optional
When to switch from the verbose to the truncated output. If the
DataFrame has more than `max_cols` columns, the truncated output
is used. By default, the setting in
``pandas.options.display.max_info_columns`` is used."""
),
show_counts_sub=dedent(
"""\
show_counts : bool, optional
Whether to show the non-null counts. By default, this is shown
only if the DataFrame is smaller than
``pandas.options.display.max_info_rows`` and
``pandas.options.display.max_info_columns``. A value of True always
shows the counts, and False never shows the counts.
null_counts : bool, optional
.. deprecated:: 1.2.0
Use show_counts instead."""
),
examples_sub=dedent(
"""\
>>> int_values = [1, 2, 3, 4, 5]
>>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
>>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
>>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
... "float_col": float_values})
>>> df
int_col text_col float_col
0 1 alpha 0.00
1 2 beta 0.25
2 3 gamma 0.50
3 4 delta 0.75
4 5 epsilon 1.00

Prints information of all columns:

>>> df.info(verbose=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 int_col 5 non-null int64
1 text_col 5 non-null object
2 float_col 5 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 248.0+ bytes

Prints a summary of columns count and its dtypes but not per column
information:

>>> df.info(verbose=False)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Columns: 3 entries, int_col to float_col
dtypes: float64(1), int64(1), object(1)
memory usage: 248.0+ bytes

Pipe output of DataFrame.info to buffer instead of sys.stdout, get
buffer content and writes to a text file:

>>> import io
>>> buffer = io.StringIO()
>>> df.info(buf=buffer)
>>> s = buffer.getvalue()
>>> with open("df_info.txt", "w",
... encoding="utf-8") as f: # doctest: +SKIP
... f.write(s)
260

The `memory_usage` parameter allows deep introspection mode, specially
useful for big DataFrames and fine-tune memory optimization:

>>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
>>> df = pd.DataFrame({
... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
... })
>>> df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 column_1 1000000 non-null object
1 column_2 1000000 non-null object
2 column_3 1000000 non-null object
dtypes: object(3)
memory usage: 22.9+ MB

>>> df.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 column_1 1000000 non-null object
1 column_2 1000000 non-null object
2 column_3 1000000 non-null object
dtypes: object(3)
memory usage: 165.9 MB"""
),
see_also_sub=dedent(
"""\
DataFrame.describe: Generate descriptive statistics of DataFrame
columns.
DataFrame.memory_usage: Memory usage of DataFrame columns."""
),
version_added_sub="",
)
@doc(BaseInfo.render)
@doc(DataFrameInfo.render)
def info(
self,
verbose: Optional[bool] = None,
Expand Down
154 changes: 143 additions & 11 deletions pandas/io/formats/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from abc import ABC, abstractmethod
import sys
from textwrap import dedent
from typing import (
IO,
TYPE_CHECKING,
Expand All @@ -17,6 +18,7 @@
from pandas._config import get_option

from pandas._typing import Dtype, FrameOrSeriesUnion
from pandas.util._decorators import doc

from pandas.core.indexes.api import Index

Expand All @@ -27,6 +29,132 @@
from pandas.core.frame import DataFrame


frame_max_cols_sub = dedent(
"""\
max_cols : int, optional
When to switch from the verbose to the truncated output. If the
DataFrame has more than `max_cols` columns, the truncated output
is used. By default, the setting in
``pandas.options.display.max_info_columns`` is used."""
)


frame_null_counts_sub = dedent(
"""\
null_counts : bool, optional
Whether to show the non-null counts. By default, this is shown
only if the DataFrame is smaller than
``pandas.options.display.max_info_rows`` and
``pandas.options.display.max_info_columns``. A value of True always
shows the counts, and False never shows the counts."""
)


frame_examples_sub = dedent(
"""\
>>> int_values = [1, 2, 3, 4, 5]
>>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
>>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
>>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
... "float_col": float_values})
>>> df
int_col text_col float_col
0 1 alpha 0.00
1 2 beta 0.25
2 3 gamma 0.50
3 4 delta 0.75
4 5 epsilon 1.00

Prints information of all columns:

>>> df.info(verbose=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 int_col 5 non-null int64
1 text_col 5 non-null object
2 float_col 5 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 248.0+ bytes

Prints a summary of columns count and its dtypes but not per column
information:

>>> df.info(verbose=False)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Columns: 3 entries, int_col to float_col
dtypes: float64(1), int64(1), object(1)
memory usage: 248.0+ bytes

Pipe output of DataFrame.info to buffer instead of sys.stdout, get
buffer content and writes to a text file:

>>> import io
>>> buffer = io.StringIO()
>>> df.info(buf=buffer)
>>> s = buffer.getvalue()
>>> with open("df_info.txt", "w",
... encoding="utf-8") as f: # doctest: +SKIP
... f.write(s)
260

The `memory_usage` parameter allows deep introspection mode, specially
useful for big DataFrames and fine-tune memory optimization:

>>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
>>> df = pd.DataFrame({
... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
... })
>>> df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 column_1 1000000 non-null object
1 column_2 1000000 non-null object
2 column_3 1000000 non-null object
dtypes: object(3)
memory usage: 22.9+ MB

>>> df.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 column_1 1000000 non-null object
1 column_2 1000000 non-null object
2 column_3 1000000 non-null object
dtypes: object(3)
memory usage: 165.9 MB"""
)


frame_see_also_sub = dedent(
"""\
DataFrame.describe: Generate descriptive statistics of DataFrame
columns.
DataFrame.memory_usage: Memory usage of DataFrame columns."""
)


frame_subs = {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you do this? This could be simply specified in the decorator, doesn't seem to add value.

So, instead of:

    @doc(BaseInfo.render, **frame_subs)

You can use:

    @doc(BaseInfo.render, klass='DataFrame', type_sub='and colmns', ...)

"klass": "DataFrame",
"type_sub": " and columns",
"max_cols_sub": frame_max_cols_sub,
"null_counts_sub": frame_null_counts_sub,
"examples_sub": frame_examples_sub,
"see_also_sub": frame_see_also_sub,
"version_added_sub": "",
}


def _put_str(s: Union[str, Dtype], space: int) -> str:
"""
Make string of specified length, padding to the right if necessary.
Expand Down Expand Up @@ -174,26 +302,26 @@ def render(
show_counts: Optional[bool],
) -> None:
"""
Print a concise summary of a %(klass)s.
Print a concise summary of a {klass}.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you have variables in this docstring, this render should also have the @doc decorator, with the values of the variables here, no?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for the comment!
I tried that, but the problem is that in future I would like to share the docstring with Series.info().
If we put @doc with params pertaining to dataframes onto BaseInfo.render, then how would we handle the same for series?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The @doc decorator does some magic. Basically it renders the docstring (template) with the provided variables, and it stores it as the __doc__, so that's the docstring of the function/class. But also, it saves the original docstring before rendering the variables in something like __doc_original__ (forgot the actual name), so if you reuse that docstring somewhere else with different variables, it'll work as "expected".


This method prints information about a %(klass)s including
the index dtype%(type_sub)s, non-null values and memory usage.
%(version_added_sub)s\
This method prints information about a {klass} including
the index dtype{type_sub}, non-null values and memory usage.
{version_added_sub}\

Parameters
----------
data : %(klass)s
%(klass)s to print information about.
data : {klass}
{klass} to print information about.
verbose : bool, optional
Whether to print the full summary. By default, the setting in
``pandas.options.display.max_info_columns`` is followed.
buf : writable buffer, defaults to sys.stdout
Where to send the output. By default, the output is printed to
sys.stdout. Pass a writable buffer if you need to further process
the output.
%(max_cols_sub)s
{max_cols_sub}
memory_usage : bool, str, optional
Specifies whether total memory usage of the %(klass)s
Specifies whether total memory usage of the {klass}
elements (including the index) should be displayed. By default,
this follows the ``pandas.options.display.memory_usage`` setting.

Expand All @@ -210,15 +338,15 @@ def render(
Returns
-------
None
This method prints a summary of a %(klass)s and returns None.
This method prints a summary of a {klass} and returns None.

See Also
--------
%(see_also_sub)s
{see_also_sub}

Examples
--------
%(examples_sub)s
{examples_sub}
"""


Expand Down Expand Up @@ -281,6 +409,10 @@ def memory_usage_bytes(self) -> int:
deep = False
return self.data.memory_usage(index=True, deep=deep).sum()

@doc(
BaseInfo.render,
**frame_subs,
)
def render(
self,
*,
Expand Down