diff --git a/pandas/_typing.py b/pandas/_typing.py index 7678d1bf12d8b..a9177106535fc 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -38,6 +38,8 @@ from pandas.core.indexes.base import Index from pandas.core.series import Series + from pandas.io.formats.format import EngFormatter + # array-like AnyArrayLike = TypeVar("AnyArrayLike", "ExtensionArray", "Index", "Series", np.ndarray) @@ -127,6 +129,10 @@ EncodingVar = TypeVar("EncodingVar", str, None, Optional[str]) +# type of float formatter in DataFrameFormatter +FloatFormatType = Union[str, Callable, "EngFormatter"] + + @dataclass class IOargs(Generic[ModeVar, EncodingVar]): """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 801307a8f9481..3af7d5c8fae24 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -788,10 +788,8 @@ def _repr_html_(self) -> Optional[str]: max_cols=max_cols, show_dimensions=show_dimensions, decimal=".", - table_id=None, - render_links=False, ) - return formatter.to_html(notebook=True) + return fmt.DataFrameRenderer(formatter).to_html(notebook=True) else: return None @@ -874,9 +872,12 @@ def to_string( max_cols=max_cols, show_dimensions=show_dimensions, decimal=decimal, + ) + return fmt.DataFrameRenderer(formatter).to_string( + buf=buf, + encoding=encoding, line_width=line_width, ) - return formatter.to_string(buf=buf, encoding=encoding) # ---------------------------------------------------------------------- @@ -2476,29 +2477,29 @@ def to_html( columns=columns, col_space=col_space, na_rep=na_rep, + header=header, + index=index, formatters=formatters, float_format=float_format, + bold_rows=bold_rows, sparsify=sparsify, justify=justify, index_names=index_names, - header=header, - index=index, - bold_rows=bold_rows, escape=escape, + decimal=decimal, max_rows=max_rows, max_cols=max_cols, show_dimensions=show_dimensions, - decimal=decimal, - table_id=table_id, - render_links=render_links, ) # TODO: a generic formatter wld b in DataFrameFormatter - return 
formatter.to_html( + return fmt.DataFrameRenderer(formatter).to_html( buf=buf, classes=classes, notebook=notebook, border=border, encoding=encoding, + table_id=table_id, + render_links=render_links, ) # ---------------------------------------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4d2f504146e87..d658d799f1fb8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4,7 +4,6 @@ from datetime import timedelta import functools import gc -from io import StringIO import json import operator import pickle @@ -109,7 +108,11 @@ from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window from pandas.io.formats import format as fmt -from pandas.io.formats.format import DataFrameFormatter, format_percentiles +from pandas.io.formats.format import ( + DataFrameFormatter, + DataFrameRenderer, + format_percentiles, +) from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: @@ -3149,7 +3152,7 @@ def to_latex( escape=escape, decimal=decimal, ) - return formatter.to_latex( + return DataFrameRenderer(formatter).to_latex( buf=buf, column_format=column_format, longtable=longtable, @@ -3182,7 +3185,7 @@ def to_csv( date_format: Optional[str] = None, doublequote: bool_t = True, escapechar: Optional[str] = None, - decimal: Optional[str] = ".", + decimal: str = ".", errors: str = "strict", storage_options: StorageOptions = None, ) -> Optional[str]: @@ -3340,10 +3343,16 @@ def to_csv( """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() - from pandas.io.formats.csvs import CSVFormatter + formatter = DataFrameFormatter( + frame=df, + header=header, + index=index, + na_rep=na_rep, + float_format=float_format, + decimal=decimal, + ) - formatter = CSVFormatter( - df, + return DataFrameRenderer(formatter).to_csv( path_or_buf, line_terminator=line_terminator, sep=sep, @@ -3351,11 +3360,7 @@ def to_csv( errors=errors, compression=compression, quoting=quoting, - na_rep=na_rep, 
- float_format=float_format, - cols=columns, - header=header, - index=index, + columns=columns, index_label=index_label, mode=mode, chunksize=chunksize, @@ -3363,16 +3368,8 @@ def to_csv( date_format=date_format, doublequote=doublequote, escapechar=escapechar, - decimal=decimal, storage_options=storage_options, ) - formatter.save() - - if path_or_buf is None: - assert isinstance(formatter.path_or_buf, StringIO) - return formatter.path_or_buf.getvalue() - - return None # ---------------------------------------------------------------------- # Lookup Caching diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index d0e9163fc5f11..6c62d6825bc84 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,7 +5,7 @@ import csv as csvlib from io import StringIO, TextIOWrapper import os -from typing import Any, Dict, Hashable, Iterator, List, Optional, Sequence, Union +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Sequence, Union import numpy as np @@ -13,6 +13,7 @@ from pandas._typing import ( CompressionOptions, FilePathOrBuffer, + FloatFormatType, IndexLabel, Label, StorageOptions, @@ -30,18 +31,17 @@ from pandas.io.common import get_filepath_or_buffer, get_handle +if TYPE_CHECKING: + from pandas.io.formats.format import DataFrameFormatter + class CSVFormatter: def __init__( self, - obj, + formatter: "DataFrameFormatter", path_or_buf: Optional[FilePathOrBuffer[str]] = None, sep: str = ",", - na_rep: str = "", - float_format: Optional[str] = None, cols: Optional[Sequence[Label]] = None, - header: Union[bool, Sequence[Hashable]] = True, - index: bool = True, index_label: Optional[IndexLabel] = None, mode: str = "w", encoding: Optional[str] = None, @@ -54,10 +54,11 @@ def __init__( date_format: Optional[str] = None, doublequote: bool = True, escapechar: Optional[str] = None, - decimal=".", storage_options: StorageOptions = None, ): - self.obj = obj + self.fmt = formatter + + self.obj = self.fmt.frame self.encoding = 
encoding or "utf-8" @@ -79,35 +80,45 @@ def __init__( self.mode = ioargs.mode self.sep = sep - self.na_rep = na_rep - self.float_format = float_format - self.decimal = decimal - self.header = header - self.index = index - self.index_label = index_label + self.index_label = self._initialize_index_label(index_label) self.errors = errors self.quoting = quoting or csvlib.QUOTE_MINIMAL - self.quotechar = quotechar + self.quotechar = self._initialize_quotechar(quotechar) self.doublequote = doublequote self.escapechar = escapechar self.line_terminator = line_terminator or os.linesep self.date_format = date_format - self.cols = cols # type: ignore[assignment] - self.chunksize = chunksize # type: ignore[assignment] + self.cols = self._initialize_columns(cols) + self.chunksize = self._initialize_chunksize(chunksize) + + @property + def na_rep(self) -> str: + return self.fmt.na_rep + + @property + def float_format(self) -> Optional["FloatFormatType"]: + return self.fmt.float_format @property - def index_label(self) -> IndexLabel: - return self._index_label + def decimal(self) -> str: + return self.fmt.decimal - @index_label.setter - def index_label(self, index_label: Optional[IndexLabel]) -> None: + @property + def header(self) -> Union[bool, Sequence[str]]: + return self.fmt.header + + @property + def index(self) -> bool: + return self.fmt.index + + def _initialize_index_label(self, index_label: Optional[IndexLabel]) -> IndexLabel: if index_label is not False: if index_label is None: - index_label = self._get_index_label_from_obj() + return self._get_index_label_from_obj() elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndexClass)): # given a string for a DF with Index - index_label = [index_label] - self._index_label = index_label + return [index_label] + return index_label def _get_index_label_from_obj(self) -> List[str]: if isinstance(self.obj.index, ABCMultiIndex): @@ -122,30 +133,17 @@ def _get_index_label_flat(self) -> List[str]: index_label = 
self.obj.index.name return [""] if index_label is None else [index_label] - @property - def quotechar(self) -> Optional[str]: + def _initialize_quotechar(self, quotechar: Optional[str]) -> Optional[str]: if self.quoting != csvlib.QUOTE_NONE: # prevents crash in _csv - return self._quotechar + return quotechar return None - @quotechar.setter - def quotechar(self, quotechar: Optional[str]) -> None: - self._quotechar = quotechar - @property def has_mi_columns(self) -> bool: return bool(isinstance(self.obj.columns, ABCMultiIndex)) - @property - def cols(self) -> Sequence[Label]: - return self._cols - - @cols.setter - def cols(self, cols: Optional[Sequence[Label]]) -> None: - self._cols = self._refine_cols(cols) - - def _refine_cols(self, cols: Optional[Sequence[Label]]) -> Sequence[Label]: + def _initialize_columns(self, cols: Optional[Sequence[Label]]) -> Sequence[Label]: # validate mi options if self.has_mi_columns: if cols is not None: @@ -161,12 +159,16 @@ def _refine_cols(self, cols: Optional[Sequence[Label]]) -> Sequence[Label]: # update columns to include possible multiplicity of dupes # and make sure sure cols is just a list of labels - cols = self.obj.columns - if isinstance(cols, ABCIndexClass): - return cols._format_native_types(**self._number_format) + new_cols = self.obj.columns + if isinstance(new_cols, ABCIndexClass): + return new_cols._format_native_types(**self._number_format) else: - assert isinstance(cols, Sequence) - return list(cols) + return list(new_cols) + + def _initialize_chunksize(self, chunksize: Optional[int]) -> int: + if chunksize is None: + return (100000 // (len(self.cols) or 1)) or 1 + return int(chunksize) @property def _number_format(self) -> Dict[str, Any]: @@ -179,17 +181,6 @@ def _number_format(self) -> Dict[str, Any]: decimal=self.decimal, ) - @property - def chunksize(self) -> int: - return self._chunksize - - @chunksize.setter - def chunksize(self, chunksize: Optional[int]) -> None: - if chunksize is None: - chunksize = (100000 
// (len(self.cols) or 1)) or 1 - assert chunksize is not None - self._chunksize = int(chunksize) - @property def data_index(self) -> Index: data_index = self.obj.index diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 7635cda56ba26..6f4bd2ed8c73a 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -39,8 +39,14 @@ from pandas._libs.tslib import format_array_from_datetime from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.nattype import NaTType -from pandas._typing import FilePathOrBuffer, Label -from pandas.errors import AbstractMethodError +from pandas._typing import ( + CompressionOptions, + FilePathOrBuffer, + FloatFormatType, + IndexLabel, + Label, + StorageOptions, +) from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -75,10 +81,10 @@ if TYPE_CHECKING: from pandas import Categorical, DataFrame, Series + FormattersType = Union[ List[Callable], Tuple[Callable, ...], Mapping[Union[str, int], Callable] ] -FloatFormatType = Union[str, Callable, "EngFormatter"] ColspaceType = Mapping[Label, Union[str, int]] ColspaceArgType = Union[ str, int, Sequence[Union[str, int]], Mapping[Label, Union[str, int]] @@ -449,95 +455,8 @@ def get_adjustment() -> TextAdjustment: return TextAdjustment() -class TableFormatter: - - show_dimensions: Union[bool, str] - formatters: FormattersType - columns: Index - _is_truncated: bool - - @property - def is_truncated(self) -> bool: - return self._is_truncated - - @property - def should_show_dimensions(self) -> bool: - return self.show_dimensions is True or ( - self.show_dimensions == "truncate" and self.is_truncated - ) - - def _get_formatter(self, i: Union[str, int]) -> Optional[Callable]: - if isinstance(self.formatters, (list, tuple)): - if is_integer(i): - i = cast(int, i) - return self.formatters[i] - else: - return None - else: - if is_integer(i) and i not in self.columns: - i = self.columns[i] - return self.formatters.get(i, None) - 
- @contextmanager - def get_buffer( - self, buf: Optional[FilePathOrBuffer[str]], encoding: Optional[str] = None - ): - """ - Context manager to open, yield and close buffer for filenames or Path-like - objects, otherwise yield buf unchanged. - """ - if buf is not None: - buf = stringify_path(buf) - else: - buf = StringIO() - - if encoding is None: - encoding = "utf-8" - elif not isinstance(buf, str): - raise ValueError("buf is not a file name and encoding is specified.") - - if hasattr(buf, "write"): - yield buf - elif isinstance(buf, str): - with open(buf, "w", encoding=encoding, newline="") as f: - # GH#30034 open instead of codecs.open prevents a file leak - # if we have an invalid encoding argument. - # newline="" is needed to roundtrip correctly on - # windows test_to_latex_filename - yield f - else: - raise TypeError("buf is not a file name and it has no write method") - - def write_result(self, buf: IO[str]) -> None: - """ - Write the result of serialization to buf. - """ - raise AbstractMethodError(self) - - def get_result( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - encoding: Optional[str] = None, - ) -> Optional[str]: - """ - Perform serialization. Write to buf or return as string if buf is None. 
- """ - with self.get_buffer(buf, encoding=encoding) as f: - self.write_result(buf=f) - if buf is None: - return f.getvalue() - return None - - -class DataFrameFormatter(TableFormatter): - """ - Render a DataFrame - - self.to_string() : console-friendly tabular output - self.to_html() : html table - self.to_latex() : LaTeX tabular environment table - - """ +class DataFrameFormatter: + """Class for processing dataframe formatting options and data.""" __doc__ = __doc__ if __doc__ else "" __doc__ += common_docstring + return_docstring @@ -555,46 +474,94 @@ def __init__( float_format: Optional[FloatFormatType] = None, sparsify: Optional[bool] = None, index_names: bool = True, - line_width: Optional[int] = None, max_rows: Optional[int] = None, min_rows: Optional[int] = None, max_cols: Optional[int] = None, show_dimensions: Union[bool, str] = False, decimal: str = ".", - table_id: Optional[str] = None, - render_links: bool = False, bold_rows: bool = False, escape: bool = True, ): self.frame = frame - self.show_index_names = index_names - self.sparsify = self._initialize_sparsify(sparsify) - self.float_format = float_format - self.formatters = self._initialize_formatters(formatters) - self.na_rep = na_rep - self.decimal = decimal + self.columns = self._initialize_columns(columns) self.col_space = self._initialize_colspace(col_space) self.header = header self.index = index - self.line_width = line_width + self.na_rep = na_rep + self.formatters = self._initialize_formatters(formatters) + self.justify = self._initialize_justify(justify) + self.float_format = float_format + self.sparsify = self._initialize_sparsify(sparsify) + self.show_index_names = index_names + self.decimal = decimal + self.bold_rows = bold_rows + self.escape = escape self.max_rows = max_rows self.min_rows = min_rows self.max_cols = max_cols self.show_dimensions = show_dimensions - self.table_id = table_id - self.render_links = render_links - self.justify = self._initialize_justify(justify) - 
self.bold_rows = bold_rows - self.escape = escape - self.columns = self._initialize_columns(columns) self.max_cols_fitted = self._calc_max_cols_fitted() self.max_rows_fitted = self._calc_max_rows_fitted() self.tr_frame = self.frame - self._truncate() + self.truncate() self.adj = get_adjustment() + def get_strcols(self) -> List[List[str]]: + """ + Render a DataFrame to a list of columns (as lists of strings). + """ + strcols = self._get_strcols_without_index() + + if self.index: + str_index = self._get_formatted_index(self.tr_frame) + strcols.insert(0, str_index) + + return strcols + + @property + def should_show_dimensions(self) -> bool: + return self.show_dimensions is True or ( + self.show_dimensions == "truncate" and self.is_truncated + ) + + @property + def is_truncated(self) -> bool: + return bool(self.is_truncated_horizontally or self.is_truncated_vertically) + + @property + def is_truncated_horizontally(self) -> bool: + return bool(self.max_cols_fitted and (len(self.columns) > self.max_cols_fitted)) + + @property + def is_truncated_vertically(self) -> bool: + return bool(self.max_rows_fitted and (len(self.frame) > self.max_rows_fitted)) + + @property + def dimensions_info(self) -> str: + return f"\n\n[{len(self.frame)} rows x {len(self.frame.columns)} columns]" + + @property + def has_index_names(self) -> bool: + return _has_names(self.frame.index) + + @property + def has_column_names(self) -> bool: + return _has_names(self.frame.columns) + + @property + def show_row_idx_names(self) -> bool: + return all((self.has_index_names, self.index, self.show_index_names)) + + @property + def show_col_idx_names(self) -> bool: + return all((self.has_column_names, self.show_index_names, self.header)) + + @property + def max_rows_displayed(self) -> int: + return min(self.max_rows or len(self.frame), len(self.frame)) + def _initialize_sparsify(self, sparsify: Optional[bool]) -> bool: if sparsify is None: return get_option("display.multi_sparse") @@ -653,10 +620,6 @@ def 
_initialize_colspace( result = dict(zip(self.frame.columns, col_space)) return result - @property - def max_rows_displayed(self) -> int: - return min(self.max_rows or len(self.frame), len(self.frame)) - def _calc_max_cols_fitted(self) -> Optional[int]: """Number of columns fitting the screen.""" if not self._is_in_terminal(): @@ -707,26 +670,14 @@ def _get_number_of_auxillary_rows(self) -> int: num_rows = dot_row + prompt_row if self.show_dimensions: - num_rows += len(self._dimensions_info.splitlines()) + num_rows += len(self.dimensions_info.splitlines()) if self.header: num_rows += 1 return num_rows - @property - def is_truncated_horizontally(self) -> bool: - return bool(self.max_cols_fitted and (len(self.columns) > self.max_cols_fitted)) - - @property - def is_truncated_vertically(self) -> bool: - return bool(self.max_rows_fitted and (len(self.frame) > self.max_rows_fitted)) - - @property - def is_truncated(self) -> bool: - return bool(self.is_truncated_horizontally or self.is_truncated_vertically) - - def _truncate(self) -> None: + def truncate(self) -> None: """ Check whether the frame should be truncated. If so, slice the frame up. 
""" @@ -785,7 +736,7 @@ def _get_strcols_without_index(self) -> List[List[str]]: if not is_list_like(self.header) and not self.header: for i, c in enumerate(self.tr_frame): - fmt_values = self._format_col(i) + fmt_values = self.format_col(i) fmt_values = _make_fixed_width( strings=fmt_values, justify=self.justify, @@ -816,7 +767,7 @@ def _get_strcols_without_index(self) -> List[List[str]]: header_colwidth = max( int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader) ) - fmt_values = self._format_col(i) + fmt_values = self.format_col(i) fmt_values = _make_fixed_width( fmt_values, self.justify, minimum=header_colwidth, adj=self.adj ) @@ -827,223 +778,7 @@ def _get_strcols_without_index(self) -> List[List[str]]: return strcols - def _get_strcols(self) -> List[List[str]]: - strcols = self._get_strcols_without_index() - - str_index = self._get_formatted_index(self.tr_frame) - if self.index: - strcols.insert(0, str_index) - - return strcols - - def _to_str_columns(self) -> List[List[str]]: - """ - Render a DataFrame to a list of columns (as lists of strings). 
- """ - strcols = self._get_strcols() - - if self.is_truncated: - strcols = self._insert_dot_separators(strcols) - - return strcols - - def _insert_dot_separators(self, strcols: List[List[str]]) -> List[List[str]]: - str_index = self._get_formatted_index(self.tr_frame) - index_length = len(str_index) - - if self.is_truncated_horizontally: - strcols = self._insert_dot_separator_horizontal(strcols, index_length) - - if self.is_truncated_vertically: - strcols = self._insert_dot_separator_vertical(strcols, index_length) - - return strcols - - def _insert_dot_separator_horizontal( - self, strcols: List[List[str]], index_length: int - ) -> List[List[str]]: - strcols.insert(self.tr_col_num + 1, [" ..."] * index_length) - return strcols - - def _insert_dot_separator_vertical( - self, strcols: List[List[str]], index_length: int - ) -> List[List[str]]: - n_header_rows = index_length - len(self.tr_frame) - row_num = self.tr_row_num - for ix, col in enumerate(strcols): - cwidth = self.adj.len(col[row_num]) - - if self.is_truncated_horizontally: - is_dot_col = ix == self.tr_col_num + 1 - else: - is_dot_col = False - - if cwidth > 3 or is_dot_col: - dots = "..." - else: - dots = ".." - - if ix == 0: - dot_mode = "left" - elif is_dot_col: - cwidth = 4 - dot_mode = "right" - else: - dot_mode = "right" - - dot_str = self.adj.justify([dots], cwidth, mode=dot_mode)[0] - col.insert(row_num + n_header_rows, dot_str) - return strcols - - def write_result(self, buf: IO[str]) -> None: - """ - Render a DataFrame to a console-friendly tabular output. 
- """ - text = self._get_string_representation() - - buf.writelines(text) - - if self.should_show_dimensions: - buf.write(self._dimensions_info) - - @property - def _dimensions_info(self) -> str: - return f"\n\n[{len(self.frame)} rows x {len(self.frame.columns)} columns]" - - def _get_string_representation(self) -> str: - if self.frame.empty: - info_line = ( - f"Empty {type(self.frame).__name__}\n" - f"Columns: {pprint_thing(self.frame.columns)}\n" - f"Index: {pprint_thing(self.frame.index)}" - ) - return info_line - - strcols = self._to_str_columns() - - if self.line_width is None: - # no need to wrap around just print the whole frame - return self.adj.adjoin(1, *strcols) - - if self.max_cols is None or self.max_cols > 0: - # need to wrap around - return self._join_multiline(*strcols) - - # max_cols == 0. Try to fit frame to terminal - return self._fit_strcols_to_terminal_width(strcols) - - def _fit_strcols_to_terminal_width(self, strcols) -> str: - from pandas import Series - - lines = self.adj.adjoin(1, *strcols).split("\n") - max_len = Series(lines).str.len().max() - # plus truncate dot col - width, _ = get_terminal_size() - dif = max_len - width - # '+ 1' to avoid too wide repr (GH PR #17023) - adj_dif = dif + 1 - col_lens = Series([Series(ele).apply(len).max() for ele in strcols]) - n_cols = len(col_lens) - counter = 0 - while adj_dif > 0 and n_cols > 1: - counter += 1 - mid = int(round(n_cols / 2.0)) - mid_ix = col_lens.index[mid] - col_len = col_lens[mid_ix] - # adjoin adds one - adj_dif -= col_len + 1 - col_lens = col_lens.drop(mid_ix) - n_cols = len(col_lens) - - # subtract index column - max_cols_fitted = n_cols - self.index - # GH-21180. Ensure that we print at least two. 
- max_cols_fitted = max(max_cols_fitted, 2) - self.max_cols_fitted = max_cols_fitted - - # Call again _truncate to cut frame appropriately - # and then generate string representation - self._truncate() - strcols = self._to_str_columns() - return self.adj.adjoin(1, *strcols) - - def _join_multiline(self, *args) -> str: - lwidth = self.line_width - adjoin_width = 1 - strcols = list(args) - if self.index: - idx = strcols.pop(0) - lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width - - col_widths = [ - np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 - for col in strcols - ] - - assert lwidth is not None - col_bins = _binify(col_widths, lwidth) - nbins = len(col_bins) - - if self.is_truncated_vertically: - assert self.max_rows_fitted is not None - nrows = self.max_rows_fitted + 1 - else: - nrows = len(self.frame) - - str_lst = [] - start = 0 - for i, end in enumerate(col_bins): - row = strcols[start:end] - if self.index: - row.insert(0, idx) - if nbins > 1: - if end <= len(strcols) and i < nbins - 1: - row.append([" \\"] + [" "] * (nrows - 1)) - else: - row.append([" "] * nrows) - str_lst.append(self.adj.adjoin(adjoin_width, *row)) - start = end - return "\n\n".join(str_lst) - - def to_string( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - encoding: Optional[str] = None, - ) -> Optional[str]: - return self.get_result(buf=buf, encoding=encoding) - - def to_latex( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - column_format: Optional[str] = None, - longtable: bool = False, - encoding: Optional[str] = None, - multicolumn: bool = False, - multicolumn_format: Optional[str] = None, - multirow: bool = False, - caption: Optional[Union[str, Tuple[str, str]]] = None, - label: Optional[str] = None, - position: Optional[str] = None, - ) -> Optional[str]: - """ - Render a DataFrame to a LaTeX tabular/longtable environment output. 
- """ - from pandas.io.formats.latex import LatexFormatter - - latex_formatter = LatexFormatter( - self, - longtable=longtable, - column_format=column_format, - multicolumn=multicolumn, - multicolumn_format=multicolumn_format, - multirow=multirow, - caption=caption, - label=label, - position=position, - ) - return latex_formatter.get_result(buf=buf, encoding=encoding) - - def _format_col(self, i: int) -> List[str]: + def format_col(self, i: int) -> List[str]: frame = self.tr_frame formatter = self._get_formatter(i) return format_array( @@ -1056,34 +791,17 @@ def _format_col(self, i: int) -> List[str]: leading_space=self.index, ) - def to_html( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - encoding: Optional[str] = None, - classes: Optional[Union[str, List, Tuple]] = None, - notebook: bool = False, - border: Optional[int] = None, - ) -> Optional[str]: - """ - Render a DataFrame to a html table. - - Parameters - ---------- - classes : str or list-like - classes to include in the `class` attribute of the opening - ```` tag, in addition to the default "dataframe". - notebook : {True, False}, optional, default False - Whether the generated HTML is for IPython Notebook. - border : int - A ``border=border`` attribute is included in the opening - ``
`` tag. Default ``pd.options.display.html.border``. - """ - from pandas.io.formats.html import HTMLFormatter, NotebookFormatter - - Klass = NotebookFormatter if notebook else HTMLFormatter - return Klass(self, classes=classes, border=border).get_result( - buf=buf, encoding=encoding - ) + def _get_formatter(self, i: Union[str, int]) -> Optional[Callable]: + if isinstance(self.formatters, (list, tuple)): + if is_integer(i): + i = cast(int, i) + return self.formatters[i] + else: + return None + else: + if is_integer(i) and i not in self.columns: + i = self.columns[i] + return self.formatters.get(i, None) def _get_formatted_column_labels(self, frame: "DataFrame") -> List[List[str]]: from pandas.core.indexes.multi import sparsify_labels @@ -1126,22 +844,6 @@ def space_format(x, y): # self.str_columns = str_columns return str_columns - @property - def has_index_names(self) -> bool: - return _has_names(self.frame.index) - - @property - def has_column_names(self) -> bool: - return _has_names(self.frame.columns) - - @property - def show_row_idx_names(self) -> bool: - return all((self.has_index_names, self.index, self.show_index_names)) - - @property - def show_col_idx_names(self) -> bool: - return all((self.has_column_names, self.show_index_names, self.header)) - def _get_formatted_index(self, frame: "DataFrame") -> List[str]: # Note: this is only used by to_string() and to_latex(), not by # to_html(). so safe to cast col_space here. @@ -1192,6 +894,224 @@ def _get_column_name_list(self) -> List[str]: return names +class DataFrameRenderer: + """Class for creating dataframe output in multiple formats. + + Called in pandas.core.generic.NDFrame: + - to_csv + - to_latex + + Called in pandas.core.frame.DataFrame: + - to_html + - to_string + + Parameters + ---------- + fmt : DataFrameFormatter + Formatter with the formating options. 
+ """ + + def __init__(self, fmt: DataFrameFormatter): + self.fmt = fmt + + def to_latex( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + column_format: Optional[str] = None, + longtable: bool = False, + encoding: Optional[str] = None, + multicolumn: bool = False, + multicolumn_format: Optional[str] = None, + multirow: bool = False, + caption: Optional[str] = None, + label: Optional[str] = None, + position: Optional[str] = None, + ) -> Optional[str]: + """ + Render a DataFrame to a LaTeX tabular/longtable environment output. + """ + from pandas.io.formats.latex import LatexFormatter + + latex_formatter = LatexFormatter( + self.fmt, + longtable=longtable, + column_format=column_format, + multicolumn=multicolumn, + multicolumn_format=multicolumn_format, + multirow=multirow, + caption=caption, + label=label, + position=position, + ) + string = latex_formatter.to_string() + return save_to_buffer(string, buf=buf, encoding=encoding) + + def to_html( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + classes: Optional[Union[str, List, Tuple]] = None, + notebook: bool = False, + border: Optional[int] = None, + table_id: Optional[str] = None, + render_links: bool = False, + ) -> Optional[str]: + """ + Render a DataFrame to a html table. + + Parameters + ---------- + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. + encoding : str, default “utf-8” + Set character encoding. + classes : str or list-like + classes to include in the `class` attribute of the opening + ``
`` tag, in addition to the default "dataframe". + notebook : {True, False}, optional, default False + Whether the generated HTML is for IPython Notebook. + border : int + A ``border=border`` attribute is included in the opening + ``
`` tag. Default ``pd.options.display.html.border``. + table_id : str, optional + A css id is included in the opening `
` tag if specified. + render_links : bool, default False + Convert URLs to HTML links. + """ + from pandas.io.formats.html import HTMLFormatter, NotebookFormatter + + Klass = NotebookFormatter if notebook else HTMLFormatter + + html_formatter = Klass( + self.fmt, + classes=classes, + border=border, + table_id=table_id, + render_links=render_links, + ) + string = html_formatter.to_string() + return save_to_buffer(string, buf=buf, encoding=encoding) + + def to_string( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + line_width: Optional[int] = None, + ) -> Optional[str]: + """ + Render a DataFrame to a console-friendly tabular output. + + Parameters + ---------- + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. + encoding: str, default “utf-8” + Set character encoding. + line_width : int, optional + Width to wrap a line in characters. + """ + from pandas.io.formats.string import StringFormatter + + string_formatter = StringFormatter(self.fmt, line_width=line_width) + string = string_formatter.to_string() + return save_to_buffer(string, buf=buf, encoding=encoding) + + def to_csv( + self, + path_or_buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + sep: str = ",", + columns: Optional[Sequence[Label]] = None, + index_label: Optional[IndexLabel] = None, + mode: str = "w", + compression: CompressionOptions = "infer", + quoting: Optional[int] = None, + quotechar: str = '"', + line_terminator: Optional[str] = None, + chunksize: Optional[int] = None, + date_format: Optional[str] = None, + doublequote: bool = True, + escapechar: Optional[str] = None, + errors: str = "strict", + storage_options: StorageOptions = None, + ) -> Optional[str]: + """ + Render dataframe as comma-separated file. 
def save_to_buffer(
    string: str,
    buf: Optional[FilePathOrBuffer[str]] = None,
    encoding: Optional[str] = None,
) -> Optional[str]:
    """
    Write ``string`` to ``buf``; hand the text back when no ``buf`` was given.

    Parameters
    ----------
    string : str
        Text to serialize.
    buf : str, Path or object with a ``write`` method, optional
        Target. None means "collect into an in-memory buffer and return it".
    encoding : str, optional
        Passed through to ``get_buffer``.

    Returns
    -------
    str or None
        The written text if ``buf`` is None, otherwise None.
    """
    wants_text_back = buf is None
    with get_buffer(buf, encoding=encoding) as handle:
        handle.write(string)
        return handle.getvalue() if wants_text_back else None


@contextmanager
def get_buffer(buf: Optional[FilePathOrBuffer[str]], encoding: Optional[str] = None):
    """
    Yield something writable for ``buf``.

    A file name / path-like is opened for writing and closed on exit, an
    object that already has ``write`` is yielded untouched, and None yields
    a fresh ``StringIO``.

    Raises
    ------
    ValueError
        If ``encoding`` is given but ``buf`` does not resolve to a file name.
    TypeError
        If ``buf`` is neither a file name nor an object with ``write``.
    """
    target = StringIO() if buf is None else stringify_path(buf)
    is_file_name = isinstance(target, str)

    if encoding is None:
        encoding = "utf-8"
    elif not is_file_name:
        raise ValueError("buf is not a file name and encoding is specified.")

    if hasattr(target, "write"):
        yield target
        return

    if not is_file_name:
        raise TypeError("buf is not a file name and it has no write method")

    # GH#30034 open instead of codecs.open prevents a file leak
    # if we have an invalid encoding argument.
    # newline="" is needed to roundtrip correctly on
    # windows test_to_latex_filename
    with open(target, "w", encoding=encoding, newline="") as handle:
        yield handle
def to_string(self) -> str:
    """
    Return the rendered HTML as one newline-joined string.

    Returns
    -------
    str
        The items produced by ``self.render()`` joined with ``"\\n"``.
    """
    lines = self.render()
    # If at least one item is already a str, coerce every item to str so
    # that the join below cannot trip over a stray non-str element.
    if any(isinstance(x, str) for x in lines):
        lines = [str(x) for x in lines]
    return "\n".join(lines)


def render(self) -> List[str]:
    """
    Build the HTML table and, if requested, a trailing dimensions paragraph.

    Returns
    -------
    list of str
        ``self.elements`` after ``_write_table`` (and optionally the
        dimensions line) have been written.
        NOTE(review): presumably ``self.write`` appends to ``self.elements``;
        that method is not visible here - confirm.
    """
    self._write_table()

    if self.should_show_dimensions:
        by = chr(215)  # ×
        # The summary is emitted as its own HTML paragraph; the <p> tags are
        # part of the output and must stay on a single-line f-string.
        self.write(
            f"<p>{len(self.frame)} rows {by} {len(self.frame.columns)} columns</p>"
        )

    return self.elements


@property
def should_show_dimensions(self) -> bool:
    """Whether the dimensions line is appended; delegates to ``self.fmt``."""
    return self.fmt.should_show_dimensions

{len(self.frame)} rows {by} {len(self.frame.columns)} columns

" - ) - - return self.elements - - def write_result(self, buf: IO[str]) -> None: - buffer_put_lines(buf, self.render()) - def _write_table(self, indent: int = 0) -> None: _classes = ["dataframe"] # Default class. use_mathjax = get_option("display.html.use_mathjax") @@ -370,7 +374,7 @@ def _write_header(self, indent: int) -> None: def _get_formatted_values(self) -> Dict[int, List[str]]: with option_context("display.max_colwidth", None): - fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)} + fmt_values = {i: self.fmt.format_col(i) for i in range(self.ncols)} return fmt_values def _write_body(self, indent: int) -> None: @@ -565,7 +569,7 @@ class NotebookFormatter(HTMLFormatter): """ def _get_formatted_values(self) -> Dict[int, List[str]]: - return {i: self.fmt._format_col(i) for i in range(self.ncols)} + return {i: self.fmt.format_col(i) for i in range(self.ncols)} def _get_columns_formatted_values(self) -> List[str]: return self.columns.format() diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 2eee0ce73291f..f3c49e1cd3801 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -2,13 +2,13 @@ Module for formatting output data in Latex. 
@property
def _empty_info_line(self) -> str:
    """
    Placeholder text used in place of the table columns for an empty frame.

    Built from the frame's class name plus the plain f-string formatting of
    ``self.frame.columns`` and ``self.frame.index``, one item per line.
    """
    return (
        f"Empty {type(self.frame).__name__}\n"
        f"Columns: {self.frame.columns}\n"
        f"Index: {self.frame.index}"
    )
""" - table_string = self.builder.get_result() - buf.write(table_string) + return self.builder.get_result() @property def builder(self) -> TableBuilderAbstract: diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py new file mode 100644 index 0000000000000..4ebb78f29c739 --- /dev/null +++ b/pandas/io/formats/string.py @@ -0,0 +1,201 @@ +""" +Module for formatting output data in console (to string). +""" +from shutil import get_terminal_size +from typing import Iterable, List, Optional + +import numpy as np + +from pandas.io.formats.format import DataFrameFormatter +from pandas.io.formats.printing import pprint_thing + + +class StringFormatter: + """Formatter for string representation of a dataframe.""" + + def __init__(self, fmt: DataFrameFormatter, line_width: Optional[int] = None): + self.fmt = fmt + self.adj = fmt.adj + self.frame = fmt.frame + self.line_width = line_width + + def to_string(self) -> str: + text = self._get_string_representation() + if self.fmt.should_show_dimensions: + text = "".join([text, self.fmt.dimensions_info]) + return text + + def _get_strcols(self) -> List[List[str]]: + strcols = self.fmt.get_strcols() + if self.fmt.is_truncated: + strcols = self._insert_dot_separators(strcols) + return strcols + + def _get_string_representation(self) -> str: + if self.fmt.frame.empty: + return self._empty_info_line + + strcols = self._get_strcols() + + if self.line_width is None: + # no need to wrap around just print the whole frame + return self.adj.adjoin(1, *strcols) + + if self._need_to_wrap_around: + return self._join_multiline(strcols) + + return self._fit_strcols_to_terminal_width(strcols) + + @property + def _empty_info_line(self) -> str: + return ( + f"Empty {type(self.frame).__name__}\n" + f"Columns: {pprint_thing(self.frame.columns)}\n" + f"Index: {pprint_thing(self.frame.index)}" + ) + + @property + def _need_to_wrap_around(self) -> bool: + return bool(self.fmt.max_cols is None or self.fmt.max_cols > 0) + + def 
_insert_dot_separators(self, strcols: List[List[str]]) -> List[List[str]]: + str_index = self.fmt._get_formatted_index(self.fmt.tr_frame) + index_length = len(str_index) + + if self.fmt.is_truncated_horizontally: + strcols = self._insert_dot_separator_horizontal(strcols, index_length) + + if self.fmt.is_truncated_vertically: + strcols = self._insert_dot_separator_vertical(strcols, index_length) + + return strcols + + def _insert_dot_separator_horizontal( + self, strcols: List[List[str]], index_length: int + ) -> List[List[str]]: + strcols.insert(self.fmt.tr_col_num + 1, [" ..."] * index_length) + return strcols + + def _insert_dot_separator_vertical( + self, strcols: List[List[str]], index_length: int + ) -> List[List[str]]: + n_header_rows = index_length - len(self.fmt.tr_frame) + row_num = self.fmt.tr_row_num + for ix, col in enumerate(strcols): + cwidth = self.adj.len(col[row_num]) + + if self.fmt.is_truncated_horizontally: + is_dot_col = ix == self.fmt.tr_col_num + 1 + else: + is_dot_col = False + + if cwidth > 3 or is_dot_col: + dots = "..." + else: + dots = ".." 
+ + if ix == 0: + dot_mode = "left" + elif is_dot_col: + cwidth = 4 + dot_mode = "right" + else: + dot_mode = "right" + + dot_str = self.adj.justify([dots], cwidth, mode=dot_mode)[0] + col.insert(row_num + n_header_rows, dot_str) + return strcols + + def _join_multiline(self, strcols_input: Iterable[List[str]]) -> str: + lwidth = self.line_width + adjoin_width = 1 + strcols = list(strcols_input) + + if self.fmt.index: + idx = strcols.pop(0) + lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width + + col_widths = [ + np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 + for col in strcols + ] + + assert lwidth is not None + col_bins = _binify(col_widths, lwidth) + nbins = len(col_bins) + + if self.fmt.is_truncated_vertically: + assert self.fmt.max_rows_fitted is not None + nrows = self.fmt.max_rows_fitted + 1 + else: + nrows = len(self.frame) + + str_lst = [] + start = 0 + for i, end in enumerate(col_bins): + row = strcols[start:end] + if self.fmt.index: + row.insert(0, idx) + if nbins > 1: + if end <= len(strcols) and i < nbins - 1: + row.append([" \\"] + [" "] * (nrows - 1)) + else: + row.append([" "] * nrows) + str_lst.append(self.adj.adjoin(adjoin_width, *row)) + start = end + return "\n\n".join(str_lst) + + def _fit_strcols_to_terminal_width(self, strcols: List[List[str]]) -> str: + from pandas import Series + + lines = self.adj.adjoin(1, *strcols).split("\n") + max_len = Series(lines).str.len().max() + # plus truncate dot col + width, _ = get_terminal_size() + dif = max_len - width + # '+ 1' to avoid too wide repr (GH PR #17023) + adj_dif = dif + 1 + col_lens = Series([Series(ele).apply(len).max() for ele in strcols]) + n_cols = len(col_lens) + counter = 0 + while adj_dif > 0 and n_cols > 1: + counter += 1 + mid = int(round(n_cols / 2.0)) + mid_ix = col_lens.index[mid] + col_len = col_lens[mid_ix] + # adjoin adds one + adj_dif -= col_len + 1 + col_lens = col_lens.drop(mid_ix) + n_cols = len(col_lens) + + # subtract index 
column + max_cols_fitted = n_cols - self.fmt.index + # GH-21180. Ensure that we print at least two. + max_cols_fitted = max(max_cols_fitted, 2) + self.fmt.max_cols_fitted = max_cols_fitted + + # Call again _truncate to cut frame appropriately + # and then generate string representation + self.fmt.truncate() + strcols = self._get_strcols() + return self.adj.adjoin(1, *strcols) + + +def _binify(cols: List[int], line_width: int) -> List[int]: + adjoin_width = 1 + bins = [] + curr_width = 0 + i_last_column = len(cols) - 1 + for i, w in enumerate(cols): + w_adjoined = w + adjoin_width + curr_width += w_adjoined + if i_last_column == i: + wrap = curr_width + 1 > line_width and i > 0 + else: + wrap = curr_width + 2 > line_width and i > 0 + if wrap: + bins.append(i) + curr_width = w_adjoined + + bins.append(len(cols)) + return bins