-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Fix/26837 format array #44527
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix/26837 format array #44527
Changes from 2 commits
3885851
a7ec260
6a03c9a
f84867b
9bcd61e
83cfe06
e33b675
42ea58b
2c8bceb
e1f2cb3
8dcb3e5
04f590b
fb063f5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,6 +29,7 @@ | |
AstypeArg, | ||
Dtype, | ||
FillnaOptions, | ||
FloatFormatType, | ||
PositionalIndexer, | ||
ScalarIndexer, | ||
SequenceIndexer, | ||
|
@@ -137,6 +138,7 @@ class ExtensionArray: | |
view | ||
_concat_same_type | ||
_formatter | ||
_format_array | ||
_from_factorized | ||
_from_sequence | ||
_from_sequence_of_strings | ||
|
@@ -167,6 +169,8 @@ class ExtensionArray: | |
|
||
* __repr__ : A default repr for the ExtensionArray. | ||
* _formatter : Print scalars inside a Series or DataFrame. | ||
* _format_array: Full control over formatting an ExtensionArray | ||
to be included in a Series or DataFrame. | ||
|
||
Some methods require casting the ExtensionArray to an ndarray of Python | ||
objects with ``self.astype(object)``, which may be expensive. When | ||
|
@@ -1232,6 +1236,76 @@ def _repr_2d(self) -> str: | |
class_name = f"<{type(self).__name__}>" | ||
return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}" | ||
|
||
def _format_array( | ||
self, | ||
formatter: Callable | None, | ||
float_format: FloatFormatType = None, | ||
na_rep: str = "NaN", | ||
digits: int = None, | ||
space: str | int = None, | ||
justify: str = "right", | ||
decimal: str = ".", | ||
leading_space: bool | None = True, | ||
quoting: int | None = None, | ||
) -> list[str]: | ||
""" | ||
Format an array of values. | ||
|
||
Parameters | ||
---------- | ||
formatter : Callable, optional | ||
The function to apply to each element of the array to convert it | ||
to a string. By default, `self._formatter` is used. | ||
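float_format : FloatFormatType, optional | ||
Float formatter forwarded to ``format_array``; when None, ``format_array`` | ||
falls back to the ``display.float_format`` option. | ||
na_rep : str, default "NaN" | ||
Forwarded to ``format_array``. | ||
digits : int, optional | ||
Forwarded to ``format_array``; when None, ``format_array`` falls back to | ||
the ``display.precision`` option. | ||
space : str or int, optional | ||
Forwarded to ``format_array``; when None, ``format_array`` falls back to | ||
the ``display.column_space`` option. | ||
justify : str, default "right" | ||
Forwarded to ``format_array``. | ||
decimal : str, default "." | ||
Forwarded to ``format_array``. | ||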
leading_space : bool, optional, default True | ||
Whether the array should be formatted with a leading space. | ||
When an array is a column of a Series or DataFrame, we do want | ||
the leading space to pad between columns. | ||
|
||
When formatting an Index subclass | ||
(e.g. IntervalIndex._format_native_types), we don't want the | ||
leading space since it should be left-aligned. | ||
|
||
|
||
""" | ||
from pandas import Categorical | ||
from pandas.core.construction import extract_array | ||
|
||
from pandas.io.formats.format import format_array | ||
|
||
# values = self | ||
values = extract_array(self, extract_numpy=True) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why would we need to extract_array? I guess for PandasArray? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That's my guess too. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you check if it's really necessary and, if so, add a comment as to why? |
||
|
||
if formatter is None: | ||
# error: Item "ndarray" of "Union[Any, Union[ExtensionArray, ndarray]]" has | ||
# no attribute "_formatter" | ||
formatter = values._formatter(boxed=True) # type: ignore[union-attr] | ||
|
||
if isinstance(values, Categorical): | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# Categorical is special for now, so that we can preserve tzinfo | ||
array = values._internal_get_values() | ||
else: | ||
array = np.asarray(values) | ||
|
||
fmt_values = format_array( | ||
array, | ||
formatter, | ||
float_format=float_format, | ||
na_rep=na_rep, | ||
digits=digits, | ||
space=space, | ||
justify=justify, | ||
decimal=decimal, | ||
leading_space=leading_space, | ||
quoting=quoting, | ||
) | ||
return fmt_values | ||
|
||
def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: | ||
""" | ||
Formatting function for scalar values. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,7 @@ | |
) | ||
from typing import ( | ||
TYPE_CHECKING, | ||
Callable, | ||
Literal, | ||
overload, | ||
) | ||
|
@@ -37,7 +38,10 @@ | |
to_offset, | ||
tzconversion, | ||
) | ||
from pandas._typing import npt | ||
from pandas._typing import ( | ||
FloatFormatType, | ||
npt, | ||
) | ||
from pandas.errors import PerformanceWarning | ||
from pandas.util._exceptions import find_stack_level | ||
from pandas.util._validators import validate_inclusive | ||
|
@@ -681,6 +685,43 @@ def _format_native_types( | |
self.asi8, tz=self.tz, format=fmt, na_rep=na_rep | ||
) | ||
|
||
def _format_array( | ||
self, | ||
formatter: Callable | None, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not too familiar with how this gets reached. Is formatter, e.g., always going to be self._formatter? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think you're right. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Then can we do without the formatter arg? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry, I was incorrect. |
||
float_format: FloatFormatType = None, | ||
na_rep: str = "NaN", | ||
digits: int = None, | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
space: str | int = None, | ||
justify: str = "right", | ||
decimal: str = ".", | ||
leading_space: bool | None = True, | ||
quoting: int | None = None, | ||
) -> list[str]: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same comment: this seems like adding a lot of boilerplate that could be handled in the base class, no? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is slightly different than the Categorical case. Categorical wants to change the […] We could add some method to the interface to get the formatting class for an array. I don't really think that we want to publicly expose the ArrayFormatter interface, though. If it's purely about the lines of code here, we could have a "private" […] # in ExtensionArray._format_array
if hasattr(self, "_format_class"):
fmt_klass = self._format_class
else:
fmt_klass = GenericArrayFormatter I dunno. This is all kind of messy. |
||
from pandas.io.formats.format import ( | ||
Datetime64Formatter, | ||
Datetime64TZFormatter, | ||
) | ||
|
||
if is_datetime64tz_dtype(self.dtype): | ||
fmt_klass = Datetime64TZFormatter | ||
else: | ||
fmt_klass = Datetime64Formatter | ||
|
||
fmt_obj = fmt_klass( | ||
self, | ||
digits=digits, | ||
na_rep=na_rep, | ||
float_format=float_format, | ||
formatter=formatter, | ||
space=space, | ||
justify=justify, | ||
decimal=decimal, | ||
leading_space=leading_space, | ||
quoting=quoting, | ||
) | ||
|
||
return fmt_obj.get_result() | ||
|
||
# ----------------------------------------------------------------- | ||
# Comparison Methods | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1276,30 +1276,41 @@ def format_array( | |
List[str] | ||
""" | ||
fmt_klass: type[GenericArrayFormatter] | ||
if is_datetime64_dtype(values.dtype): | ||
|
||
if space is None: | ||
space = get_option("display.column_space") | ||
|
||
if float_format is None: | ||
float_format = get_option("display.float_format") | ||
|
||
if digits is None: | ||
digits = get_option("display.precision") | ||
|
||
if is_extension_array_dtype(values): | ||
return values._format_array( | ||
formatter, | ||
float_format, | ||
na_rep, | ||
digits, | ||
space, | ||
justify, | ||
decimal, | ||
leading_space, | ||
quoting, | ||
) | ||
elif is_datetime64_dtype(values.dtype): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I should check to see if these datetimelike branches are actually hit anymore. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, I expect that if you use […] |
||
fmt_klass = Datetime64Formatter | ||
elif is_datetime64tz_dtype(values.dtype): | ||
fmt_klass = Datetime64TZFormatter | ||
elif is_timedelta64_dtype(values.dtype): | ||
fmt_klass = Timedelta64Formatter | ||
elif is_extension_array_dtype(values.dtype): | ||
fmt_klass = ExtensionArrayFormatter | ||
elif is_float_dtype(values.dtype) or is_complex_dtype(values.dtype): | ||
fmt_klass = FloatArrayFormatter | ||
elif is_integer_dtype(values.dtype): | ||
fmt_klass = IntArrayFormatter | ||
else: | ||
fmt_klass = GenericArrayFormatter | ||
|
||
if space is None: | ||
space = get_option("display.column_space") | ||
|
||
if float_format is None: | ||
float_format = get_option("display.float_format") | ||
|
||
if digits is None: | ||
digits = get_option("display.precision") | ||
|
||
fmt_obj = fmt_klass( | ||
values, | ||
digits=digits, | ||
|
@@ -1633,37 +1644,6 @@ def _format_strings(self) -> list[str]: | |
return fmt_values.tolist() | ||
|
||
|
||
class ExtensionArrayFormatter(GenericArrayFormatter): | ||
def _format_strings(self) -> list[str]: | ||
values = extract_array(self.values, extract_numpy=True) | ||
|
||
formatter = self.formatter | ||
if formatter is None: | ||
# error: Item "ndarray" of "Union[Any, Union[ExtensionArray, ndarray]]" has | ||
# no attribute "_formatter" | ||
formatter = values._formatter(boxed=True) # type: ignore[union-attr] | ||
|
||
if isinstance(values, Categorical): | ||
# Categorical is special for now, so that we can preserve tzinfo | ||
array = values._internal_get_values() | ||
else: | ||
array = np.asarray(values) | ||
|
||
fmt_values = format_array( | ||
array, | ||
formatter, | ||
float_format=self.float_format, | ||
na_rep=self.na_rep, | ||
digits=self.digits, | ||
space=self.space, | ||
justify=self.justify, | ||
decimal=self.decimal, | ||
leading_space=self.leading_space, | ||
quoting=self.quoting, | ||
) | ||
return fmt_values | ||
|
||
|
||
def format_percentiles( | ||
percentiles: (np.ndarray | list[int | float] | list[float] | list[str | float]), | ||
) -> list[str]: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import pandas as pd | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
class MyDtype(pd.api.extensions.ExtensionDtype): | ||
name = "mydtype" | ||
type = list | ||
|
||
|
||
class MyEA(pd.api.extensions.ExtensionArray): | ||
def __init__(self, data): | ||
self.data = data | ||
self._dtype = MyDtype() | ||
|
||
@property | ||
def dtype(self): | ||
return self._dtype | ||
|
||
def __len__(self): | ||
return 1 | ||
|
||
def __array__(self, dtype=None): | ||
raise ValueError("Cannot be converted to an array!") | ||
|
||
def _format_array( | ||
self, | ||
formatter: None, | ||
float_format: None, | ||
na_rep="NaN", | ||
digits=None, | ||
space=None, | ||
justify="right", | ||
decimal=".", | ||
leading_space=True, | ||
quoting=None, | ||
): | ||
return ["<MyEA>([1])"] | ||
|
||
|
||
def test_no_conversion(): | ||
s = pd.Series(MyEA([1])) | ||
repr(s) # OK! | ||
|
||
df = pd.DataFrame({"A": MyEA([1])}, copy=False) | ||
repr(df) # OK! |
Uh oh!
There was an error while loading. Please reload this page.