-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
API: allow nan-likes in StringArray constructor #41412
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
3e1784d
96ff1da
418e1d2
25a6c4d
47d68f7
8257dbd
922436a
2f28086
3ee2198
9426a52
3ee55f2
fe4981a
a66948a
e852719
91b73bb
42ec3e4
41f49d2
62cc5be
29909f3
153b6b4
b27a839
ed5b953
caa5705
2d75031
52a00d1
8dc0b66
1bacaed
66be087
7b058cd
1be1bdf
3c57094
03738a9
12351de
889829a
358000f
c649b1d
5e5aa9c
eb7d8f2
2426319
20817a7
33d8f9a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -678,12 +678,14 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray: | |
cpdef ndarray[object] ensure_string_array( | ||
arr, | ||
object na_value=np.nan, | ||
bint convert_na_value=True, | ||
coerce="all", | ||
bint copy=True, | ||
bint skipna=True, | ||
): | ||
""" | ||
Returns a new numpy array with object dtype and only strings and na values. | ||
Checks that all elements in numpy array are string or null | ||
lithomas1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
and returns a new numpy array with object dtype | ||
and only strings and na values if so. Otherwise, raise a ValueError. | ||
lithomas1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
Parameters | ||
---------- | ||
|
@@ -693,6 +695,14 @@ cpdef ndarray[object] ensure_string_array( | |
The value to use for na. For example, np.nan or pd.NA. | ||
convert_na_value : bool, default True | ||
If False, existing na values will be used unchanged in the new array. | ||
coerce : {{'all', 'null', 'non-null', None}}, default 'all' | ||
lithomas1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Whether to coerce non-string elements to strings. | ||
- 'all' will convert null values and non-null non-string values. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what does 'all' do for null values? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. converts them to |
||
- 'null' will only convert nulls without converting other non-strings. | ||
- 'non-null' will only convert non-null non-string elements to string. | ||
- None will not convert anything. | ||
If coerce is not all, a ValueError will be raised for values | ||
that are not strings or na_value. | ||
copy : bool, default True | ||
Whether to ensure that a new array is returned. | ||
skipna : bool, default True | ||
|
@@ -724,9 +734,12 @@ cpdef ndarray[object] ensure_string_array( | |
continue | ||
|
||
if not checknull(val): | ||
result[i] = str(val) | ||
if coerce =="all" or coerce == "non-null": | ||
lithomas1 marked this conversation as resolved.
Show resolved
Hide resolved
lithomas1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
result[i] = str(val) | ||
else: | ||
raise ValueError("Non-string element encountered in array.") | ||
lithomas1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
else: | ||
if convert_na_value: | ||
if coerce=="all" or coerce == "null": | ||
val = na_value | ||
if skipna: | ||
result[i] = val | ||
|
@@ -1837,10 +1850,6 @@ cdef class StringValidator(Validator): | |
cdef inline bint is_array_typed(self) except -1: | ||
return issubclass(self.dtype.type, np.str_) | ||
|
||
cdef bint is_valid_null(self, object value) except -1: | ||
# We deliberately exclude None / NaN here since StringArray uses NA | ||
return value is C_NA | ||
|
||
|
||
cpdef bint is_string_array(ndarray values, bint skipna=False): | ||
cdef: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -144,11 +144,18 @@ class StringArray(PandasArray): | |
.. warning:: | ||
|
||
Currently, this expects an object-dtype ndarray | ||
where the elements are Python strings or :attr:`pandas.NA`. | ||
where the elements are Python strings | ||
or nan-likes(``None``, ``nan``, ``NaT``, ``NA``, Decimal("NaN")). | ||
lithomas1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
This may change without warning in the future. Use | ||
:meth:`pandas.array` with ``dtype="string"`` for a stable way of | ||
creating a `StringArray` from any sequence. | ||
|
||
.. versionchanged:: 1.3 | ||
lithomas1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
StringArray now accepts nan-likes in the constructor in addition | ||
to strings, whereas it only accepted strings and :attr:`pandas.NA` | ||
before. | ||
|
||
copy : bool, default False | ||
Whether to copy the array of data. | ||
|
||
|
@@ -208,21 +215,25 @@ def __init__(self, values, copy=False): | |
values = extract_array(values) | ||
|
||
super().__init__(values, copy=copy) | ||
if not isinstance(values, type(self)): | ||
self._validate() | ||
# error: Incompatible types in assignment (expression has type "StringDtype", | ||
# variable has type "PandasDtype") | ||
NDArrayBacked.__init__(self, self._ndarray, StringDtype()) | ||
if not isinstance(values, type(self)): | ||
self._validate() | ||
|
||
def _validate(self): | ||
"""Validate that we only store NA or strings.""" | ||
if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): | ||
raise ValueError("StringArray requires a sequence of strings or pandas.NA") | ||
if self._ndarray.dtype != "object": | ||
raise ValueError( | ||
"StringArray requires a sequence of strings or pandas.NA. Got " | ||
f"'{self._ndarray.dtype}' dtype instead." | ||
) | ||
try: | ||
lib.ensure_string_array( | ||
lithomas1 marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is this going through ensure_string_array instead of e.g. is_string_array? For the latter, the ravel would be unnecessary. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is_string_array will not convert the other nans(None, np.nan, etc.) to the correct na_value of pd.NA. FWIW, switching to ensure_string_array will also align us with _from_sequence. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does any of this become simpler if done in conjunction with the edits in #45057 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will try to split this one up. I'm thinking of maybe sticking to is_string_array and then doing another pass over the data to convert the not pd.NA nans, as a quick short term fix to unblock you. This will probably give a perf regression, but since is_string_array got sped up in #44495 on master(not 1.3.x), all I have to do is make the perf regression less than the perf improvement there, so that the regression is not user visible. |
||
self._ndarray, na_value=StringDtype.na_value, coerce="null", copy=False | ||
), | ||
except ValueError: | ||
raise ValueError("StringArray requires a sequence of strings or pandas.NA") | ||
|
||
@classmethod | ||
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): | ||
|
@@ -235,7 +246,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): | |
# avoid costly conversion to object dtype | ||
na_values = scalars._mask | ||
result = scalars._data | ||
result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) | ||
result = lib.ensure_string_array(result, copy=copy, coerce="non-null") | ||
result[na_values] = StringDtype.na_value | ||
|
||
else: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -289,14 +289,13 @@ def test_constructor_raises(cls): | |
with pytest.raises(ValueError, match=msg): | ||
cls(np.array([])) | ||
|
||
with pytest.raises(ValueError, match=msg): | ||
cls(np.array(["a", np.nan], dtype=object)) | ||
|
||
with pytest.raises(ValueError, match=msg): | ||
cls(np.array(["a", None], dtype=object)) | ||
|
||
with pytest.raises(ValueError, match=msg): | ||
cls(np.array(["a", pd.NaT], dtype=object)) | ||
@pytest.mark.parametrize("na", [np.nan, pd.NaT, None, pd.NA]) | ||
def test_constructor_nan_like(na): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you parameterize over list as well (assume same result). |
||
expected = pd.arrays.StringArray(np.array(["a", pd.NA])) | ||
tm.assert_extension_array_equal( | ||
pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected | ||
) | ||
|
||
|
||
@pytest.mark.parametrize("copy", [True, False]) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1388,11 +1388,12 @@ def test_is_string_array(self): | |
assert lib.is_string_array( | ||
np.array(["foo", "bar", pd.NA], dtype=object), skipna=True | ||
) | ||
# NaN is not valid for string array, just NA | ||
assert not lib.is_string_array( | ||
assert lib.is_string_array( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIUC from #41412 (comment), we now call ensure_string_array in the constructor whereas previously is_string_array was called. so why is lib.is_string_array changing. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. lib.is_string_array is used for inferring dtypes elsewhere I think, and we want ndarrays with strings and np.nan/None to be inferred as string in addition to ndarrays with only strings/pd.NA. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. right. so is will hopefully get a chance to look in more detail soon, but if you could summarize why it's changing that'll be great. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, it isn't called anymore in this PR, but it should be changed(allow np.nan/None) to be consistent with what we are accepting in the constructor now. The actual change is just a one-liner here, so should be pretty harmless. https://github.com/pandas-dev/pandas/pull/41412/files#diff-c61205b9d6ae5756840d1ed6915157fe9d99aa33b29a762f821e9fe38ab0277cR1900 |
||
np.array(["foo", "bar", np.nan], dtype=object), skipna=True | ||
) | ||
|
||
assert not lib.is_string_array( | ||
np.array(["foo", "bar", np.nan], dtype=object), skipna=False | ||
) | ||
assert not lib.is_string_array(np.array([1, 2])) | ||
|
||
def test_to_object_array_tuples(self): | ||
|
Uh oh!
There was an error while loading. Please reload this page.