pandas-dev · lithomas1 · May 10, 2021 · May 11, 2021 · May 11, 2021 · May 19, 2021
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -613,6 +613,7 @@ Other API changes
 - Partially initialized :class:`CategoricalDtype` (i.e. those with ``categories=None`` objects will no longer compare as equal to fully initialized dtype objects.
 - Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`)
 - Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as ``turbodbc`` (:issue:`36893`)
+- :class:`StringArray` now accepts nan-likes(``None``, ``nan``, ``NaT``, ``NA``, Decimal("NaN")) in its constructor in addition to strings.
 
 Build
 =====

diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
@@ -146,7 +146,7 @@ def maybe_convert_numeric(
 def ensure_string_array(
     arr,
     na_value: object = np.nan,
-    convert_na_value: bool = True,
+    coerce: str = "all,
     copy: bool = True,
     skipna: bool = True,
 ) -> np.ndarray: ...  # np.ndarray[object]

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -678,12 +678,14 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray:
 cpdef ndarray[object] ensure_string_array(
         arr,
         object na_value=np.nan,
-        bint convert_na_value=True,
+        coerce="all",
         bint copy=True,
         bint skipna=True,
 ):
     """
-    Returns a new numpy array with object dtype and only strings and na values.
+    Checks that all elements in numpy array are string or null
+    and returns a new numpy array with object dtype
+    and only strings and na values if so. Otherwise, raise a ValueError.
 
     Parameters
     ----------
@@ -693,6 +695,14 @@ cpdef ndarray[object] ensure_string_array(
         The value to use for na. For example, np.nan or pd.NA.
     convert_na_value : bool, default True
         If False, existing na values will be used unchanged in the new array.
+    coerce : {{'all', 'null', 'non-null', None}}, default 'all'
+        Whether to coerce non-string elements to strings.
+            - 'all' will convert null values and non-null non-string values.
+            - 'null' will only convert nulls without converting other non-strings.
+            - 'non-null' will only convert non-null non-string elements to string.
+            - None will not convert anything.
+        If coerce is not all, a ValueError will be raised for values
+        that are not strings or na_value.
     copy : bool, default True
         Whether to ensure that a new array is returned.
     skipna : bool, default True
@@ -724,9 +734,12 @@ cpdef ndarray[object] ensure_string_array(
             continue
 
         if not checknull(val):
-            result[i] = str(val)
+            if coerce =="all" or coerce == "non-null":
+                result[i] = str(val)
+            else:
+                raise ValueError("Non-string element encountered in array.")
         else:
-            if convert_na_value:
+            if coerce=="all" or coerce == "null":
                 val = na_value
             if skipna:
                 result[i] = val
@@ -1837,10 +1850,6 @@ cdef class StringValidator(Validator):
     cdef inline bint is_array_typed(self) except -1:
         return issubclass(self.dtype.type, np.str_)
 
-    cdef bint is_valid_null(self, object value) except -1:
-        # We deliberately exclude None / NaN here since StringArray uses NA
-        return value is C_NA
-
 
 cpdef bint is_string_array(ndarray values, bint skipna=False):
     cdef:

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -144,11 +144,18 @@ class StringArray(PandasArray):
         .. warning::
 
            Currently, this expects an object-dtype ndarray
-           where the elements are Python strings or :attr:`pandas.NA`.
+           where the elements are Python strings
+           or nan-likes(``None``, ``nan``, ``NaT``, ``NA``, Decimal("NaN")).
            This may change without warning in the future. Use
            :meth:`pandas.array` with ``dtype="string"`` for a stable way of
            creating a `StringArray` from any sequence.
 
+        .. versionchanged:: 1.3
+
+           StringArray now accepts nan-likes in the constructor in addition
+           to strings, whereas it only accepted strings and :attr:`pandas.NA`
+           before.
+
     copy : bool, default False
         Whether to copy the array of data.
 
@@ -208,21 +215,25 @@ def __init__(self, values, copy=False):
         values = extract_array(values)
 
         super().__init__(values, copy=copy)
+        if not isinstance(values, type(self)):
+            self._validate()
         # error: Incompatible types in assignment (expression has type "StringDtype",
         # variable has type "PandasDtype")
         NDArrayBacked.__init__(self, self._ndarray, StringDtype())
-        if not isinstance(values, type(self)):
-            self._validate()
 
     def _validate(self):
         """Validate that we only store NA or strings."""
-        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
-            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
         if self._ndarray.dtype != "object":
             raise ValueError(
                 "StringArray requires a sequence of strings or pandas.NA. Got "
                 f"'{self._ndarray.dtype}' dtype instead."
             )
+        try:
+            lib.ensure_string_array(
+                self._ndarray, na_value=StringDtype.na_value, coerce="null", copy=False
+            ),
+        except ValueError:
+            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
 
     @classmethod
     def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
@@ -235,7 +246,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
             # avoid costly conversion to object dtype
             na_values = scalars._mask
             result = scalars._data
-            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
+            result = lib.ensure_string_array(result, copy=copy, coerce="non-null")
             result[na_values] = StringDtype.na_value
 
         else:

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -247,7 +247,7 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
             # numerical issues with Float32Dtype
             na_values = scalars._mask
             result = scalars._data
-            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
+            result = lib.ensure_string_array(result, copy=copy, coerce="non-null")
             return cls(pa.array(result, mask=na_values, type=pa.string()))
 
         # convert non-na-likes to str

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -1125,7 +1125,7 @@ def astype_nansafe(
         return arr.astype(dtype, copy=copy)
 
     if issubclass(dtype.type, str):
-        return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False)
+        return lib.ensure_string_array(arr, skipna=skipna, coerce="non-null")
 
     elif is_datetime64_dtype(arr):
         if dtype == np.int64:
@@ -1934,7 +1934,7 @@ def construct_1d_ndarray_preserving_na(
     """
 
     if dtype is not None and dtype.kind == "U":
-        subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy)
+        subarr = lib.ensure_string_array(values, coerce="non-null", copy=copy)
     else:
         if dtype is not None:
             _disallow_mismatched_datetimelike(values, dtype)

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -289,14 +289,13 @@ def test_constructor_raises(cls):
     with pytest.raises(ValueError, match=msg):
         cls(np.array([]))
 
-    with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", np.nan], dtype=object))
-
-    with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", None], dtype=object))
 
-    with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", pd.NaT], dtype=object))
+@pytest.mark.parametrize("na", [np.nan, pd.NaT, None, pd.NA])
+def test_constructor_nan_like(na):
+    expected = pd.arrays.StringArray(np.array(["a", pd.NA]))
+    tm.assert_extension_array_equal(
+        pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected
+    )
 
 
 @pytest.mark.parametrize("copy", [True, False])

diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
@@ -1388,11 +1388,12 @@ def test_is_string_array(self):
         assert lib.is_string_array(
             np.array(["foo", "bar", pd.NA], dtype=object), skipna=True
         )
-        # NaN is not valid for string array, just NA
-        assert not lib.is_string_array(
+        assert lib.is_string_array(
             np.array(["foo", "bar", np.nan], dtype=object), skipna=True
         )
-
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", np.nan], dtype=object), skipna=False
+        )
         assert not lib.is_string_array(np.array([1, 2]))
 
     def test_to_object_array_tuples(self):