Skip to content

Commit 14cf864

Browse files
authored
REF: pass dtype to _from_sequence (#56436)
* REF: pass dtype to _from_sequence
* mypy fixup
* mypy fixup
* restore but better type safety
1 parent 2a02b00 commit 14cf864

33 files changed

+143
-84
lines changed

pandas/_libs/parsers.pyx

+3-1
Original file line number | Diff line number | Diff line change
@@ -1471,7 +1471,9 @@ def _maybe_upcast(
14711471

14721472
elif arr.dtype == np.object_:
14731473
if use_dtype_backend:
1474-
arr = StringDtype().construct_array_type()._from_sequence(arr)
1474+
dtype = StringDtype()
1475+
cls = dtype.construct_array_type()
1476+
arr = cls._from_sequence(arr, dtype=dtype)
14751477

14761478
if use_dtype_backend and dtype_backend == "pyarrow":
14771479
import pyarrow as pa

pandas/core/arrays/arrow/array.py

+5
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@
4040
is_integer,
4141
is_list_like,
4242
is_scalar,
43+
pandas_dtype,
4344
)
4445
from pandas.core.dtypes.dtypes import DatetimeTZDtype
4546
from pandas.core.dtypes.missing import isna
@@ -273,6 +274,10 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
273274
"""
274275
Construct a new ExtensionArray from a sequence of scalars.
275276
"""
277+
if dtype is not None and isinstance(dtype, str):
278+
# FIXME: in tests.extension.test_arrow we pass pyarrow _type_ objects
279+
# which raise when passed to pandas_dtype
280+
dtype = pandas_dtype(dtype)
276281
pa_type = to_pyarrow_type(dtype)
277282
pa_array = cls._box_pa_array(scalars, pa_type=pa_type, copy=copy)
278283
arr = cls(pa_array)

pandas/core/arrays/numeric.py

+6-4
Original file line number | Diff line number | Diff line change
@@ -132,9 +132,12 @@ def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarr
132132
raise AbstractMethodError(cls)
133133

134134

135-
def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype):
135+
def _coerce_to_data_and_mask(
136+
values, dtype, copy: bool, dtype_cls: type[NumericDtype], default_dtype: np.dtype
137+
):
136138
checker = dtype_cls._checker
137139

140+
mask = None
138141
inferred_type = None
139142

140143
if dtype is None and hasattr(values, "dtype"):
@@ -190,7 +193,7 @@ def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype
190193
if dtype is None:
191194
dtype = default_dtype
192195
else:
193-
dtype = dtype.type
196+
dtype = dtype.numpy_dtype
194197

195198
if is_integer_dtype(dtype) and values.dtype.kind == "f" and len(values) > 0:
196199
if mask.all():
@@ -260,9 +263,8 @@ def _coerce_to_array(
260263
) -> tuple[np.ndarray, np.ndarray]:
261264
dtype_cls = cls._dtype_cls
262265
default_dtype = dtype_cls._default_np_dtype
263-
mask = None
264266
values, mask, _, _ = _coerce_to_data_and_mask(
265-
value, mask, dtype, copy, dtype_cls, default_dtype
267+
value, dtype, copy, dtype_cls, default_dtype
266268
)
267269
return values, mask
268270

pandas/core/arrays/period.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -1090,7 +1090,9 @@ def period_array(
10901090
return PeriodArray(ordinals, dtype=dtype)
10911091

10921092
data = ensure_object(arrdata)
1093-
1093+
if freq is None:
1094+
freq = libperiod.extract_freq(data)
1095+
dtype = PeriodDtype(freq)
10941096
return PeriodArray._from_sequence(data, dtype=dtype)
10951097

10961098

pandas/core/construction.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -349,7 +349,9 @@ def array(
349349

350350
elif inferred_dtype == "string":
351351
# StringArray/ArrowStringArray depending on pd.options.mode.string_storage
352-
return StringDtype().construct_array_type()._from_sequence(data, copy=copy)
352+
dtype = StringDtype()
353+
cls = dtype.construct_array_type()
354+
return cls._from_sequence(data, dtype=dtype, copy=copy)
353355

354356
elif inferred_dtype == "integer":
355357
return IntegerArray._from_sequence(data, copy=copy)
@@ -364,7 +366,7 @@ def array(
364366
return FloatingArray._from_sequence(data, copy=copy)
365367

366368
elif inferred_dtype == "boolean":
367-
return BooleanArray._from_sequence(data, copy=copy)
369+
return BooleanArray._from_sequence(data, dtype="boolean", copy=copy)
368370

369371
# Pandas overrides NumPy for
370372
# 1. datetime64[ns,us,ms,s]

pandas/core/groupby/groupby.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -2330,7 +2330,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
23302330
elif isinstance(bvalues, ArrowExtensionArray) and not isinstance(
23312331
bvalues.dtype, StringDtype
23322332
):
2333-
return type(bvalues)._from_sequence(counted[0])
2333+
return type(bvalues)._from_sequence(counted[0], dtype="int64[pyarrow]")
23342334
if is_series:
23352335
assert counted.ndim == 2
23362336
assert counted.shape[0] == 1

pandas/core/indexes/base.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -5194,12 +5194,12 @@ def _get_join_target(self) -> np.ndarray:
51945194
def _from_join_target(self, result: np.ndarray) -> ArrayLike:
51955195
"""
51965196
Cast the ndarray returned from one of the libjoin.foo_indexer functions
5197-
back to type(self)._data.
5197+
back to type(self._data).
51985198
"""
51995199
if isinstance(self.values, BaseMaskedArray):
52005200
return type(self.values)(result, np.zeros(result.shape, dtype=np.bool_))
52015201
elif isinstance(self.values, (ArrowExtensionArray, StringArray)):
5202-
return type(self.values)._from_sequence(result)
5202+
return type(self.values)._from_sequence(result, dtype=self.dtype)
52035203
return result
52045204

52055205
@doc(IndexOpsMixin._memory_usage)

pandas/core/internals/construction.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -1044,7 +1044,9 @@ def convert(arr):
10441044
# i.e. maybe_convert_objects didn't convert
10451045
arr = maybe_infer_to_datetimelike(arr)
10461046
if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
1047-
arr = StringDtype().construct_array_type()._from_sequence(arr)
1047+
new_dtype = StringDtype()
1048+
arr_cls = new_dtype.construct_array_type()
1049+
arr = arr_cls._from_sequence(arr, dtype=new_dtype)
10481050
elif dtype_backend != "numpy" and isinstance(arr, np.ndarray):
10491051
if arr.dtype.kind in "iufb":
10501052
arr = pd_array(arr, copy=False)

pandas/core/strings/object_array.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -207,7 +207,7 @@ def rep(x, r):
207207
)
208208
if isinstance(self, BaseStringArray):
209209
# Not going through map, so we have to do this here.
210-
result = type(self)._from_sequence(result)
210+
result = type(self)._from_sequence(result, dtype=self.dtype)
211211
return result
212212

213213
def _str_match(

pandas/io/parsers/base_parser.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -757,7 +757,9 @@ def _infer_types(
757757
elif result.dtype == np.object_ and non_default_dtype_backend:
758758
# read_excel sends array of datetime objects
759759
if not lib.is_datetime_array(result, skipna=True):
760-
result = StringDtype().construct_array_type()._from_sequence(values)
760+
dtype = StringDtype()
761+
cls = dtype.construct_array_type()
762+
result = cls._from_sequence(values, dtype=dtype)
761763

762764
if dtype_backend == "pyarrow":
763765
pa = import_optional_dependency("pyarrow")

pandas/tests/arithmetic/test_period.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1282,7 +1282,7 @@ def test_parr_add_sub_td64_nat(self, box_with_array, transpose):
12821282
"other",
12831283
[
12841284
np.array(["NaT"] * 9, dtype="m8[ns]"),
1285-
TimedeltaArray._from_sequence(["NaT"] * 9),
1285+
TimedeltaArray._from_sequence(["NaT"] * 9, dtype="m8[ns]"),
12861286
],
12871287
)
12881288
def test_parr_add_sub_tdt64_nat_array(self, box_with_array, other):

pandas/tests/arrays/categorical/test_constructors.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -745,7 +745,9 @@ def test_interval(self):
745745

746746
def test_categorical_extension_array_nullable(self, nulls_fixture):
747747
# GH:
748-
arr = pd.arrays.StringArray._from_sequence([nulls_fixture] * 2)
748+
arr = pd.arrays.StringArray._from_sequence(
749+
[nulls_fixture] * 2, dtype=pd.StringDtype()
750+
)
749751
result = Categorical(arr)
750752
assert arr.dtype == result.categories.dtype
751753
expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype))

pandas/tests/arrays/datetimes/test_constructors.py

+7-5
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ class TestDatetimeArrayConstructor:
1414
def test_from_sequence_invalid_type(self):
1515
mi = pd.MultiIndex.from_product([np.arange(5), np.arange(5)])
1616
with pytest.raises(TypeError, match="Cannot create a DatetimeArray"):
17-
DatetimeArray._from_sequence(mi)
17+
DatetimeArray._from_sequence(mi, dtype="M8[ns]")
1818

1919
def test_only_1dim_accepted(self):
2020
arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]")
@@ -66,7 +66,7 @@ def test_mixing_naive_tzaware_raises(self, meth):
6666
def test_from_pandas_array(self):
6767
arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10**9
6868

69-
result = DatetimeArray._from_sequence(arr)._with_freq("infer")
69+
result = DatetimeArray._from_sequence(arr, dtype="M8[ns]")._with_freq("infer")
7070

7171
expected = pd.date_range("1970-01-01", periods=5, freq="h")._data
7272
tm.assert_datetime_array_equal(result, expected)
@@ -100,7 +100,7 @@ def test_bool_dtype_raises(self):
100100

101101
msg = r"dtype bool cannot be converted to datetime64\[ns\]"
102102
with pytest.raises(TypeError, match=msg):
103-
DatetimeArray._from_sequence(arr)
103+
DatetimeArray._from_sequence(arr, dtype="M8[ns]")
104104

105105
with pytest.raises(TypeError, match=msg):
106106
pd.DatetimeIndex(arr)
@@ -171,8 +171,10 @@ def test_2d(self, order):
171171
if order == "F":
172172
arr = arr.T
173173

174-
res = DatetimeArray._from_sequence(arr)
175-
expected = DatetimeArray._from_sequence(arr.ravel()).reshape(arr.shape)
174+
res = DatetimeArray._from_sequence(arr, dtype=dti.dtype)
175+
expected = DatetimeArray._from_sequence(arr.ravel(), dtype=dti.dtype).reshape(
176+
arr.shape
177+
)
176178
tm.assert_datetime_array_equal(res, expected)
177179

178180

pandas/tests/arrays/datetimes/test_cumulative.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -12,10 +12,11 @@ def test_accumulators_freq(self):
1212
"2000-01-01",
1313
"2000-01-02",
1414
"2000-01-03",
15-
]
15+
],
16+
dtype="M8[ns]",
1617
)._with_freq("infer")
1718
result = arr._accumulate("cummin")
18-
expected = DatetimeArray._from_sequence(["2000-01-01"] * 3)
19+
expected = DatetimeArray._from_sequence(["2000-01-01"] * 3, dtype="M8[ns]")
1920
tm.assert_datetime_array_equal(result, expected)
2021

2122
result = arr._accumulate("cummax")
@@ -36,6 +37,7 @@ def test_accumulators_disallowed(self, func):
3637
"2000-01-01",
3738
"2000-01-02",
3839
],
40+
dtype="M8[ns]",
3941
)._with_freq("infer")
4042
with pytest.raises(TypeError, match=f"Accumulation {func}"):
4143
arr._accumulate(func)

pandas/tests/arrays/datetimes/test_reductions.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -124,7 +124,7 @@ def test_median_2d(self, arr1d):
124124

125125
# axis = 1
126126
result = arr.median(axis=1)
127-
expected = type(arr)._from_sequence([arr1d.median()])
127+
expected = type(arr)._from_sequence([arr1d.median()], dtype=arr.dtype)
128128
tm.assert_equal(result, expected)
129129

130130
result = arr.median(axis=1, skipna=False)

pandas/tests/arrays/string_/test_string.py

+7-7
Original file line number | Diff line number | Diff line change
@@ -64,14 +64,14 @@ def test_repr(dtype):
6464
assert repr(df.A.array) == expected
6565

6666

67-
def test_none_to_nan(cls):
68-
a = cls._from_sequence(["a", None, "b"])
67+
def test_none_to_nan(cls, dtype):
68+
a = cls._from_sequence(["a", None, "b"], dtype=dtype)
6969
assert a[1] is not None
7070
assert a[1] is na_val(a.dtype)
7171

7272

73-
def test_setitem_validates(cls):
74-
arr = cls._from_sequence(["a", "b"])
73+
def test_setitem_validates(cls, dtype):
74+
arr = cls._from_sequence(["a", "b"], dtype=dtype)
7575

7676
if cls is pd.arrays.StringArray:
7777
msg = "Cannot set non-string value '10' into a StringArray."
@@ -361,12 +361,12 @@ def test_constructor_nan_like(na):
361361

362362

363363
@pytest.mark.parametrize("copy", [True, False])
364-
def test_from_sequence_no_mutate(copy, cls, request):
364+
def test_from_sequence_no_mutate(copy, cls, dtype):
365365
nan_arr = np.array(["a", np.nan], dtype=object)
366366
expected_input = nan_arr.copy()
367367
na_arr = np.array(["a", pd.NA], dtype=object)
368368

369-
result = cls._from_sequence(nan_arr, copy=copy)
369+
result = cls._from_sequence(nan_arr, dtype=dtype, copy=copy)
370370

371371
if cls in (ArrowStringArray, ArrowStringArrayNumpySemantics):
372372
import pyarrow as pa
@@ -436,7 +436,7 @@ def test_reduce_missing(skipna, dtype):
436436

437437
@pytest.mark.parametrize("method", ["min", "max"])
438438
@pytest.mark.parametrize("skipna", [True, False])
439-
def test_min_max(method, skipna, dtype, request):
439+
def test_min_max(method, skipna, dtype):
440440
arr = pd.Series(["a", "b", "c", None], dtype=dtype)
441441
result = getattr(arr, method)(skipna=skipna)
442442
if skipna:

pandas/tests/arrays/string_/test_string_arrow.py

+2-3
Original file line number | Diff line number | Diff line change
@@ -34,9 +34,8 @@ def test_config(string_storage, request, using_infer_string):
3434
result = pd.array(["a", "b"])
3535
assert result.dtype.storage == string_storage
3636

37-
expected = (
38-
StringDtype(string_storage).construct_array_type()._from_sequence(["a", "b"])
39-
)
37+
dtype = StringDtype(string_storage)
38+
expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype)
4039
tm.assert_equal(result, expected)
4140

4241

0 commit comments

Comments
 (0)