From a9ee37f0a5b1ad5655771b511a4c7aed95f710f5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 23 Jul 2019 17:45:34 -0700 Subject: [PATCH 1/9] move array, extract_array to pd.core.construction --- pandas/core/api.py | 58 +++--- pandas/core/arrays/__init__.py | 1 - pandas/core/arrays/array_.py | 280 -------------------------- pandas/core/arrays/categorical.py | 3 +- pandas/core/arrays/numpy_.py | 3 +- pandas/core/internals/arrays.py | 55 ----- pandas/core/internals/blocks.py | 2 +- pandas/core/internals/construction.py | 2 +- pandas/core/reshape/reshape.py | 2 +- pandas/core/series.py | 3 +- pandas/core/sorting.py | 3 +- 11 files changed, 36 insertions(+), 376 deletions(-) delete mode 100644 pandas/core/arrays/array_.py delete mode 100644 pandas/core/internals/arrays.py diff --git a/pandas/core/api.py b/pandas/core/api.py index f3ea0976a2869..5d24fc356f7a1 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -2,6 +2,17 @@ import numpy as np +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) +from pandas.core.dtypes.missing import isna, isnull, notna, notnull + +# TODO: Remove import when statsmodels updates #18264 +from pandas.core.algorithms import factorize, unique, value_counts +from pandas.core.arrays import Categorical from pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, @@ -12,44 +23,33 @@ UInt32Dtype, UInt64Dtype, ) -from pandas.core.algorithms import factorize, unique, value_counts -from pandas.core.dtypes.missing import isna, isnull, notna, notnull -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - PeriodDtype, - IntervalDtype, - DatetimeTZDtype, -) -from pandas.core.arrays import Categorical, array +from pandas.core.construction import array +from pandas.core.frame import DataFrame from pandas.core.groupby import Grouper, NamedAgg -from pandas.io.formats.format import set_eng_float_format from pandas.core.index import ( - Index, CategoricalIndex, - Int64Index, - UInt64Index, - RangeIndex, + DatetimeIndex, Float64Index, - MultiIndex, + Index, + Int64Index, IntervalIndex, - TimedeltaIndex, - DatetimeIndex, - PeriodIndex, + MultiIndex, NaT, + PeriodIndex, + RangeIndex, + TimedeltaIndex, + UInt64Index, ) +from pandas.core.indexes.datetimes import Timestamp, bdate_range, date_range +from pandas.core.indexes.interval import Interval, interval_range from pandas.core.indexes.period import Period, period_range from pandas.core.indexes.timedeltas import Timedelta, timedelta_range -from pandas.core.indexes.datetimes import Timestamp, date_range, bdate_range -from pandas.core.indexes.interval import Interval, interval_range - -from pandas.core.series import Series -from pandas.core.frame import DataFrame - -# TODO: Remove import when statsmodels updates #18264 -from pandas.core.reshape.reshape import get_dummies - from pandas.core.indexing import IndexSlice -from pandas.core.tools.numeric import to_numeric -from pandas.tseries.offsets import DateOffset +from pandas.core.reshape.reshape import get_dummies +from pandas.core.series import Series from pandas.core.tools.datetimes import to_datetime +from pandas.core.tools.numeric import to_numeric from pandas.core.tools.timedeltas import to_timedelta + +from pandas.io.formats.format import set_eng_float_format +from pandas.tseries.offsets import DateOffset diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index dab29e9ce71d3..5c83ed8cf5e24 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ 
-1,4 +1,3 @@ -from .array_ import array # noqa: F401 from .base import ( # noqa: F401 ExtensionArray, ExtensionOpsMixin, diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py deleted file mode 100644 index 314144db57712..0000000000000 --- a/pandas/core/arrays/array_.py +++ /dev/null @@ -1,280 +0,0 @@ -from typing import Optional, Sequence, Union, cast - -import numpy as np - -from pandas._libs import lib, tslibs - -from pandas.core.dtypes.common import ( - is_datetime64_ns_dtype, - is_extension_array_dtype, - is_timedelta64_ns_dtype, -) -from pandas.core.dtypes.dtypes import ExtensionDtype, registry -from pandas.core.dtypes.generic import ABCExtensionArray - - -def array( - data: Sequence[object], - dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None, - copy: bool = True, -) -> ABCExtensionArray: - """ - Create an array. - - .. versionadded:: 0.24.0 - - Parameters - ---------- - data : Sequence of objects - The scalars inside `data` should be instances of the - scalar type for `dtype`. It's expected that `data` - represents a 1-dimensional array of data. - - When `data` is an Index or Series, the underlying array - will be extracted from `data`. - - dtype : str, np.dtype, or ExtensionDtype, optional - The dtype to use for the array. This may be a NumPy - dtype or an extension type registered with pandas using - :meth:`pandas.api.extensions.register_extension_dtype`. - - If not specified, there are two possibilities: - - 1. When `data` is a :class:`Series`, :class:`Index`, or - :class:`ExtensionArray`, the `dtype` will be taken - from the data. - 2. Otherwise, pandas will attempt to infer the `dtype` - from the data. - - Note that when `data` is a NumPy array, ``data.dtype`` is - *not* used for inferring the array type. This is because - NumPy cannot represent all the types of data that can be - held in extension arrays. - - Currently, pandas will infer an extension dtype for sequences of - - ============================== ===================================== - Scalar Type Array Type - ============================== ===================================== - :class:`pandas.Interval` :class:`pandas.arrays.IntervalArray` - :class:`pandas.Period` :class:`pandas.arrays.PeriodArray` - :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` - :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` - ============================== ===================================== - - For all other cases, NumPy's usual inference rules will be used. - - copy : bool, default True - Whether to copy the data, even if not necessary. Depending - on the type of `data`, creating the new array may require - copying data, even if ``copy=False``. - - Returns - ------- - ExtensionArray - The newly created array. - - Raises - ------ - ValueError - When `data` is not 1-dimensional. - - See Also - -------- - numpy.array : Construct a NumPy array. - Series : Construct a pandas Series. - Index : Construct a pandas Index. - arrays.PandasArray : ExtensionArray wrapping a NumPy array. - Series.array : Extract the array stored within a Series. - - Notes - ----- - Omitting the `dtype` argument means pandas will attempt to infer the - best array type from the values in the data. As new array types are - added by pandas and 3rd party libraries, the "best" array type may - change. We recommend specifying `dtype` to ensure that - - 1. the correct array type for the data is returned - 2. 
the returned array type doesn't change as new extension types - are added by pandas and third-party libraries - - Additionally, if the underlying memory representation of the returned - array matters, we recommend specifying the `dtype` as a concrete object - rather than a string alias or allowing it to be inferred. For example, - a future version of pandas or a 3rd-party library may include a - dedicated ExtensionArray for string data. In this event, the following - would no longer return a :class:`arrays.PandasArray` backed by a NumPy - array. - - >>> pd.array(['a', 'b'], dtype=str) - - ['a', 'b'] - Length: 2, dtype: str32 - - This would instead return the new ExtensionArray dedicated for string - data. If you really need the new array to be backed by a NumPy array, - specify that in the dtype. - - >>> pd.array(['a', 'b'], dtype=np.dtype(" - ['a', 'b'] - Length: 2, dtype: str32 - - Or use the dedicated constructor for the array you're expecting, and - wrap that in a PandasArray - - >>> pd.array(np.array(['a', 'b'], dtype=' - ['a', 'b'] - Length: 2, dtype: str32 - - Finally, Pandas has arrays that mostly overlap with NumPy - - * :class:`arrays.DatetimeArray` - * :class:`arrays.TimedeltaArray` - - When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is - passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray`` - rather than a ``PandasArray``. This is for symmetry with the case of - timezone-aware data, which NumPy does not natively support. - - >>> pd.array(['2015', '2016'], dtype='datetime64[ns]') - - ['2015-01-01 00:00:00', '2016-01-01 00:00:00'] - Length: 2, dtype: datetime64[ns] - - >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]') - - ['01:00:00', '02:00:00'] - Length: 2, dtype: timedelta64[ns] - - Examples - -------- - If a dtype is not specified, `data` is passed through to - :meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned. - - >>> pd.array([1, 2]) - - [1, 2] - Length: 2, dtype: int64 - - Or the NumPy dtype can be specified - - >>> pd.array([1, 2], dtype=np.dtype("int32")) - - [1, 2] - Length: 2, dtype: int32 - - You can use the string alias for `dtype` - - >>> pd.array(['a', 'b', 'a'], dtype='category') - [a, b, a] - Categories (2, object): [a, b] - - Or specify the actual dtype - - >>> pd.array(['a', 'b', 'a'], - ... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True)) - [a, b, a] - Categories (3, object): [a < b < c] - - Because omitting the `dtype` passes the data through to NumPy, - a mixture of valid integers and NA will return a floating-point - NumPy array. - - >>> pd.array([1, 2, np.nan]) - - [1.0, 2.0, nan] - Length: 3, dtype: float64 - - To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify - the dtype: - - >>> pd.array([1, 2, np.nan], dtype='Int64') - - [1, 2, NaN] - Length: 3, dtype: Int64 - - Pandas will infer an ExtensionArray for some types of data: - - >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) - - ['2000-01-01', '2000-01-01'] - Length: 2, dtype: period[D] - - `data` must be 1-dimensional. A ValueError is raised when the input - has the wrong dimensionality. - - >>> pd.array(1) - Traceback (most recent call last): - ... - ValueError: Cannot pass scalar '1' to 'pandas.array'. - """ - from pandas.core.arrays import ( - period_array, - IntervalArray, - PandasArray, - DatetimeArray, - TimedeltaArray, - ) - from pandas.core.internals.arrays import extract_array - - if lib.is_scalar(data): - msg = "Cannot pass scalar '{}' to 'pandas.array'." 
- raise ValueError(msg.format(data)) - - data = extract_array(data, extract_numpy=True) - - if dtype is None and isinstance(data, ABCExtensionArray): - dtype = data.dtype - - # this returns None for not-found dtypes. - if isinstance(dtype, str): - dtype = registry.find(dtype) or dtype - - if is_extension_array_dtype(dtype): - cls = cast(ExtensionDtype, dtype).construct_array_type() - return cls._from_sequence(data, dtype=dtype, copy=copy) - - if dtype is None: - inferred_dtype = lib.infer_dtype(data, skipna=False) - if inferred_dtype == "period": - try: - return period_array(data, copy=copy) - except tslibs.IncompatibleFrequency: - # We may have a mixture of frequencies. - # We choose to return an ndarray, rather than raising. - pass - elif inferred_dtype == "interval": - try: - return IntervalArray(data, copy=copy) - except ValueError: - # We may have a mixture of `closed` here. - # We choose to return an ndarray, rather than raising. - pass - - elif inferred_dtype.startswith("datetime"): - # datetime, datetime64 - try: - return DatetimeArray._from_sequence(data, copy=copy) - except ValueError: - # Mixture of timezones, fall back to PandasArray - pass - - elif inferred_dtype.startswith("timedelta"): - # timedelta, timedelta64 - return TimedeltaArray._from_sequence(data, copy=copy) - - # TODO(BooleanArray): handle this type - - # Pandas overrides NumPy for - # 1. datetime64[ns] - # 2. timedelta64[ns] - # so that a DatetimeArray is returned. - if is_datetime64_ns_dtype(dtype): - return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy) - elif is_timedelta64_ns_dtype(dtype): - return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy) - - result = PandasArray._from_sequence(data, dtype=dtype, copy=copy) - return result diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6200cd14663f8..37d13883e29ec 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -60,6 +60,7 @@ ) from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.core.missing import interpolate_2d from pandas.core.sorting import nargsort @@ -2162,8 +2163,6 @@ def __setitem__(self, key, value): If (one or more) Value is not in categories or if a assigned `Categorical` does not have the same categories """ - from pandas.core.internals.arrays import extract_array - value = extract_array(value, extract_numpy=True) # require identical categories set diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 77c9a3bc98690..1c35298fcc6b8 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -16,6 +16,7 @@ from pandas import compat from pandas.core import nanops from pandas.core.algorithms import searchsorted, take, unique +from pandas.core.construction import extract_array from pandas.core.missing import backfill_1d, pad_1d from .base import ExtensionArray, ExtensionOpsMixin @@ -222,8 +223,6 @@ def __getitem__(self, item): return result def __setitem__(self, key, value): - from pandas.core.internals.arrays import extract_array - value = extract_array(value, extract_numpy=True) if not lib.is_scalar(key) and is_list_like(key): diff --git a/pandas/core/internals/arrays.py b/pandas/core/internals/arrays.py deleted file mode 100644 index 18af328bfa77f..0000000000000 --- a/pandas/core/internals/arrays.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Methods for cleaning, validating, and unboxing arrays. 
-""" -from pandas.core.dtypes.generic import ABCIndexClass, ABCPandasArray, ABCSeries - - -def extract_array(obj, extract_numpy=False): - """ - Extract the ndarray or ExtensionArray from a Series or Index. - - For all other types, `obj` is just returned as is. - - Parameters - ---------- - obj : object - For Series / Index, the underlying ExtensionArray is unboxed. - For Numpy-backed ExtensionArrays, the ndarray is extracted. - - extract_numpy : bool, default False - Whether to extract the ndarray from a PandasArray - - Returns - ------- - arr : object - - Examples - -------- - >>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category')) - [a, b, c] - Categories (3, object): [a, b, c] - - Other objects like lists, arrays, and DataFrames are just passed through. - - >>> extract_array([1, 2, 3]) - [1, 2, 3] - - For an ndarray-backed Series / Index a PandasArray is returned. - - >>> extract_array(pd.Series([1, 2, 3])) - - [1, 2, 3] - Length: 3, dtype: int64 - - To extract all the way down to the ndarray, pass ``extract_numpy=True``. - - >>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True) - array([1, 2, 3]) - """ - if isinstance(obj, (ABCIndexClass, ABCSeries)): - obj = obj.array - - if extract_numpy and isinstance(obj, ABCPandasArray): - obj = obj.to_numpy() - - return obj diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 549920e230e8a..aa22896783b86 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -77,12 +77,12 @@ ) from pandas.core.base import PandasObject import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.core.indexers import ( check_setitem_lengths, is_empty_indexer, is_scalar_indexer, ) -from pandas.core.internals.arrays import extract_array import pandas.core.missing as missing from pandas.core.nanops import nanpercentile diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index c437f686bd17b..326daf3fc8a86 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -49,6 +49,7 @@ from pandas.core import algorithms, common as com from pandas.core.arrays import Categorical, ExtensionArray, period_array +from pandas.core.construction import extract_array from pandas.core.index import ( Index, _get_objs_combined_axis, @@ -60,7 +61,6 @@ create_block_manager_from_arrays, create_block_manager_from_blocks, ) -from pandas.core.internals.arrays import extract_array # --------------------------------------------------------------------- # BlockManager Interface diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 8d5b521ef7799..1f519d4c0867d 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -22,9 +22,9 @@ import pandas.core.algorithms as algos from pandas.core.arrays import SparseArray from pandas.core.arrays.categorical import _factorize_from_iterable +from pandas.core.construction import extract_array from pandas.core.frame import DataFrame from pandas.core.index import Index, MultiIndex -from pandas.core.internals.arrays import extract_array from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, diff --git a/pandas/core/series.py b/pandas/core/series.py index 98a7b279f70b3..f31f87b7704aa 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -61,6 +61,7 @@ from pandas.core.arrays.categorical import Categorical, CategoricalAccessor from pandas.core.arrays.sparse import 
SparseAccessor import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.core.index import ( Float64Index, Index, @@ -801,8 +802,6 @@ def __array_ufunc__( self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any ): # TODO: handle DataFrame - from pandas.core.internals.construction import extract_array - cls = type(self) # for binary ops, use our custom dunder methods diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 5f3ed87424d0e..454156fd97344 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -15,6 +15,7 @@ from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algorithms +from pandas.core.construction import extract_array _INT64_MAX = np.iinfo(np.int64).max @@ -240,8 +241,6 @@ def nargsort(items, kind="quicksort", ascending=True, na_position="last"): handles NaNs. It adds ascending and na_position parameters. GH #6399, #5231 """ - from pandas.core.internals.arrays import extract_array - items = extract_array(items) mask = np.asarray(isna(items)) From efe41856f439eda71bd3300385bcd542208b601a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 23 Jul 2019 17:46:56 -0700 Subject: [PATCH 2/9] put construction.py in the correct branch --- pandas/core/construction.py | 340 ++++++++++++++++++++++++++++++++++++ 1 file changed, 340 insertions(+) create mode 100644 pandas/core/construction.py diff --git a/pandas/core/construction.py b/pandas/core/construction.py new file mode 100644 index 0000000000000..1dab41b76f0e5 --- /dev/null +++ b/pandas/core/construction.py @@ -0,0 +1,340 @@ +""" +Constructor functions intended to be shared by pd.array, Series.__init__, +and Index.__new__. + +These should not depend on core.internals. +""" + + +from typing import Optional, Sequence, Union, cast + +import numpy as np + +from pandas._libs import lib, tslibs + +from pandas.core.dtypes.common import ( + is_datetime64_ns_dtype, + is_extension_array_dtype, + is_timedelta64_ns_dtype, +) +from pandas.core.dtypes.dtypes import ExtensionDtype, registry +from pandas.core.dtypes.generic import ( + ABCExtensionArray, ABCIndexClass, ABCPandasArray, ABCSeries +) + + +def array( + data: Sequence[object], + dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None, + copy: bool = True, +) -> ABCExtensionArray: + """ + Create an array. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + data : Sequence of objects + The scalars inside `data` should be instances of the + scalar type for `dtype`. It's expected that `data` + represents a 1-dimensional array of data. + + When `data` is an Index or Series, the underlying array + will be extracted from `data`. + + dtype : str, np.dtype, or ExtensionDtype, optional + The dtype to use for the array. This may be a NumPy + dtype or an extension type registered with pandas using + :meth:`pandas.api.extensions.register_extension_dtype`. + + If not specified, there are two possibilities: + + 1. When `data` is a :class:`Series`, :class:`Index`, or + :class:`ExtensionArray`, the `dtype` will be taken + from the data. + 2. Otherwise, pandas will attempt to infer the `dtype` + from the data. + + Note that when `data` is a NumPy array, ``data.dtype`` is + *not* used for inferring the array type. This is because + NumPy cannot represent all the types of data that can be + held in extension arrays. 
+ + Currently, pandas will infer an extension dtype for sequences of + + ============================== ===================================== + Scalar Type Array Type + ============================== ===================================== + :class:`pandas.Interval` :class:`pandas.arrays.IntervalArray` + :class:`pandas.Period` :class:`pandas.arrays.PeriodArray` + :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` + :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` + ============================== ===================================== + + For all other cases, NumPy's usual inference rules will be used. + + copy : bool, default True + Whether to copy the data, even if not necessary. Depending + on the type of `data`, creating the new array may require + copying data, even if ``copy=False``. + + Returns + ------- + ExtensionArray + The newly created array. + + Raises + ------ + ValueError + When `data` is not 1-dimensional. + + See Also + -------- + numpy.array : Construct a NumPy array. + Series : Construct a pandas Series. + Index : Construct a pandas Index. + arrays.PandasArray : ExtensionArray wrapping a NumPy array. + Series.array : Extract the array stored within a Series. + + Notes + ----- + Omitting the `dtype` argument means pandas will attempt to infer the + best array type from the values in the data. As new array types are + added by pandas and 3rd party libraries, the "best" array type may + change. We recommend specifying `dtype` to ensure that + + 1. the correct array type for the data is returned + 2. the returned array type doesn't change as new extension types + are added by pandas and third-party libraries + + Additionally, if the underlying memory representation of the returned + array matters, we recommend specifying the `dtype` as a concrete object + rather than a string alias or allowing it to be inferred. For example, + a future version of pandas or a 3rd-party library may include a + dedicated ExtensionArray for string data. In this event, the following + would no longer return a :class:`arrays.PandasArray` backed by a NumPy + array. + + >>> pd.array(['a', 'b'], dtype=str) + + ['a', 'b'] + Length: 2, dtype: str32 + + This would instead return the new ExtensionArray dedicated for string + data. If you really need the new array to be backed by a NumPy array, + specify that in the dtype. + + >>> pd.array(['a', 'b'], dtype=np.dtype(" + ['a', 'b'] + Length: 2, dtype: str32 + + Or use the dedicated constructor for the array you're expecting, and + wrap that in a PandasArray + + >>> pd.array(np.array(['a', 'b'], dtype=' + ['a', 'b'] + Length: 2, dtype: str32 + + Finally, Pandas has arrays that mostly overlap with NumPy + + * :class:`arrays.DatetimeArray` + * :class:`arrays.TimedeltaArray` + + When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is + passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray`` + rather than a ``PandasArray``. This is for symmetry with the case of + timezone-aware data, which NumPy does not natively support. + + >>> pd.array(['2015', '2016'], dtype='datetime64[ns]') + + ['2015-01-01 00:00:00', '2016-01-01 00:00:00'] + Length: 2, dtype: datetime64[ns] + + >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]') + + ['01:00:00', '02:00:00'] + Length: 2, dtype: timedelta64[ns] + + Examples + -------- + If a dtype is not specified, `data` is passed through to + :meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned. 
+ + >>> pd.array([1, 2]) + + [1, 2] + Length: 2, dtype: int64 + + Or the NumPy dtype can be specified + + >>> pd.array([1, 2], dtype=np.dtype("int32")) + + [1, 2] + Length: 2, dtype: int32 + + You can use the string alias for `dtype` + + >>> pd.array(['a', 'b', 'a'], dtype='category') + [a, b, a] + Categories (2, object): [a, b] + + Or specify the actual dtype + + >>> pd.array(['a', 'b', 'a'], + ... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True)) + [a, b, a] + Categories (3, object): [a < b < c] + + Because omitting the `dtype` passes the data through to NumPy, + a mixture of valid integers and NA will return a floating-point + NumPy array. + + >>> pd.array([1, 2, np.nan]) + + [1.0, 2.0, nan] + Length: 3, dtype: float64 + + To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify + the dtype: + + >>> pd.array([1, 2, np.nan], dtype='Int64') + + [1, 2, NaN] + Length: 3, dtype: Int64 + + Pandas will infer an ExtensionArray for some types of data: + + >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) + + ['2000-01-01', '2000-01-01'] + Length: 2, dtype: period[D] + + `data` must be 1-dimensional. A ValueError is raised when the input + has the wrong dimensionality. + + >>> pd.array(1) + Traceback (most recent call last): + ... + ValueError: Cannot pass scalar '1' to 'pandas.array'. + """ + from pandas.core.arrays import ( + period_array, + IntervalArray, + PandasArray, + DatetimeArray, + TimedeltaArray, + ) + + if lib.is_scalar(data): + msg = "Cannot pass scalar '{}' to 'pandas.array'." + raise ValueError(msg.format(data)) + + data = extract_array(data, extract_numpy=True) + + if dtype is None and isinstance(data, ABCExtensionArray): + dtype = data.dtype + + # this returns None for not-found dtypes. + if isinstance(dtype, str): + dtype = registry.find(dtype) or dtype + + if is_extension_array_dtype(dtype): + cls = cast(ExtensionDtype, dtype).construct_array_type() + return cls._from_sequence(data, dtype=dtype, copy=copy) + + if dtype is None: + inferred_dtype = lib.infer_dtype(data, skipna=False) + if inferred_dtype == "period": + try: + return period_array(data, copy=copy) + except tslibs.IncompatibleFrequency: + # We may have a mixture of frequencies. + # We choose to return an ndarray, rather than raising. + pass + elif inferred_dtype == "interval": + try: + return IntervalArray(data, copy=copy) + except ValueError: + # We may have a mixture of `closed` here. + # We choose to return an ndarray, rather than raising. + pass + + elif inferred_dtype.startswith("datetime"): + # datetime, datetime64 + try: + return DatetimeArray._from_sequence(data, copy=copy) + except ValueError: + # Mixture of timezones, fall back to PandasArray + pass + + elif inferred_dtype.startswith("timedelta"): + # timedelta, timedelta64 + return TimedeltaArray._from_sequence(data, copy=copy) + + # TODO(BooleanArray): handle this type + + # Pandas overrides NumPy for + # 1. datetime64[ns] + # 2. timedelta64[ns] + # so that a DatetimeArray is returned. + if is_datetime64_ns_dtype(dtype): + return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy) + elif is_timedelta64_ns_dtype(dtype): + return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy) + + result = PandasArray._from_sequence(data, dtype=dtype, copy=copy) + return result + + +def extract_array(obj, extract_numpy=False): + """ + Extract the ndarray or ExtensionArray from a Series or Index. + + For all other types, `obj` is just returned as is. 
+ + Parameters + ---------- + obj : object + For Series / Index, the underlying ExtensionArray is unboxed. + For Numpy-backed ExtensionArrays, the ndarray is extracted. + + extract_numpy : bool, default False + Whether to extract the ndarray from a PandasArray + + Returns + ------- + arr : object + + Examples + -------- + >>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category')) + [a, b, c] + Categories (3, object): [a, b, c] + + Other objects like lists, arrays, and DataFrames are just passed through. + + >>> extract_array([1, 2, 3]) + [1, 2, 3] + + For an ndarray-backed Series / Index a PandasArray is returned. + + >>> extract_array(pd.Series([1, 2, 3])) + + [1, 2, 3] + Length: 3, dtype: int64 + + To extract all the way down to the ndarray, pass ``extract_numpy=True``. + + >>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True) + array([1, 2, 3]) + """ + if isinstance(obj, (ABCIndexClass, ABCSeries)): + obj = obj.array + + if extract_numpy and isinstance(obj, ABCPandasArray): + obj = obj.to_numpy() + + return obj From 7b11c4e2a93443477862ec919943a11428164b9d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 23 Jul 2019 17:52:22 -0700 Subject: [PATCH 3/9] whitespace fixup --- pandas/core/construction.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 1dab41b76f0e5..c414908720c7d 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -4,8 +4,6 @@ These should not depend on core.internals. """ - - from typing import Optional, Sequence, Union, cast import numpy as np @@ -19,7 +17,7 @@ ) from pandas.core.dtypes.dtypes import ExtensionDtype, registry from pandas.core.dtypes.generic import ( - ABCExtensionArray, ABCIndexClass, ABCPandasArray, ABCSeries + ABCExtensionArray, ABCIndexClass, ABCPandasArray, ABCSeries ) From 64950291638a4d51800b2cf1c5b91070184e4130 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 23 Jul 2019 18:33:57 -0700 Subject: [PATCH 4/9] de-runtime imports --- pandas/core/algorithms.py | 3 +-- pandas/core/api.py | 57 ++++++++++++++++++++------------------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c7230dd7385c2..21d12d02c9008 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -50,6 +50,7 @@ from pandas.core.dtypes.missing import isna, na_value_for_dtype from pandas.core import common as com +from pandas.core.construction import array from pandas.core.indexers import validate_indices _shared_docs = {} # type: Dict[str, str] @@ -1855,8 +1856,6 @@ def searchsorted(arr, value, side="left", sorter=None): and is_integer_dtype(arr) and (is_integer(value) or is_integer_dtype(value)) ): - from .arrays.array_ import array - # if `arr` and `value` have different dtypes, `arr` would be # recast by numpy, causing a slow search. 
# Before searching below, we therefore try to give `value` the diff --git a/pandas/core/api.py b/pandas/core/api.py index 5d24fc356f7a1..73323d93b8215 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -2,17 +2,6 @@ import numpy as np -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - DatetimeTZDtype, - IntervalDtype, - PeriodDtype, -) -from pandas.core.dtypes.missing import isna, isnull, notna, notnull - -# TODO: Remove import when statsmodels updates #18264 -from pandas.core.algorithms import factorize, unique, value_counts -from pandas.core.arrays import Categorical from pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, @@ -23,33 +12,45 @@ UInt32Dtype, UInt64Dtype, ) +from pandas.core.algorithms import factorize, unique, value_counts +from pandas.core.dtypes.missing import isna, isnull, notna, notnull +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + PeriodDtype, + IntervalDtype, + DatetimeTZDtype, +) +from pandas.core.arrays import Categorical from pandas.core.construction import array -from pandas.core.frame import DataFrame from pandas.core.groupby import Grouper, NamedAgg +from pandas.io.formats.format import set_eng_float_format from pandas.core.index import ( - CategoricalIndex, - DatetimeIndex, - Float64Index, Index, + CategoricalIndex, Int64Index, - IntervalIndex, - MultiIndex, - NaT, - PeriodIndex, + UInt64Index, RangeIndex, + Float64Index, + MultiIndex, + IntervalIndex, TimedeltaIndex, - UInt64Index, + DatetimeIndex, + PeriodIndex, + NaT, ) -from pandas.core.indexes.datetimes import Timestamp, bdate_range, date_range -from pandas.core.indexes.interval import Interval, interval_range from pandas.core.indexes.period import Period, period_range from pandas.core.indexes.timedeltas import Timedelta, timedelta_range -from pandas.core.indexing import IndexSlice -from pandas.core.reshape.reshape import get_dummies +from pandas.core.indexes.datetimes import Timestamp, date_range, bdate_range +from pandas.core.indexes.interval import Interval, interval_range + from pandas.core.series import Series -from pandas.core.tools.datetimes import to_datetime -from pandas.core.tools.numeric import to_numeric -from pandas.core.tools.timedeltas import to_timedelta +from pandas.core.frame import DataFrame -from pandas.io.formats.format import set_eng_float_format +# TODO: Remove import when statsmodels updates #18264 +from pandas.core.reshape.reshape import get_dummies + +from pandas.core.indexing import IndexSlice +from pandas.core.tools.numeric import to_numeric from pandas.tseries.offsets import DateOffset +from pandas.core.tools.datetimes import to_datetime +from pandas.core.tools.timedeltas import to_timedelta From 617fdef401b722096d385e582924cd46daa7d8a6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 23 Jul 2019 18:38:50 -0700 Subject: [PATCH 5/9] isort --- pandas/core/construction.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index c414908720c7d..ef069ed12ce1a 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -17,7 +17,10 @@ ) from pandas.core.dtypes.dtypes import ExtensionDtype, registry from pandas.core.dtypes.generic import ( - ABCExtensionArray, ABCIndexClass, ABCPandasArray, ABCSeries + ABCExtensionArray, + ABCIndexClass, + ABCPandasArray, + ABCSeries, ) From 28b4c929bed4f28abdcb59992aab99de1632c5ad Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 23 Jul 2019 19:40:46 -0700 Subject: [PATCH 6/9] move sanitize_array to 
core.construction --- pandas/core/arrays/categorical.py | 5 +- pandas/core/arrays/sparse.py | 2 +- pandas/core/construction.py | 211 +++++++++++++++++++++++++- pandas/core/internals/construction.py | 198 +----------------------- pandas/core/series.py | 3 +- 5 files changed, 215 insertions(+), 204 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 37d13883e29ec..8b0325748a353 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -60,7 +60,7 @@ ) from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com -from pandas.core.construction import extract_array +from pandas.core.construction import extract_array, sanitize_array from pandas.core.missing import interpolate_2d from pandas.core.sorting import nargsort @@ -375,7 +375,6 @@ def __init__( values = maybe_infer_to_datetimelike(values, convert_dates=True) if not isinstance(values, np.ndarray): values = _convert_to_list_like(values) - from pandas.core.internals.construction import sanitize_array # By convention, empty lists result in object dtype: if len(values) == 0: @@ -2525,8 +2524,6 @@ def isin(self, values): >>> s.isin(['lama']) array([ True, False, True, False, True, False]) """ - from pandas.core.internals.construction import sanitize_array - if not is_list_like(values): raise TypeError( "only list-like objects are allowed to be passed" diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 9376b49112f6f..b8f41b140b245 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -52,6 +52,7 @@ from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin from pandas.core.base import PandasObject import pandas.core.common as com +from pandas.core.construction import sanitize_array from pandas.core.missing import interpolate_2d import pandas.core.ops as ops @@ -664,7 +665,6 @@ def __init__( if not is_array_like(data): try: # probably shared code in sanitize_series - from pandas.core.internals.construction import sanitize_array data = sanitize_array(data, index=None) except ValueError: diff --git a/pandas/core/construction.py b/pandas/core/construction.py index ef069ed12ce1a..c45a5f46c4328 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -7,13 +7,34 @@ from typing import Optional, Sequence, Union, cast import numpy as np +import numpy.ma as ma from pandas._libs import lib, tslibs - +from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime + +from pandas.core.dtypes.cast import ( + construct_1d_arraylike_from_scalar, + construct_1d_ndarray_preserving_na, + construct_1d_object_array_from_listlike, + infer_dtype_from_scalar, + maybe_cast_to_datetime, + maybe_cast_to_integer_array, + maybe_castable, + maybe_convert_platform, + maybe_upcast, +) from pandas.core.dtypes.common import ( + is_categorical_dtype, is_datetime64_ns_dtype, is_extension_array_dtype, + is_extension_type, + is_float_dtype, + is_integer_dtype, + is_iterator, + is_list_like, + is_object_dtype, is_timedelta64_ns_dtype, + pandas_dtype, ) from pandas.core.dtypes.dtypes import ExtensionDtype, registry from pandas.core.dtypes.generic import ( @@ -22,6 +43,9 @@ ABCPandasArray, ABCSeries, ) +from pandas.core.dtypes.missing import isna + +import pandas.core.common as com def array( @@ -339,3 +363,188 @@ def extract_array(obj, extract_numpy=False): obj = obj.to_numpy() return obj + + +def sanitize_array(data, index, dtype=None, copy=False, 
raise_cast_failure=False): + """ + Sanitize input data to an ndarray, copy if specified, coerce to the + dtype if specified. + """ + if dtype is not None: + dtype = pandas_dtype(dtype) + + if isinstance(data, ma.MaskedArray): + mask = ma.getmaskarray(data) + if mask.any(): + data, fill_value = maybe_upcast(data, copy=True) + data.soften_mask() # set hardmask False if it was True + data[mask] = fill_value + else: + data = data.copy() + + # extract ndarray or ExtensionArray, ensure we have no PandasArray + data = extract_array(data, extract_numpy=True) + + # GH#846 + if isinstance(data, np.ndarray): + + if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype): + # possibility of nan -> garbage + try: + subarr = _try_cast(data, dtype, copy, True) + except ValueError: + if copy: + subarr = data.copy() + else: + subarr = np.array(data, copy=False) + else: + # we will try to copy be-definition here + subarr = _try_cast(data, dtype, copy, raise_cast_failure) + + elif isinstance(data, ABCExtensionArray): + # it is already ensured above this is not a PandasArray + subarr = data + + if dtype is not None: + subarr = subarr.astype(dtype, copy=copy) + elif copy: + subarr = subarr.copy() + return subarr + + elif isinstance(data, (list, tuple)) and len(data) > 0: + if dtype is not None: + try: + subarr = _try_cast(data, dtype, copy, raise_cast_failure) + except Exception: + if raise_cast_failure: # pragma: no cover + raise + subarr = np.array(data, dtype=object, copy=copy) + subarr = lib.maybe_convert_objects(subarr) + + else: + subarr = maybe_convert_platform(data) + + subarr = maybe_cast_to_datetime(subarr, dtype) + + elif isinstance(data, range): + # GH#16804 + arr = np.arange(data.start, data.stop, data.step, dtype="int64") + subarr = _try_cast(arr, dtype, copy, raise_cast_failure) + else: + subarr = _try_cast(data, dtype, copy, raise_cast_failure) + + # scalar like, GH + if getattr(subarr, "ndim", 0) == 0: + if isinstance(data, list): # pragma: no cover + subarr = np.array(data, dtype=object) + elif index is not None: + value = data + + # figure out the dtype from the value (upcast if necessary) + if dtype is None: + dtype, value = infer_dtype_from_scalar(value) + else: + # need to possibly convert the value here + value = maybe_cast_to_datetime(value, dtype) + + subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype) + + else: + return subarr.item() + + # the result that we want + elif subarr.ndim == 1: + if index is not None: + + # a 1-element ndarray + if len(subarr) != len(index) and len(subarr) == 1: + subarr = construct_1d_arraylike_from_scalar( + subarr[0], len(index), subarr.dtype + ) + + elif subarr.ndim > 1: + if isinstance(data, np.ndarray): + raise Exception("Data must be 1-dimensional") + else: + subarr = com.asarray_tuplesafe(data, dtype=dtype) + + # This is to prevent mixed-type Series getting all casted to + # NumPy string type, e.g. NaN --> '-1#IND'. 
+ if issubclass(subarr.dtype.type, str): + # GH#16605 + # If not empty convert the data to dtype + # GH#19853: If data is a scalar, subarr has already the result + if not lib.is_scalar(data): + if not np.all(isna(data)): + data = np.array(data, dtype=dtype, copy=False) + subarr = np.array(data, dtype=object, copy=copy) + + if ( + not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)) + and is_object_dtype(subarr.dtype) + and not is_object_dtype(dtype) + ): + inferred = lib.infer_dtype(subarr, skipna=False) + if inferred == "period": + from pandas.core.arrays import period_array + try: + subarr = period_array(subarr) + except IncompatibleFrequency: + pass + + return subarr + + +def _try_cast(arr, dtype, copy, raise_cast_failure): + """ + Convert input to numpy ndarray and optionally cast to a given dtype. + + Parameters + ---------- + arr : array-like + dtype : np.dtype, ExtensionDtype or None + copy : bool + If False, don't copy the data if not needed. + raise_cast_failure : bool + If True, and if a dtype is specified, raise errors during casting. + Otherwise an object array is returned. + """ + # perf shortcut as this is the most common case + if isinstance(arr, np.ndarray): + if maybe_castable(arr) and not copy and dtype is None: + return arr + + try: + # GH#15832: Check if we are requesting a numeric dype and + # that we can convert the data to the requested dtype. + if is_integer_dtype(dtype): + subarr = maybe_cast_to_integer_array(arr, dtype) + + subarr = maybe_cast_to_datetime(arr, dtype) + # Take care in creating object arrays (but iterators are not + # supported): + if is_object_dtype(dtype) and ( + is_list_like(subarr) + and not (is_iterator(subarr) or isinstance(subarr, np.ndarray)) + ): + subarr = construct_1d_object_array_from_listlike(subarr) + elif not is_extension_type(subarr): + subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy) + except OutOfBoundsDatetime: + # in case of out of bound datetime64 -> always raise + raise + except (ValueError, TypeError): + if is_categorical_dtype(dtype): + # We *do* allow casting to categorical, since we know + # that Categorical is the only array type for 'category'. 
+ subarr = dtype.construct_array_type( + arr, dtype.categories, ordered=dtype._ordered) + elif is_extension_array_dtype(dtype): + # create an extension array from its dtype + array_type = dtype.construct_array_type()._from_sequence + subarr = array_type(arr, dtype=dtype, copy=copy) + elif dtype is not None and raise_cast_failure: + raise + else: + subarr = np.array(arr, dtype=object, copy=copy) + return subarr diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 326daf3fc8a86..74b16f0e72883 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -8,18 +8,12 @@ import numpy.ma as ma from pandas._libs import lib -from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime import pandas.compat as compat from pandas.compat import PY36, raise_with_traceback from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, - construct_1d_ndarray_preserving_na, - construct_1d_object_array_from_listlike, - infer_dtype_from_scalar, maybe_cast_to_datetime, - maybe_cast_to_integer_array, - maybe_castable, maybe_convert_platform, maybe_infer_to_datetimelike, maybe_upcast, @@ -29,13 +23,9 @@ is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, - is_extension_type, - is_float_dtype, is_integer_dtype, - is_iterator, is_list_like, is_object_dtype, - pandas_dtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -45,11 +35,10 @@ ABCSeries, ABCTimedeltaIndex, ) -from pandas.core.dtypes.missing import isna from pandas.core import algorithms, common as com -from pandas.core.arrays import Categorical, ExtensionArray, period_array -from pandas.core.construction import extract_array +from pandas.core.arrays import Categorical +from pandas.core.construction import sanitize_array from pandas.core.index import ( Index, _get_objs_combined_axis, @@ -625,186 +614,3 @@ def sanitize_index(data, index, copy=False): data = sanitize_array(data, index, copy=copy) return data - - -def sanitize_array(data, index, dtype=None, copy=False, raise_cast_failure=False): - """ - Sanitize input data to an ndarray, copy if specified, coerce to the - dtype if specified. 
- """ - if dtype is not None: - dtype = pandas_dtype(dtype) - - if isinstance(data, ma.MaskedArray): - mask = ma.getmaskarray(data) - if mask.any(): - data, fill_value = maybe_upcast(data, copy=True) - data.soften_mask() # set hardmask False if it was True - data[mask] = fill_value - else: - data = data.copy() - - # extract ndarray or ExtensionArray, ensure we have no PandasArray - data = extract_array(data, extract_numpy=True) - - # GH#846 - if isinstance(data, np.ndarray): - - if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype): - # possibility of nan -> garbage - try: - subarr = _try_cast(data, dtype, copy, True) - except ValueError: - if copy: - subarr = data.copy() - else: - subarr = np.array(data, copy=False) - else: - # we will try to copy be-definition here - subarr = _try_cast(data, dtype, copy, raise_cast_failure) - - elif isinstance(data, ExtensionArray): - # it is already ensured above this is not a PandasArray - subarr = data - - if dtype is not None: - subarr = subarr.astype(dtype, copy=copy) - elif copy: - subarr = subarr.copy() - return subarr - - elif isinstance(data, (list, tuple)) and len(data) > 0: - if dtype is not None: - try: - subarr = _try_cast(data, dtype, copy, raise_cast_failure) - except Exception: - if raise_cast_failure: # pragma: no cover - raise - subarr = np.array(data, dtype=object, copy=copy) - subarr = lib.maybe_convert_objects(subarr) - - else: - subarr = maybe_convert_platform(data) - - subarr = maybe_cast_to_datetime(subarr, dtype) - - elif isinstance(data, range): - # GH#16804 - arr = np.arange(data.start, data.stop, data.step, dtype="int64") - subarr = _try_cast(arr, dtype, copy, raise_cast_failure) - else: - subarr = _try_cast(data, dtype, copy, raise_cast_failure) - - # scalar like, GH - if getattr(subarr, "ndim", 0) == 0: - if isinstance(data, list): # pragma: no cover - subarr = np.array(data, dtype=object) - elif index is not None: - value = data - - # figure out the dtype from the value (upcast if necessary) - if dtype is None: - dtype, value = infer_dtype_from_scalar(value) - else: - # need to possibly convert the value here - value = maybe_cast_to_datetime(value, dtype) - - subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype) - - else: - return subarr.item() - - # the result that we want - elif subarr.ndim == 1: - if index is not None: - - # a 1-element ndarray - if len(subarr) != len(index) and len(subarr) == 1: - subarr = construct_1d_arraylike_from_scalar( - subarr[0], len(index), subarr.dtype - ) - - elif subarr.ndim > 1: - if isinstance(data, np.ndarray): - raise Exception("Data must be 1-dimensional") - else: - subarr = com.asarray_tuplesafe(data, dtype=dtype) - - # This is to prevent mixed-type Series getting all casted to - # NumPy string type, e.g. NaN --> '-1#IND'. 
- if issubclass(subarr.dtype.type, str): - # GH#16605 - # If not empty convert the data to dtype - # GH#19853: If data is a scalar, subarr has already the result - if not lib.is_scalar(data): - if not np.all(isna(data)): - data = np.array(data, dtype=dtype, copy=False) - subarr = np.array(data, dtype=object, copy=copy) - - if ( - not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)) - and is_object_dtype(subarr.dtype) - and not is_object_dtype(dtype) - ): - inferred = lib.infer_dtype(subarr, skipna=False) - if inferred == "period": - try: - subarr = period_array(subarr) - except IncompatibleFrequency: - pass - - return subarr - - -def _try_cast(arr, dtype, copy, raise_cast_failure): - """ - Convert input to numpy ndarray and optionally cast to a given dtype. - - Parameters - ---------- - arr : array-like - dtype : np.dtype, ExtensionDtype or None - copy : bool - If False, don't copy the data if not needed. - raise_cast_failure : bool - If True, and if a dtype is specified, raise errors during casting. - Otherwise an object array is returned. - """ - # perf shortcut as this is the most common case - if isinstance(arr, np.ndarray): - if maybe_castable(arr) and not copy and dtype is None: - return arr - - try: - # GH#15832: Check if we are requesting a numeric dype and - # that we can convert the data to the requested dtype. - if is_integer_dtype(dtype): - subarr = maybe_cast_to_integer_array(arr, dtype) - - subarr = maybe_cast_to_datetime(arr, dtype) - # Take care in creating object arrays (but iterators are not - # supported): - if is_object_dtype(dtype) and ( - is_list_like(subarr) - and not (is_iterator(subarr) or isinstance(subarr, np.ndarray)) - ): - subarr = construct_1d_object_array_from_listlike(subarr) - elif not is_extension_type(subarr): - subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy) - except OutOfBoundsDatetime: - # in case of out of bound datetime64 -> always raise - raise - except (ValueError, TypeError): - if is_categorical_dtype(dtype): - # We *do* allow casting to categorical, since we know - # that Categorical is the only array type for 'category'. 
- subarr = Categorical(arr, dtype.categories, ordered=dtype._ordered) - elif is_extension_array_dtype(dtype): - # create an extension array from its dtype - array_type = dtype.construct_array_type()._from_sequence - subarr = array_type(arr, dtype=dtype, copy=copy) - elif dtype is not None and raise_cast_failure: - raise - else: - subarr = np.array(arr, dtype=object, copy=copy) - return subarr diff --git a/pandas/core/series.py b/pandas/core/series.py index f31f87b7704aa..17d1d8fbff091 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -61,7 +61,7 @@ from pandas.core.arrays.categorical import Categorical, CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor import pandas.core.common as com -from pandas.core.construction import extract_array +from pandas.core.construction import extract_array, sanitize_array from pandas.core.index import ( Float64Index, Index, @@ -77,7 +77,6 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import check_bool_indexer from pandas.core.internals import SingleBlockManager -from pandas.core.internals.construction import sanitize_array from pandas.core.strings import StringMethods from pandas.core.tools.datetimes import to_datetime From f07e018e6d9cc7c9554f55bc50d62958cff1668d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 23 Jul 2019 20:01:38 -0700 Subject: [PATCH 7/9] Fix usage --- pandas/core/construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index c45a5f46c4328..c4e4166835929 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -537,7 +537,7 @@ def _try_cast(arr, dtype, copy, raise_cast_failure): if is_categorical_dtype(dtype): # We *do* allow casting to categorical, since we know # that Categorical is the only array type for 'category'. - subarr = dtype.construct_array_type( + subarr = dtype.construct_array_type()( arr, dtype.categories, ordered=dtype._ordered) elif is_extension_array_dtype(dtype): # create an extension array from its dtype From 525c51b394b2bcd87bbfce56772d9cc412c52f1f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 23 Jul 2019 20:34:29 -0700 Subject: [PATCH 8/9] blackify --- pandas/core/construction.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index c4e4166835929..9528723a6dc0f 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -487,6 +487,7 @@ def sanitize_array(data, index, dtype=None, copy=False, raise_cast_failure=False inferred = lib.infer_dtype(subarr, skipna=False) if inferred == "period": from pandas.core.arrays import period_array + try: subarr = period_array(subarr) except IncompatibleFrequency: @@ -538,7 +539,8 @@ def _try_cast(arr, dtype, copy, raise_cast_failure): # We *do* allow casting to categorical, since we know # that Categorical is the only array type for 'category'. 
subarr = dtype.construct_array_type()( - arr, dtype.categories, ordered=dtype._ordered) + arr, dtype.categories, ordered=dtype._ordered + ) elif is_extension_array_dtype(dtype): # create an extension array from its dtype array_type = dtype.construct_array_type()._from_sequence From 6a8dc367592aab485a123d66b430c334e694769d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 24 Jul 2019 09:32:43 -0700 Subject: [PATCH 9/9] mypy fixup --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9053edf2d1424..42e636ed2204f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6,7 +6,7 @@ import operator import pickle from textwrap import dedent -from typing import Callable, FrozenSet, List, Optional, Set +from typing import Callable, Dict, FrozenSet, List, Optional, Set import warnings import weakref @@ -73,7 +73,7 @@ # goal is to be able to define the docs close to function, while still being # able to share -_shared_docs = dict() +_shared_docs = dict() # type: Dict[str, str] _shared_doc_kwargs = dict( axes="keywords for axes", klass="Series/DataFrame",
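
Usage note (an illustrative sketch, not part of the patch series; it assumes a pandas build that includes these patches, i.e. one where pandas.core.construction exists): after this series, `array` and `extract_array` live in pandas/core/construction.py. `pd.array` remains the public entry point, while `extract_array` is an internal helper; the snippet below only restates behaviour already documented in the docstrings moved by these patches.

    import numpy as np
    import pandas as pd
    from pandas.core.construction import extract_array  # private helper relocated by this PR

    # pd.array infers an ExtensionArray where it can...
    pd.array([1, 2, np.nan], dtype="Int64")   # nullable IntegerArray: [1, 2, NaN]
    pd.array([pd.Period("2000", freq="D")])   # PeriodArray, dtype period[D]

    # ...and otherwise returns a NumPy-backed PandasArray.
    pd.array([1.0, 2.0])                      # PandasArray, dtype float64

    # extract_array unboxes Series/Index; extract_numpy=True also unwraps PandasArray.
    s = pd.Series([1, 2, 3])
    extract_array(s)                          # PandasArray([1, 2, 3])
    extract_array(s, extract_numpy=True)      # np.ndarray([1, 2, 3])
    extract_array([1, 2, 3])                  # non-Series/Index objects pass through unchanged

Keeping these constructors free of core.internals imports (as the new module docstring states) is what lets Series.__init__, Index.__new__, and pd.array share sanitize_array/extract_array without the runtime imports removed in patches 1, 4, and 6.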