diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 4260c0836bbea..b49b9a67c4743 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -170,7 +170,7 @@ jobs: - name: Setup Python id: setup_python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.10' cache: 'pip' diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index a2c42af53c3a8..04d8b8e006985 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -40,7 +40,7 @@ jobs: - name: Setup Python id: setup_python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.10' diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml index 11b81d11f7876..792afe8f4faf5 100644 --- a/.github/workflows/stale-pr.yml +++ b/.github/workflows/stale-pr.yml @@ -14,7 +14,7 @@ jobs: if: github.repository_owner == 'pandas-dev' runs-on: ubuntu-22.04 steps: - - uses: actions/stale@v8 + - uses: actions/stale@v9 with: repo-token: ${{ secrets.GITHUB_TOKEN }} stale-pr-message: "This pull request is stale because it has been open for thirty days with no activity. Please [update](https://pandas.pydata.org/pandas-docs/stable/development/contributing.html#updating-your-pull-request) and respond to this comment if you're still interested in working on this." diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index ffcd2ae32c09c..88d705dbd9251 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -354,7 +354,7 @@ jobs: fetch-depth: 0 - name: Set up Python Dev Version - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.12-dev' diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 67c6f9bedd9c9..ca5dacad1b599 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -53,7 +53,7 @@ jobs: fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.11' diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index 5e23cba2e1074..6b1f75187f887 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -12,7 +12,6 @@ date_range, to_timedelta, ) -from pandas.core.algorithms import checked_add_with_arr from .pandas_vb_common import numeric_dtypes @@ -389,42 +388,6 @@ def time_add_timedeltas(self, df): df["timedelta"] + df["timedelta"] -class AddOverflowScalar: - params = [1, -1, 0] - param_names = ["scalar"] - - def setup(self, scalar): - N = 10**6 - self.arr = np.arange(N) - - def time_add_overflow_scalar(self, scalar): - checked_add_with_arr(self.arr, scalar) - - -class AddOverflowArray: - def setup(self): - N = 10**6 - self.arr = np.arange(N) - self.arr_rev = np.arange(-N, 0) - self.arr_mixed = np.array([1, -1]).repeat(N / 2) - self.arr_nan_1 = np.random.choice([True, False], size=N) - self.arr_nan_2 = np.random.choice([True, False], size=N) - - def time_add_overflow_arr_rev(self): - checked_add_with_arr(self.arr, self.arr_rev) - - def time_add_overflow_arr_mask_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1) - - def time_add_overflow_b_mask_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, b_mask=self.arr_nan_1) - - def time_add_overflow_both_arg_nan(self): - checked_add_with_arr( - self.arr, self.arr_mixed, 
arr_mask=self.arr_nan_1, b_mask=self.arr_nan_2 - ) - - hcal = pd.tseries.holiday.USFederalHolidayCalendar() # These offsets currently raise a NotImplementedError with .apply_index() non_apply = [ diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e91629744463f..e41f625e583c0 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -14,6 +14,8 @@ # $ ./ci/code_checks.sh single-docs # check single-page docs build warning-free # $ ./ci/code_checks.sh notebooks # check execution of documentation notebooks +set -uo pipefail + [[ -z "$1" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" || "$1" == "notebooks" ]] || \ { echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; } diff --git a/doc/make.py b/doc/make.py index dfa8ae6c1e34c..c4b7ab124f68f 100755 --- a/doc/make.py +++ b/doc/make.py @@ -236,8 +236,9 @@ def html(self): os.remove(zip_fname) if ret_code == 0: - if self.single_doc_html is not None and not self.no_browser: - self._open_browser(self.single_doc_html) + if self.single_doc_html is not None: + if not self.no_browser: + self._open_browser(self.single_doc_html) else: self._add_redirects() if self.whatsnew and not self.no_browser: diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 723c33280a679..57b83a294963b 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -1,6 +1,6 @@ .. _whatsnew_214: -What's new in 2.1.4 (December ??, 2023) +What's new in 2.1.4 (December 8, 2023) --------------------------------------- These are the changes in pandas 2.1.4. See :ref:`release` for a full changelog @@ -14,7 +14,6 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression when trying to read a pickled pandas :class:`DataFrame` from pandas 1.3 (:issue:`55137`) -- .. --------------------------------------------------------------------------- .. 
_whatsnew_214.bug_fixes: @@ -23,24 +22,20 @@ Bug fixes ~~~~~~~~~ - Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`) - Bug in :class:`Series` when trying to cast date-like string inputs to :class:`ArrowDtype` of ``pyarrow.timestamp`` (:issue:`56266`) +- Bug in :class:`Timestamp` construction with ``ts_input="now"`` or ``ts_input="today"`` giving a different unit from :meth:`Timestamp.now` or :meth:`Timestamp.today` (:issue:`55879`) - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) +- Fixed bug in :func:`read_csv` not respecting object dtype when ``infer_string`` option is set (:issue:`56047`) - Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`) - Fixed bug in :meth:`.DataFrameGroupBy.min` and :meth:`.DataFrameGroupBy.max` not preserving extension dtype for empty object (:issue:`55619`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) +- Fixed bug in :meth:`Series.__ne__` resulting in False for comparison between ``NA`` and string value for ``dtype="string[pyarrow_numpy]"`` (:issue:`56122`) - Fixed bug in :meth:`Series.mode` not keeping object dtype when ``infer_string`` is set (:issue:`56183`) +- Fixed bug in :meth:`Series.reset_index` not preserving object dtype when ``infer_string`` is set (:issue:`56160`) - Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`) - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`) -.. --------------------------------------------------------------------------- -.. _whatsnew_214.other: - -Other -~~~~~ -- -- - .. --------------------------------------------------------------------------- .. _whatsnew_214.contributors: diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 67b4052b386c0..8209525721b98 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -218,6 +218,7 @@ Other enhancements - :meth:`~DataFrame.to_sql` with method parameter set to ``multi`` works with Oracle on the backend - :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`). +- :func:`get_dummies` now returning extension dtypes ``boolean`` or ``bool[pyarrow]`` that are compatible with the input dtype (:issue:`56273`) - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`) - :func:`read_sas` returns ``datetime64`` dtypes with resolutions better matching those stored natively in SAS, and avoids returning object-dtype in cases that cannot be stored with ``datetime64[ns]`` dtype (:issue:`56127`) - :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`) @@ -246,7 +247,7 @@ These are bug fixes that might have notable behavior changes. In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` did not always return a result that followed the documented sort behavior. 
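A quick sketch of the documented sort behavior this hunk refers to (outputs assumed for an inner merge on pandas 2.2; with ``sort=False`` the key order depends on the join type):

```python
import pandas as pd

left = pd.DataFrame({"key": [5, 1], "lval": [1, 2]})
right = pd.DataFrame({"key": [1, 5], "rval": [3, 4]})

# sort=True: join keys are sorted lexicographically in the result
print(pd.merge(left, right, on="key", sort=True)["key"].tolist())   # [1, 5]

# sort=False: for an inner merge, key order follows the left frame
print(pd.merge(left, right, on="key", sort=False)["key"].tolist())  # [5, 1]
```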
pandas now -follows the documented sort behavior in merge and join operations (:issue:`54611`). +follows the documented sort behavior in merge and join operations (:issue:`54611`, :issue:`56426`, :issue:`56443`). As documented, ``sort=True`` sorts the join keys lexicographically in the resulting :class:`DataFrame`. With ``sort=False``, the order of the join keys depends on the @@ -431,6 +432,7 @@ Other Deprecations ^^^^^^^^^^^^^^^^^^ - Changed :meth:`Timedelta.resolution_string` to return ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``H``, ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`) - Deprecated :func:`pandas.api.types.is_interval` and :func:`pandas.api.types.is_period`, use ``isinstance(obj, pd.Interval)`` and ``isinstance(obj, pd.Period)`` instead (:issue:`55264`) +- Deprecated :func:`pd.core.internals.api.make_block`, use public APIs instead (:issue:`40226`) - Deprecated :func:`read_gbq` and :meth:`DataFrame.to_gbq`. Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`) - Deprecated :meth:`.DataFrameGroupBy.fillna` and :meth:`.SeriesGroupBy.fillna`; use :meth:`.DataFrameGroupBy.ffill`, :meth:`.DataFrameGroupBy.bfill` for forward and backward filling or :meth:`.DataFrame.fillna` to fill with a single value (or the Series equivalents) (:issue:`55718`) - Deprecated :meth:`Index.format`, use ``index.astype(str)`` or ``index.map(formatter)`` instead (:issue:`55413`) @@ -438,6 +440,7 @@ Other Deprecations - Deprecated :meth:`Series.view`, use :meth:`Series.astype` instead to change the dtype (:issue:`20251`) - Deprecated ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock``, use public APIs instead (:issue:`55139`) - Deprecated ``year``, ``month``, ``quarter``, ``day``, ``hour``, ``minute``, and ``second`` keywords in the :class:`PeriodIndex` constructor, use :meth:`PeriodIndex.from_fields` instead (:issue:`55960`) +- Deprecated accepting a type as an argument in :meth:`Index.view`, call without any arguments instead (:issue:`55709`) - Deprecated allowing non-integer ``periods`` argument in :func:`date_range`, :func:`timedelta_range`, :func:`period_range`, and :func:`interval_range` (:issue:`56036`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard`. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf``. (:issue:`54229`) @@ -454,7 +457,10 @@ Other Deprecations - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_xml` except ``path_or_buffer``. 
(:issue:`54229`) - Deprecated allowing passing :class:`BlockManager` objects to :class:`DataFrame` or :class:`SingleBlockManager` objects to :class:`Series` (:issue:`52419`) +- Deprecated behavior of :meth:`Index.insert` with an object-dtype index silently performing type inference on the result, explicitly call ``result.infer_objects(copy=False)`` for the old behavior instead (:issue:`51363`) +- Deprecated casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`) - Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`) +- Deprecated dtype inference when setting a :class:`Index` into a :class:`DataFrame`, cast explicitly instead (:issue:`56102`) - Deprecated including the groups in computations when using :meth:`.DataFrameGroupBy.apply` and :meth:`.DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) - Deprecated indexing an :class:`Index` with a boolean indexer of length zero (:issue:`55820`) - Deprecated not passing a tuple to :class:`.DataFrameGroupBy.get_group` or :class:`.SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`) @@ -473,12 +479,12 @@ Other Deprecations - Deprecated the ``kind`` keyword in :meth:`Series.resample` and :meth:`DataFrame.resample`, explicitly cast the object's ``index`` instead (:issue:`55895`) - Deprecated the ``ordinal`` keyword in :class:`PeriodIndex`, use :meth:`PeriodIndex.from_ordinals` instead (:issue:`55960`) - Deprecated the ``unit`` keyword in :class:`TimedeltaIndex` construction, use :func:`to_timedelta` instead (:issue:`55499`) +- Deprecated the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype`; in a future version replace will change the values while preserving the categories. To change the categories, use ``ser.cat.rename_categories`` instead (:issue:`55147`) - Deprecated the behavior of :meth:`Series.value_counts` and :meth:`Index.value_counts` with object dtype; in a future version these will not perform dtype inference on the resulting :class:`Index`, do ``result.index = result.index.infer_objects()`` to retain the old behavior (:issue:`56161`) - Deprecated the default of ``observed=False`` in :meth:`DataFrame.pivot_table`; will be ``True`` in a future version (:issue:`56236`) - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) - Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`) - Deprecated the previous implementation of :class:`DataFrame.stack`; specify ``future_stack=True`` to adopt the future version (:issue:`53515`) -- .. --------------------------------------------------------------------------- .. 
_whatsnew_220.performance: @@ -524,6 +530,7 @@ Datetimelike ^^^^^^^^^^^^ - Bug in :class:`DatetimeIndex` construction when passing both a ``tz`` and either ``dayfirst`` or ``yearfirst`` ignoring dayfirst/yearfirst (:issue:`55813`) - Bug in :class:`DatetimeIndex` when passing an object-dtype ndarray of float objects and a ``tz`` incorrectly localizing the result (:issue:`55780`) +- Bug in :func:`Series.isin` with :class:`DatetimeTZDtype` dtype and comparison values that are all ``NaT`` incorrectly returning all-``False`` even if the series contains ``NaT`` entries (:issue:`56427`) - Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame. (:issue:`52093`) - Bug in :func:`testing.assert_extension_array_equal` that could use the wrong unit when comparing resolutions (:issue:`55730`) - Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`) @@ -533,6 +540,7 @@ Datetimelike - Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`) - Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`) - Bug in :meth:`Series.dt.round` with non-nanosecond resolution and ``NaT`` entries incorrectly raising ``OverflowError`` (:issue:`56158`) +- Bug in :meth:`Series.fillna` with non-nanosecond resolution dtypes and higher-resolution vector values returning incorrect (internally-corrupted) results (:issue:`56410`) - Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in :meth:`Timestamp.unit` being inferred incorrectly from an ISO8601 format string with minute or hour resolution and a timezone offset (:issue:`56208`) - Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. 
``datetime64[us]->datetim64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`) @@ -546,7 +554,6 @@ Datetimelike - Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`) - Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`) - Bug in the results of :func:`to_datetime` with an floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`) -- Timedelta ^^^^^^^^^ @@ -576,6 +583,11 @@ Strings ^^^^^^^ - Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`) - Bug in :meth:`DataFrame.apply` failing when ``engine="numba"`` and columns or index have ``StringDtype`` (:issue:`56189`) +- Bug in :meth:`DataFrame.reindex` not matching :class:`Index` with ``string[pyarrow_numpy]`` dtype (:issue:`56106`) +- Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`) +- Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`) +- Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`) +- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`) Interval @@ -643,6 +655,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`) - Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond ``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`) - Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) +- Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) - @@ -650,7 +663,9 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ - Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`) +- Bug in :func:`concat` renaming :class:`Series` when ``ignore_index=False`` (:issue:`15047`) - Bug in :func:`merge_asof` raising ``TypeError`` when ``by`` dtype is not ``object``, ``int64``, or ``uint64`` (:issue:`22794`) +- Bug in :func:`merge_asof` raising incorrect error for string dtype (:issue:`56444`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`) - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) diff --git a/meson.build b/meson.build index 0bc04c59d8716..06623a305ab54 100644 --- a/meson.build +++ 
b/meson.build @@ -7,7 +7,8 @@ project( meson_version: '>=1.2.1', default_options: [ 'buildtype=release', - 'c_std=c11' + 'c_std=c11', + 'warning_level=2', ] ) diff --git a/pandas/_libs/include/pandas/datetime/pd_datetime.h b/pandas/_libs/include/pandas/datetime/pd_datetime.h index a51f8cea71513..98e5521af2506 100644 --- a/pandas/_libs/include/pandas/datetime/pd_datetime.h +++ b/pandas/_libs/include/pandas/datetime/pd_datetime.h @@ -50,7 +50,7 @@ typedef struct { NPY_DATETIMEUNIT *, int *, int *, const char *, int, FormatRequirement); int (*get_datetime_iso_8601_strlen)(int, NPY_DATETIMEUNIT); - int (*make_iso_8601_datetime)(npy_datetimestruct *, char *, int, int, + int (*make_iso_8601_datetime)(npy_datetimestruct *, char *, size_t, int, NPY_DATETIMEUNIT); int (*make_iso_8601_timedelta)(pandas_timedeltastruct *, char *, size_t *); } PandasDateTime_CAPI; diff --git a/pandas/_libs/include/pandas/portable.h b/pandas/_libs/include/pandas/portable.h index be9080172fe42..1d0509d9e9724 100644 --- a/pandas/_libs/include/pandas/portable.h +++ b/pandas/_libs/include/pandas/portable.h @@ -23,3 +23,15 @@ The full license is in the LICENSE file, distributed with this software. #define isspace_ascii(c) (((c) == ' ') || (((unsigned)(c) - '\t') < 5)) #define toupper_ascii(c) ((((unsigned)(c) - 'a') < 26) ? ((c) & 0x5f) : (c)) #define tolower_ascii(c) ((((unsigned)(c) - 'A') < 26) ? ((c) | 0x20) : (c)) + +#if defined(_WIN32) +#define PD_FALLTHROUGH \ + do { \ + } while (0) /* fallthrough */ +#elif __has_attribute(__fallthrough__) +#define PD_FALLTHROUGH __attribute__((__fallthrough__)) +#else +#define PD_FALLTHROUGH \ + do { \ + } while (0) /* fallthrough */ +#endif diff --git a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h index d96ca79d70cb7..75e69f30ada1e 100644 --- a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h +++ b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h @@ -85,7 +85,7 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base); * Returns 0 on success, -1 on failure (for example if the output * string was too short). */ -int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, +int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, size_t outlen, int utc, NPY_DATETIMEUNIT base); /* diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 8493f8bd066e0..c483f35513a40 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2756,8 +2756,11 @@ def maybe_convert_objects(ndarray[object] objects, res[:] = NPY_NAT return res elif dtype is not None: - # EA, we don't expect to get here, but _could_ implement - raise NotImplementedError(dtype) + # i.e. PeriodDtype, DatetimeTZDtype + cls = dtype.construct_array_type() + obj = cls._from_sequence([], dtype=dtype) + taker = -np.ones((objects).shape, dtype=np.intp) + return obj.take(taker, allow_fill=True) else: # we don't guess seen.object_ = True diff --git a/pandas/_libs/src/datetime/pd_datetime.c b/pandas/_libs/src/datetime/pd_datetime.c index 606edf1184aad..19de51be6e1b2 100644 --- a/pandas/_libs/src/datetime/pd_datetime.c +++ b/pandas/_libs/src/datetime/pd_datetime.c @@ -21,6 +21,7 @@ This file is derived from NumPy 1.7. 
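The ``maybe_convert_objects`` change above replaces a ``NotImplementedError`` with an all-NA extension array of the requested dtype; a rough Python equivalent of the new branch, assuming a ``PeriodDtype`` request:

```python
import numpy as np
import pandas as pd

dtype = pd.PeriodDtype("D")
cls = dtype.construct_array_type()          # PeriodArray
empty = cls._from_sequence([], dtype=dtype)

# -1 with allow_fill=True means "fill this position with NA"
taker = -np.ones(3, dtype=np.intp)
print(empty.take(taker, allow_fill=True))   # all-NaT PeriodArray of length 3
```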
See NUMPY_LICENSE.txt #include "datetime.h" #include "pandas/datetime/pd_datetime.h" +#include "pandas/portable.h" static void pandas_datetime_destructor(PyObject *op) { void *ptr = PyCapsule_GetPointer(op, PandasDateTime_CAPSULE_NAME); @@ -188,7 +189,7 @@ static npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base) { return npy_dt; } -static int pandas_datetime_exec(PyObject *module) { +static int pandas_datetime_exec(PyObject *Py_UNUSED(module)) { PyDateTime_IMPORT; PandasDateTime_CAPI *capi = PyMem_Malloc(sizeof(PandasDateTime_CAPI)); if (capi == NULL) { diff --git a/pandas/_libs/src/parser/pd_parser.c b/pandas/_libs/src/parser/pd_parser.c index 2e16f5b756dd0..48f3cd14cbc30 100644 --- a/pandas/_libs/src/parser/pd_parser.c +++ b/pandas/_libs/src/parser/pd_parser.c @@ -100,7 +100,7 @@ static void pandas_parser_destructor(PyObject *op) { PyMem_Free(ptr); } -static int pandas_parser_exec(PyObject *module) { +static int pandas_parser_exec(PyObject *Py_UNUSED(module)) { PandasParser_CAPI *capi = PyMem_Malloc(sizeof(PandasParser_CAPI)); if (capi == NULL) { PyErr_NoMemory(); diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 74a3a51c61e15..0e4188bea4dc7 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -795,7 +795,7 @@ static int tokenize_bytes(parser_t *self, size_t line_limit, break; } else if (!isblank(c)) { self->state = START_FIELD; - // fall through to subsequent state + PD_FALLTHROUGH; // fall through to subsequent state } else { // if whitespace char, keep slurping break; @@ -849,12 +849,12 @@ static int tokenize_bytes(parser_t *self, size_t line_limit, self->state = WHITESPACE_LINE; break; } - // fall through } // normal character - fall through // to handle as START_FIELD self->state = START_FIELD; + PD_FALLTHROUGH; } case START_FIELD: // expecting field @@ -1130,10 +1130,10 @@ int parser_consume_rows(parser_t *self, size_t nrows) { /* if word_deletions == 0 (i.e. this case) then char_count must * be 0 too, as no data needs to be skipped */ - const int64_t char_count = word_deletions >= 1 - ? (self->word_starts[word_deletions - 1] + - strlen(self->words[word_deletions - 1]) + 1) - : 0; + const uint64_t char_count = + word_deletions >= 1 ? (self->word_starts[word_deletions - 1] + + strlen(self->words[word_deletions - 1]) + 1) + : 0; TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, char_count)); @@ -1415,9 +1415,11 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, int negative = 0; switch (*p) { case '-': - negative = 1; // Fall through to increment position. + negative = 1; + PD_FALLTHROUGH; // Fall through to increment position. case '+': p++; + break; } int exponent = 0; @@ -1485,9 +1487,11 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, negative = 0; switch (*++p) { case '-': - negative = 1; // Fall through to increment pos. + negative = 1; + PD_FALLTHROUGH; // Fall through to increment position. case '+': p++; + break; } // Process string of digits. @@ -1595,9 +1599,11 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, int negative = 0; switch (*p) { case '-': - negative = 1; // Fall through to increment position. + negative = 1; + PD_FALLTHROUGH; // Fall through to increment position. 
case '+': p++; + break; } double number = 0.; @@ -1656,9 +1662,11 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, negative = 0; switch (*++p) { case '-': - negative = 1; // Fall through to increment pos. + negative = 1; + PD_FALLTHROUGH; // Fall through to increment position. case '+': p++; + break; } // Process string of digits. @@ -1762,8 +1770,8 @@ static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, return s_copy; } -double round_trip(const char *p, char **q, char decimal, char sci, char tsep, - int skip_trailing, int *error, int *maybe_int) { +double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci), + char tsep, int skip_trailing, int *error, int *maybe_int) { // 'normalize' representation to C-locale; replace decimal with '.' and // remove thousands separator. char *endptr; @@ -1975,7 +1983,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, break; } if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { + ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) { number = number * 10 + (d - '0'); d = *++p; @@ -1987,7 +1995,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } else { while (isdigit_ascii(d)) { if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { + ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) { number = number * 10 + (d - '0'); d = *++p; diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index ab24752203c57..06e3251db8315 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -243,7 +243,7 @@ static void set_datetimestruct_days(npy_int64 days, npy_datetimestruct *dts) { for (i = 0; i < 12; ++i) { if (days < month_lengths[i]) { dts->month = i + 1; - dts->day = days + 1; + dts->day = (npy_int32)days + 1; return; } else { days -= month_lengths[i]; @@ -568,7 +568,7 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base, case NPY_FR_M: out->year = 1970 + extract_unit(&dt, 12); - out->month = dt + 1; + out->month = (npy_int32)dt + 1; break; case NPY_FR_W: @@ -584,72 +584,72 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base, perday = 24LL; set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = dt; + out->hour = (npy_int32)dt; break; case NPY_FR_m: perday = 24LL * 60; set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 60); - out->min = (int)dt; + out->hour = (npy_int32)extract_unit(&dt, 60); + out->min = (npy_int32)dt; break; case NPY_FR_s: perday = 24LL * 60 * 60; set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 60 * 60); - out->min = (int)extract_unit(&dt, 60); - out->sec = (int)dt; + out->hour = (npy_int32)extract_unit(&dt, 60 * 60); + out->min = (npy_int32)extract_unit(&dt, 60); + out->sec = (npy_int32)dt; break; case NPY_FR_ms: perday = 24LL * 60 * 60 * 1000; set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 1000LL * 60 * 60); - out->min = (int)extract_unit(&dt, 1000LL * 60); - out->sec = (int)extract_unit(&dt, 1000LL); - out->us = (int)(dt * 1000); + out->hour = (npy_int32)extract_unit(&dt, 1000LL * 60 * 60); + out->min = (npy_int32)extract_unit(&dt, 1000LL * 60); + out->sec = 
(npy_int32)extract_unit(&dt, 1000LL); + out->us = (npy_int32)(dt * 1000); break; case NPY_FR_us: perday = 24LL * 60LL * 60LL * 1000LL * 1000LL; set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 60 * 60); - out->min = (int)extract_unit(&dt, 1000LL * 1000 * 60); - out->sec = (int)extract_unit(&dt, 1000LL * 1000); - out->us = (int)dt; + out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 60 * 60); + out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 60); + out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000); + out->us = (npy_int32)dt; break; case NPY_FR_ns: perday = 24LL * 60LL * 60LL * 1000LL * 1000LL * 1000LL; set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); - out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); - out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000); - out->us = (int)extract_unit(&dt, 1000LL); - out->ps = (int)(dt * 1000); + out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); + out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); + out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000); + out->us = (npy_int32)extract_unit(&dt, 1000LL); + out->ps = (npy_int32)(dt * 1000); break; case NPY_FR_ps: perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000; set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); - out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); - out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000); - out->us = (int)extract_unit(&dt, 1000LL); - out->ps = (int)(dt * 1000); + out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); + out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); + out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000); + out->us = (npy_int32)extract_unit(&dt, 1000LL); + out->ps = (npy_int32)(dt * 1000); break; case NPY_FR_fs: /* entire range is only +- 2.6 hours */ - out->hour = - (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 60 * 60); + out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * + 1000 * 60 * 60); if (out->hour < 0) { out->year = 1969; out->month = 12; @@ -657,17 +657,18 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base, out->hour += 24; assert(out->hour >= 0); } - out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 60); - out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000); - out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000); - out->ps = (int)extract_unit(&dt, 1000LL); - out->as = (int)(dt * 1000); + out->min = + (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 60); + out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000); + out->us = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000); + out->ps = (npy_int32)extract_unit(&dt, 1000LL); + out->as = (npy_int32)(dt * 1000); break; case NPY_FR_as: /* entire range is only +- 9.2 seconds */ out->sec = - (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 1000); + (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 1000); if (out->sec < 0) { out->year = 1969; out->month = 12; @@ -677,9 +678,9 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base, out->sec += 60; assert(out->sec >= 0); } - out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000); - out->ps = 
(int)extract_unit(&dt, 1000LL * 1000); - out->as = (int)dt; + out->us = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000); + out->ps = (npy_int32)extract_unit(&dt, 1000LL * 1000); + out->as = (npy_int32)dt; break; default: @@ -741,21 +742,21 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, } if (frac >= 3600) { - out->hrs = frac / 3600LL; + out->hrs = (npy_int32)(frac / 3600LL); frac -= out->hrs * 3600LL; } else { out->hrs = 0; } if (frac >= 60) { - out->min = frac / 60LL; + out->min = (npy_int32)(frac / 60LL); frac -= out->min * 60LL; } else { out->min = 0; } if (frac >= 0) { - out->sec = frac; + out->sec = (npy_int32)frac; frac -= out->sec; } else { out->sec = 0; @@ -769,11 +770,11 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, ifrac = td - (out->days * per_day + sfrac); if (ifrac != 0) { - out->ms = ifrac / (1000LL * 1000LL); + out->ms = (npy_int32)(ifrac / (1000LL * 1000LL)); ifrac -= out->ms * 1000LL * 1000LL; - out->us = ifrac / 1000LL; + out->us = (npy_int32)(ifrac / 1000LL); ifrac -= out->us * 1000LL; - out->ns = ifrac; + out->ns = (npy_int32)ifrac; } else { out->ms = 0; out->us = 0; @@ -813,21 +814,21 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, } if (frac >= 3600) { - out->hrs = frac / 3600LL; + out->hrs = (npy_int32)(frac / 3600LL); frac -= out->hrs * 3600LL; } else { out->hrs = 0; } if (frac >= 60) { - out->min = frac / 60LL; + out->min = (npy_int32)(frac / 60LL); frac -= out->min * 60LL; } else { out->min = 0; } if (frac >= 0) { - out->sec = frac; + out->sec = (npy_int32)frac; frac -= out->sec; } else { out->sec = 0; @@ -841,11 +842,11 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, ifrac = td - (out->days * per_day + sfrac); if (ifrac != 0) { - out->ms = ifrac / 1000LL; + out->ms = (npy_int32)(ifrac / 1000LL); ifrac -= out->ms * 1000LL; - out->us = ifrac / 1L; + out->us = (npy_int32)(ifrac / 1L); ifrac -= out->us * 1L; - out->ns = ifrac; + out->ns = (npy_int32)ifrac; } else { out->ms = 0; out->us = 0; @@ -885,21 +886,21 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, } if (frac >= 3600) { - out->hrs = frac / 3600LL; + out->hrs = (npy_int32)(frac / 3600LL); frac -= out->hrs * 3600LL; } else { out->hrs = 0; } if (frac >= 60) { - out->min = frac / 60LL; + out->min = (npy_int32)(frac / 60LL); frac -= out->min * 60LL; } else { out->min = 0; } if (frac >= 0) { - out->sec = frac; + out->sec = (npy_int32)frac; frac -= out->sec; } else { out->sec = 0; @@ -913,7 +914,7 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, ifrac = td - (out->days * per_day + sfrac); if (ifrac != 0) { - out->ms = ifrac; + out->ms = (npy_int32)ifrac; out->us = 0; out->ns = 0; } else { @@ -956,21 +957,21 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, } if (frac >= 3600) { - out->hrs = frac / 3600LL; + out->hrs = (npy_int32)(frac / 3600LL); frac -= out->hrs * 3600LL; } else { out->hrs = 0; } if (frac >= 60) { - out->min = frac / 60LL; + out->min = (npy_int32)(frac / 60LL); frac -= out->min * 60LL; } else { out->min = 0; } if (frac >= 0) { - out->sec = frac; + out->sec = (npy_int32)frac; frac -= out->sec; } else { out->sec = 0; @@ -998,9 +999,9 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, out->days = td / 1440LL; td -= out->days * 1440LL; - out->hrs = td / 60LL; + out->hrs = (npy_int32)(td / 60LL); td -= out->hrs * 60LL; - out->min = td; + out->min = (npy_int32)td; out->sec = 0; out->ms = 0; @@ -1011,7 +1012,7 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, case NPY_FR_h: 
out->days = td / 24LL; td -= out->days * 24LL; - out->hrs = td; + out->hrs = (npy_int32)td; out->min = 0; out->sec = 0; diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c index 73bce9ab27a8b..a46f5bc467c5d 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c @@ -35,6 +35,7 @@ This file implements string parsing and creation for NumPy datetime. #include #include +#include "pandas/portable.h" #include "pandas/vendored/numpy/datetime/np_datetime.h" #include "pandas/vendored/numpy/datetime/np_datetime_strings.h" @@ -767,27 +768,38 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) { /* return 4;*/ case NPY_FR_as: len += 3; /* "###" */ + PD_FALLTHROUGH; case NPY_FR_fs: len += 3; /* "###" */ + PD_FALLTHROUGH; case NPY_FR_ps: len += 3; /* "###" */ + PD_FALLTHROUGH; case NPY_FR_ns: len += 3; /* "###" */ + PD_FALLTHROUGH; case NPY_FR_us: len += 3; /* "###" */ + PD_FALLTHROUGH; case NPY_FR_ms: len += 4; /* ".###" */ + PD_FALLTHROUGH; case NPY_FR_s: len += 3; /* ":##" */ + PD_FALLTHROUGH; case NPY_FR_m: len += 3; /* ":##" */ + PD_FALLTHROUGH; case NPY_FR_h: len += 3; /* "T##" */ + PD_FALLTHROUGH; case NPY_FR_D: case NPY_FR_W: len += 3; /* "-##" */ + PD_FALLTHROUGH; case NPY_FR_M: len += 3; /* "-##" */ + PD_FALLTHROUGH; case NPY_FR_Y: len += 21; /* 64-bit year */ break; @@ -823,10 +835,10 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) { * Returns 0 on success, -1 on failure (for example if the output * string was too short). */ -int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, +int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, size_t outlen, int utc, NPY_DATETIMEUNIT base) { char *substr = outstr; - int sublen = outlen; + size_t sublen = outlen; int tmplen; /* @@ -851,7 +863,7 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, tmplen = snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year); #endif // _WIN32 /* If it ran out of space or there isn't space for the NULL terminator */ - if (tmplen < 0 || tmplen > sublen) { + if (tmplen < 0 || (size_t)tmplen > sublen) { goto string_too_short; } substr += tmplen; diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c index 876d9f276f5d2..c8d8b5ab6bd6e 100644 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c @@ -40,6 +40,7 @@ Numeric decoder derived from TCL library // Licence at LICENSES/ULTRAJSON_LICENSE +#include "pandas/portable.h" #include "pandas/vendored/ujson/lib/ultrajson.h" #include #include @@ -461,6 +462,7 @@ int Buffer_EscapeStringUnvalidated(JSONObjectEncoder *enc, const char *io, { if (enc->encodeHTMLChars) { // Fall through to \u00XX case below. + PD_FALLTHROUGH; } else { // Same as default case below. (*of++) = (*io); @@ -645,6 +647,7 @@ int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc, case 29: { if (enc->encodeHTMLChars) { // Fall through to \u00XX case 30 below. + PD_FALLTHROUGH; } else { // Same as case 1 above. 
*(of++) = (*io++); diff --git a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c index b7ee58c63a275..7cc20a52f1849 100644 --- a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c @@ -42,66 +42,74 @@ Numeric decoder derived from TCL library #define PY_SSIZE_T_CLEAN #include -static int Object_objectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) { +static int Object_objectAddKey(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ name, + JSOBJ value) { int ret = PyDict_SetItem(obj, name, value); Py_DECREF((PyObject *)name); Py_DECREF((PyObject *)value); return ret == 0 ? 1 : 0; } -static int Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) { +static int Object_arrayAddItem(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ value) { int ret = PyList_Append(obj, value); Py_DECREF((PyObject *)value); return ret == 0 ? 1 : 0; } -static JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) { +static JSOBJ Object_newString(void *Py_UNUSED(prv), wchar_t *start, + wchar_t *end) { return PyUnicode_FromWideChar(start, (end - start)); } -static JSOBJ Object_newTrue(void *prv) { Py_RETURN_TRUE; } +static JSOBJ Object_newTrue(void *Py_UNUSED(prv)) { Py_RETURN_TRUE; } -static JSOBJ Object_newFalse(void *prv) { Py_RETURN_FALSE; } +static JSOBJ Object_newFalse(void *Py_UNUSED(prv)) { Py_RETURN_FALSE; } -static JSOBJ Object_newNull(void *prv) { Py_RETURN_NONE; } +static JSOBJ Object_newNull(void *Py_UNUSED(prv)) { Py_RETURN_NONE; } -static JSOBJ Object_newPosInf(void *prv) { +static JSOBJ Object_newPosInf(void *Py_UNUSED(prv)) { return PyFloat_FromDouble(Py_HUGE_VAL); } -static JSOBJ Object_newNegInf(void *prv) { +static JSOBJ Object_newNegInf(void *Py_UNUSED(prv)) { return PyFloat_FromDouble(-Py_HUGE_VAL); } -static JSOBJ Object_newObject(void *prv, void *decoder) { return PyDict_New(); } +static JSOBJ Object_newObject(void *Py_UNUSED(prv), void *Py_UNUSED(decoder)) { + return PyDict_New(); +} -static JSOBJ Object_endObject(void *prv, JSOBJ obj) { return obj; } +static JSOBJ Object_endObject(void *Py_UNUSED(prv), JSOBJ obj) { return obj; } -static JSOBJ Object_newArray(void *prv, void *decoder) { return PyList_New(0); } +static JSOBJ Object_newArray(void *Py_UNUSED(prv), void *Py_UNUSED(decoder)) { + return PyList_New(0); +} -static JSOBJ Object_endArray(void *prv, JSOBJ obj) { return obj; } +static JSOBJ Object_endArray(void *Py_UNUSED(prv), JSOBJ obj) { return obj; } -static JSOBJ Object_newInteger(void *prv, JSINT32 value) { - return PyLong_FromLong((long)value); +static JSOBJ Object_newInteger(void *Py_UNUSED(prv), JSINT32 value) { + return PyLong_FromLong(value); } -static JSOBJ Object_newLong(void *prv, JSINT64 value) { +static JSOBJ Object_newLong(void *Py_UNUSED(prv), JSINT64 value) { return PyLong_FromLongLong(value); } -static JSOBJ Object_newUnsignedLong(void *prv, JSUINT64 value) { +static JSOBJ Object_newUnsignedLong(void *Py_UNUSED(prv), JSUINT64 value) { return PyLong_FromUnsignedLongLong(value); } -static JSOBJ Object_newDouble(void *prv, double value) { +static JSOBJ Object_newDouble(void *Py_UNUSED(prv), double value) { return PyFloat_FromDouble(value); } -static void Object_releaseObject(void *prv, JSOBJ obj, void *_decoder) { +static void Object_releaseObject(void *Py_UNUSED(prv), JSOBJ obj, + void *Py_UNUSED(decoder)) { Py_XDECREF(((PyObject *)obj)); } -PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) { +PyObject *JSONToObj(PyObject *Py_UNUSED(self), 
PyObject *args, + PyObject *kwargs) { JSONObjectDecoder dec = {.newString = Object_newString, .objectAddKey = Object_objectAddKey, .arrayAddItem = Object_arrayAddItem, diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index ef88b97918d76..8bba95dd456de 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -66,9 +66,9 @@ int object_is_na_type(PyObject *obj); typedef struct __NpyArrContext { PyObject *array; char *dataptr; - int curdim; // current dimension in array's order - int stridedim; // dimension we are striding over - int inc; // stride dimension increment (+/- 1) + npy_intp curdim; // current dimension in array's order + npy_intp stridedim; // dimension we are striding over + int inc; // stride dimension increment (+/- 1) npy_intp dim; npy_intp stride; npy_intp ndim; @@ -81,8 +81,8 @@ typedef struct __NpyArrContext { } NpyArrContext; typedef struct __PdBlockContext { - int colIdx; - int ncols; + Py_ssize_t colIdx; + Py_ssize_t ncols; int transpose; NpyArrContext **npyCtxts; // NpyArrContext for each column @@ -1934,40 +1934,42 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, PyObject *odefHandler = 0; int indent = 0; - PyObjectEncoder pyEncoder = {{ - Object_beginTypeContext, - Object_endTypeContext, - Object_getStringValue, - Object_getLongValue, - NULL, // getIntValue is unused - Object_getDoubleValue, - Object_getBigNumStringValue, - Object_iterBegin, - Object_iterNext, - Object_iterEnd, - Object_iterGetValue, - Object_iterGetName, - Object_releaseObject, - PyObject_Malloc, - PyObject_Realloc, - PyObject_Free, - -1, // recursionMax - idoublePrecision, - 1, // forceAscii - 0, // encodeHTMLChars - indent, // indent - }}; + PyObjectEncoder pyEncoder = { + { + .beginTypeContext = Object_beginTypeContext, + .endTypeContext = Object_endTypeContext, + .getStringValue = Object_getStringValue, + .getLongValue = Object_getLongValue, + .getIntValue = NULL, + .getDoubleValue = Object_getDoubleValue, + .getBigNumStringValue = Object_getBigNumStringValue, + .iterBegin = Object_iterBegin, + .iterNext = Object_iterNext, + .iterEnd = Object_iterEnd, + .iterGetValue = Object_iterGetValue, + .iterGetName = Object_iterGetName, + .releaseObject = Object_releaseObject, + .malloc = PyObject_Malloc, + .realloc = PyObject_Realloc, + .free = PyObject_Free, + .recursionMax = -1, + .doublePrecision = idoublePrecision, + .forceASCII = 1, + .encodeHTMLChars = 0, + .indent = indent, + .errorMsg = NULL, + }, + .npyCtxtPassthru = NULL, + .blkCtxtPassthru = NULL, + .npyType = -1, + .npyValue = NULL, + .datetimeIso = 0, + .datetimeUnit = NPY_FR_ms, + .outputFormat = COLUMNS, + .defaultHandler = NULL, + }; JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder; - pyEncoder.npyCtxtPassthru = NULL; - pyEncoder.blkCtxtPassthru = NULL; - pyEncoder.npyType = -1; - pyEncoder.npyValue = NULL; - pyEncoder.datetimeIso = 0; - pyEncoder.datetimeUnit = NPY_FR_ms; - pyEncoder.outputFormat = COLUMNS; - pyEncoder.defaultHandler = 0; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOOi", kwlist, &oinput, &oensureAscii, &idoublePrecision, &oencodeHTMLChars, &sOrient, &sdateFormat, diff --git a/pandas/_libs/src/vendored/ujson/python/ujson.c b/pandas/_libs/src/vendored/ujson/python/ujson.c index 351f5226b57c9..075411a23b075 100644 --- a/pandas/_libs/src/vendored/ujson/python/ujson.c +++ b/pandas/_libs/src/vendored/ujson/python/ujson.c @@ -56,9 +56,11 @@ PyObject 
*JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs); "encode_html_chars=True to encode < > & as unicode escape sequences." static PyMethodDef ujsonMethods[] = { - {"ujson_dumps", (PyCFunction)objToJSON, METH_VARARGS | METH_KEYWORDS, + {"ujson_dumps", (PyCFunction)(void (*)(void))objToJSON, + METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursively into JSON. " ENCODER_HELP_TEXT}, - {"ujson_loads", (PyCFunction)JSONToObj, METH_VARARGS | METH_KEYWORDS, + {"ujson_loads", (PyCFunction)(void (*)(void))JSONToObj, + METH_VARARGS | METH_KEYWORDS, "Converts JSON as string to dict object structure. Use precise_float=True " "to use high precision float decoder."}, {NULL, NULL, 0, NULL} /* Sentinel */ diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index c622121578dcb..88a9a259ac8ec 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -30,19 +30,16 @@ "get_unit_from_dtype", "periods_per_day", "periods_per_second", - "is_supported_unit", - "npy_unit_to_abbrev", - "get_supported_reso", "guess_datetime_format", + "add_overflowsafe", + "get_supported_dtype", + "is_supported_dtype", ] from pandas._libs.tslibs import dtypes # pylint: disable=import-self from pandas._libs.tslibs.conversion import localize_pydatetime from pandas._libs.tslibs.dtypes import ( Resolution, - get_supported_reso, - is_supported_unit, - npy_unit_to_abbrev, periods_per_day, periods_per_second, ) @@ -55,7 +52,10 @@ from pandas._libs.tslibs.np_datetime import ( OutOfBoundsDatetime, OutOfBoundsTimedelta, + add_overflowsafe, astype_overflowsafe, + get_supported_dtype, + is_supported_dtype, is_unitless, py_get_unit_from_dtype as get_unit_from_dtype, ) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 8cca5598d0e06..4def5e2c9340e 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -599,11 +599,13 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, # Issue 9000, we short-circuit rather than going # into np_datetime_strings which returns utc dt = datetime.now(tz) + return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=NPY_FR_us) elif ts == "today": # Issue 9000, we short-circuit rather than going # into np_datetime_strings which returns a normalized datetime dt = datetime.now(tz) # equiv: datetime.today().replace(tzinfo=tz) + return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=NPY_FR_us) else: string_to_dts_failed = string_to_dts( ts, &dts, &out_bestunit, &out_local, @@ -647,8 +649,6 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, reso = get_supported_reso(out_bestunit) return convert_datetime_to_tsobject(dt, tz, nanos=nanos, reso=reso) - return convert_datetime_to_tsobject(dt, tz) - cdef check_overflows(_TSObject obj, NPY_DATETIMEUNIT reso=NPY_FR_ns): """ diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index bda4fcf04234b..88cfa6ca60d93 100644 --- a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -3,13 +3,13 @@ from numpy cimport int64_t from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT -cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit) +cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit) cpdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev) cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) noexcept nogil cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=*) except? -1 cpdef int64_t periods_per_second(NPY_DATETIMEUNIT reso) except? 
-1 -cpdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso) -cpdef bint is_supported_unit(NPY_DATETIMEUNIT reso) +cdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso) +cdef bint is_supported_unit(NPY_DATETIMEUNIT reso) cpdef freq_to_period_freqstr(freq_n, freq_name) cdef dict c_OFFSET_TO_PERIOD_FREQSTR diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi index 76649aaaa41bf..72d12ca2d9dc7 100644 --- a/pandas/_libs/tslibs/dtypes.pyi +++ b/pandas/_libs/tslibs/dtypes.pyi @@ -4,9 +4,6 @@ OFFSET_TO_PERIOD_FREQSTR: dict[str, str] def periods_per_day(reso: int = ...) -> int: ... def periods_per_second(reso: int) -> int: ... -def is_supported_unit(reso: int) -> bool: ... -def npy_unit_to_abbrev(unit: int) -> str: ... -def get_supported_reso(reso: int) -> int: ... def abbrev_to_npy_unit(abbrev: str) -> int: ... def freq_to_period_freqstr(freq_n: int, freq_name: str) -> str: ... diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 17f517e5e7264..52e1133e596c5 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -559,7 +559,7 @@ class NpyDatetimeUnit(Enum): NPY_FR_GENERIC = NPY_DATETIMEUNIT.NPY_FR_GENERIC -cpdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso): +cdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso): # If we have an unsupported reso, return the nearest supported reso. if reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: # TODO: or raise ValueError? trying this gives unraisable errors, but @@ -572,7 +572,7 @@ cpdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso): return reso -cpdef bint is_supported_unit(NPY_DATETIMEUNIT reso): +cdef bint is_supported_unit(NPY_DATETIMEUNIT reso): return ( reso == NPY_DATETIMEUNIT.NPY_FR_ns or reso == NPY_DATETIMEUNIT.NPY_FR_us @@ -581,7 +581,7 @@ cpdef bint is_supported_unit(NPY_DATETIMEUNIT reso): ) -cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit): +cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit): if unit == NPY_DATETIMEUNIT.NPY_FR_ns or unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC: # generic -> default to nanoseconds return "ns" diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index a87c3d3f0955d..cb2658d343772 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -118,3 +118,5 @@ cdef int64_t convert_reso( NPY_DATETIMEUNIT to_reso, bint round_ok, ) except? -1 + +cpdef cnp.ndarray add_overflowsafe(cnp.ndarray left, cnp.ndarray right) diff --git a/pandas/_libs/tslibs/np_datetime.pyi b/pandas/_libs/tslibs/np_datetime.pyi index c42bc43ac9d89..00ef35c50e532 100644 --- a/pandas/_libs/tslibs/np_datetime.pyi +++ b/pandas/_libs/tslibs/np_datetime.pyi @@ -19,3 +19,9 @@ def is_unitless(dtype: np.dtype) -> bool: ... def compare_mismatched_resolutions( left: np.ndarray, right: np.ndarray, op ) -> npt.NDArray[np.bool_]: ... +def add_overflowsafe( + left: npt.NDArray[np.int64], + right: npt.NDArray[np.int64], +) -> npt.NDArray[np.int64]: ... +def get_supported_dtype(dtype: np.dtype) -> np.dtype: ... +def is_supported_dtype(dtype: np.dtype) -> bool: ... 
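The three new stubs replace the removed ``get_supported_reso``/``is_supported_unit``/``npy_unit_to_abbrev`` exports; a hedged usage sketch against pandas internals (import path taken from the ``tslibs/__init__.py`` hunk above):

```python
import numpy as np
from pandas._libs.tslibs import (
    add_overflowsafe,
    get_supported_dtype,
    is_supported_dtype,
)

# Unsupported resolutions are mapped to the nearest supported one (s/ms/us/ns)
assert get_supported_dtype(np.dtype("M8[D]")) == np.dtype("M8[s]")
assert is_supported_dtype(np.dtype("m8[ns]"))
assert not is_supported_dtype(np.dtype("m8[ps]"))

# add_overflowsafe raises instead of silently wrapping int64 arithmetic
i64max = np.array([np.iinfo(np.int64).max], dtype=np.int64)
one = np.array([1], dtype=np.int64)
try:
    add_overflowsafe(i64max, one)
except OverflowError as err:
    print(err)  # Overflow in int64 addition
```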
diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 9958206c51b7a..54a5bcf3164ee 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,3 +1,4 @@ +cimport cython from cpython.datetime cimport ( PyDateTime_CheckExact, PyDateTime_DATE_GET_HOUR, @@ -38,6 +39,8 @@ from numpy cimport ( ) from pandas._libs.tslibs.dtypes cimport ( + get_supported_reso, + is_supported_unit, npy_unit_to_abbrev, npy_unit_to_attrname, ) @@ -90,6 +93,28 @@ def py_get_unit_from_dtype(dtype): return get_unit_from_dtype(dtype) +def get_supported_dtype(dtype: cnp.dtype) -> cnp.dtype: + reso = get_unit_from_dtype(dtype) + new_reso = get_supported_reso(reso) + new_unit = npy_unit_to_abbrev(new_reso) + + # Accessing dtype.kind here incorrectly(?) gives "" instead of "m"/"M", + # so we check type_num instead + if dtype.type_num == cnp.NPY_DATETIME: + new_dtype = np.dtype(f"M8[{new_unit}]") + else: + new_dtype = np.dtype(f"m8[{new_unit}]") + return new_dtype + + +def is_supported_dtype(dtype: cnp.dtype) -> bool: + if dtype.type_num not in [cnp.NPY_DATETIME, cnp.NPY_TIMEDELTA]: + raise ValueError("is_unitless dtype must be datetime64 or timedelta64") + cdef: + NPY_DATETIMEUNIT unit = get_unit_from_dtype(dtype) + return is_supported_unit(unit) + + def is_unitless(dtype: cnp.dtype) -> bool: """ Check if a datetime64 or timedelta64 dtype has no attached unit. @@ -678,3 +703,43 @@ cdef int64_t _convert_reso_with_dtstruct( raise OutOfBoundsDatetime from err return result + + +@cython.overflowcheck(True) +cpdef cnp.ndarray add_overflowsafe(cnp.ndarray left, cnp.ndarray right): + """ + Overflow-safe addition for datetime64/timedelta64 dtypes. + + `right` may either be zero-dim or of the same shape as `left`. + """ + cdef: + Py_ssize_t N = left.size + int64_t lval, rval, res_value + ndarray iresult = cnp.PyArray_EMPTY( + left.ndim, left.shape, cnp.NPY_INT64, 0 + ) + cnp.broadcast mi = cnp.PyArray_MultiIterNew3(iresult, left, right) + + # Note: doing this try/except outside the loop improves performance over + # doing it inside the loop. + try: + for i in range(N): + # Analogous to: lval = lvalues[i] + lval = (cnp.PyArray_MultiIter_DATA(mi, 1))[0] + + # Analogous to: rval = rvalues[i] + rval = (cnp.PyArray_MultiIter_DATA(mi, 2))[0] + + if lval == NPY_DATETIME_NAT or rval == NPY_DATETIME_NAT: + res_value = NPY_DATETIME_NAT + else: + res_value = lval + rval + + # Analogous to: result[i] = res_value + (cnp.PyArray_MultiIter_DATA(mi, 0))[0] = res_value + + cnp.PyArray_MultiIter_NEXT(mi) + except OverflowError as err: + raise OverflowError("Overflow in int64 addition") from err + + return iresult diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index 846d238beadbd..22f3bdbe668de 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -63,7 +63,7 @@ class PeriodMixin: def end_time(self) -> Timestamp: ... @property def start_time(self) -> Timestamp: ... - def _require_matching_freq(self, other, base: bool = ...) -> None: ... + def _require_matching_freq(self, other: BaseOffset, base: bool = ...) -> None: ... class Period(PeriodMixin): ordinal: int # int64_t @@ -87,7 +87,7 @@ class Period(PeriodMixin): @classmethod def _maybe_convert_freq(cls, freq) -> BaseOffset: ... @classmethod - def _from_ordinal(cls, ordinal: int, freq) -> Period: ... + def _from_ordinal(cls, ordinal: int, freq: BaseOffset) -> Period: ... @classmethod def now(cls, freq: Frequency) -> Period: ... 
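A pure-Python sketch of the semantics implemented by the ``add_overflowsafe`` hunk above: NaT-sentinel propagation plus an explicit overflow check (the real version iterates via ``PyArray_MultiIterNew3`` under ``cython.overflowcheck``):

```python
import numpy as np

NAT = np.iinfo(np.int64).min      # NPY_DATETIME_NAT sentinel
I64_MIN, I64_MAX = np.iinfo(np.int64).min, np.iinfo(np.int64).max


def add_overflowsafe_sketch(left: np.ndarray, right: np.ndarray) -> np.ndarray:
    """NaT-aware int64 addition that raises instead of wrapping around."""
    bcast = np.broadcast(left, right)            # `right` may be zero-dim
    result = np.empty(bcast.shape, dtype=np.int64)
    for i, (lval, rval) in enumerate(bcast):
        if lval == NAT or rval == NAT:
            result.flat[i] = NAT                 # NaT propagates
        else:
            res = int(lval) + int(rval)          # Python ints cannot overflow
            if not I64_MIN <= res <= I64_MAX:
                raise OverflowError("Overflow in int64 addition")
            result.flat[i] = res
    return result


# e.g. add_overflowsafe_sketch(np.array([1, NAT]), np.array(5)) -> [6, NAT]
```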
def strftime(self, fmt: str | None) -> str: ... diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 22f96f8f6c3fa..eeaf472c23b60 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1714,21 +1714,16 @@ cdef class PeriodMixin: """ return self.to_timestamp(how="end") - def _require_matching_freq(self, other, base=False): + def _require_matching_freq(self, other: BaseOffset, bint base=False): # See also arrays.period.raise_on_incompatible - if is_offset_object(other): - other_freq = other - else: - other_freq = other.freq - if base: - condition = self.freq.base != other_freq.base + condition = self.freq.base != other.base else: - condition = self.freq != other_freq + condition = self.freq != other if condition: freqstr = freq_to_period_freqstr(self.freq.n, self.freq.name) - other_freqstr = freq_to_period_freqstr(other_freq.n, other_freq.name) + other_freqstr = freq_to_period_freqstr(other.n, other.name) msg = DIFFERENT_FREQ.format( cls=type(self).__name__, own_freq=freqstr, @@ -1761,21 +1756,12 @@ cdef class _Period(PeriodMixin): @classmethod def _maybe_convert_freq(cls, object freq) -> BaseOffset: """ - Internally we allow integer and tuple representations (for now) that - are not recognized by to_offset, so we convert them here. Also, a - Period's freq attribute must have `freq.n > 0`, which we check for here. + A Period's freq attribute must have `freq.n > 0`, which we check for here. Returns ------- DateOffset """ - if isinstance(freq, int): - # We already have a dtype code - dtype = PeriodDtypeBase(freq, 1) - freq = dtype._freqstr - elif isinstance(freq, PeriodDtypeBase): - freq = freq._freqstr - freq = to_offset(freq, is_period=True) if freq.n <= 0: @@ -1785,7 +1771,7 @@ cdef class _Period(PeriodMixin): return freq @classmethod - def _from_ordinal(cls, ordinal: int64_t, freq) -> "Period": + def _from_ordinal(cls, ordinal: int64_t, freq: BaseOffset) -> "Period": """ Fast creation from an ordinal and freq that are already validated! 
""" @@ -1803,7 +1789,7 @@ cdef class _Period(PeriodMixin): return False elif op == Py_NE: return True - self._require_matching_freq(other) + self._require_matching_freq(other.freq) return PyObject_RichCompareBool(self.ordinal, other.ordinal, op) elif other is NaT: return op == Py_NE @@ -1893,7 +1879,7 @@ cdef class _Period(PeriodMixin): ): return self + (-other) elif is_period_object(other): - self._require_matching_freq(other) + self._require_matching_freq(other.freq) # GH 23915 - mul by base freq since __add__ is agnostic of n return (self.ordinal - other.ordinal) * self.freq.base elif other is NaT: @@ -1993,8 +1979,10 @@ cdef class _Period(PeriodMixin): return endpoint - np.timedelta64(1, "ns") if freq is None: - freq = self._dtype._get_to_timestamp_base() - base = freq + freq_code = self._dtype._get_to_timestamp_base() + dtype = PeriodDtypeBase(freq_code, 1) + freq = dtype._freqstr + base = freq_code else: freq = self._maybe_convert_freq(freq) base = freq._period_dtype_code @@ -2841,7 +2829,8 @@ class Period(_Period): FutureWarning, stacklevel=find_stack_level(), ) - + if ordinal == NPY_NAT: + return NaT return cls._from_ordinal(ordinal, freq) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 6cad71b3dfd18..d9db2bc5cddb4 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -1206,8 +1206,8 @@ def assert_frame_equal( # compare by blocks if by_blocks: - rblocks = right._to_dict_of_blocks(copy=False) - lblocks = left._to_dict_of_blocks(copy=False) + rblocks = right._to_dict_of_blocks() + lblocks = left._to_dict_of_blocks() for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))): assert dtype in lblocks assert dtype in rblocks diff --git a/pandas/conftest.py b/pandas/conftest.py index 003474b57c8e1..7c829ed4b8cb9 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1903,7 +1903,7 @@ def using_copy_on_write() -> bool: @pytest.fixture def warn_copy_on_write() -> bool: """ - Fixture to check if Copy-on-Write is enabled. + Fixture to check if Copy-on-Write is in warning mode. """ return ( pd.options.mode.copy_on_write == "warn" @@ -1914,9 +1914,9 @@ def warn_copy_on_write() -> bool: @pytest.fixture def using_infer_string() -> bool: """ - Fixture to check if infer_string is enabled. + Fixture to check if infer string option is enabled. """ - return pd.options.future.infer_string + return pd.options.future.infer_string is True warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 82de8ae96160f..ec0fe32163ec8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -932,7 +932,10 @@ def value_counts_internal( idx = Index(keys) if idx.dtype == bool and keys.dtype == object: idx = idx.astype(object) - elif idx.dtype != keys.dtype: + elif ( + idx.dtype != keys.dtype # noqa: PLR1714 # # pylint: disable=R1714 + and idx.dtype != "string[pyarrow_numpy]" + ): warnings.warn( # GH#56161 "The behavior of value_counts with object-dtype is deprecated. " @@ -1119,98 +1122,6 @@ def rank( return ranks -def checked_add_with_arr( - arr: npt.NDArray[np.int64], - b: int | npt.NDArray[np.int64], - arr_mask: npt.NDArray[np.bool_] | None = None, - b_mask: npt.NDArray[np.bool_] | None = None, -) -> npt.NDArray[np.int64]: - """ - Perform array addition that checks for underflow and overflow. - - Performs the addition of an int64 array and an int64 integer (or array) - but checks that they do not result in overflow first. 
For elements that - are indicated to be NaN, whether or not there is overflow for that element - is automatically ignored. - - Parameters - ---------- - arr : np.ndarray[int64] addend. - b : array or scalar addend. - arr_mask : np.ndarray[bool] or None, default None - array indicating which elements to exclude from checking - b_mask : np.ndarray[bool] or None, default None - array or scalar indicating which element(s) to exclude from checking - - Returns - ------- - sum : An array for elements x + b for each element x in arr if b is - a scalar or an array for elements x + y for each element pair - (x, y) in (arr, b). - - Raises - ------ - OverflowError if any x + y exceeds the maximum or minimum int64 value. - """ - # For performance reasons, we broadcast 'b' to the new array 'b2' - # so that it has the same size as 'arr'. - b2 = np.broadcast_to(b, arr.shape) - if b_mask is not None: - # We do the same broadcasting for b_mask as well. - b2_mask = np.broadcast_to(b_mask, arr.shape) - else: - b2_mask = None - - # For elements that are NaN, regardless of their value, we should - # ignore whether they overflow or not when doing the checked add. - if arr_mask is not None and b2_mask is not None: - not_nan = np.logical_not(arr_mask | b2_mask) - elif arr_mask is not None: - not_nan = np.logical_not(arr_mask) - elif b_mask is not None: - # error: Argument 1 to "__call__" of "_UFunc_Nin1_Nout1" has - # incompatible type "Optional[ndarray[Any, dtype[bool_]]]"; - # expected "Union[_SupportsArray[dtype[Any]], _NestedSequence - # [_SupportsArray[dtype[Any]]], bool, int, float, complex, str - # , bytes, _NestedSequence[Union[bool, int, float, complex, str - # , bytes]]]" - not_nan = np.logical_not(b2_mask) # type: ignore[arg-type] - else: - not_nan = np.empty(arr.shape, dtype=bool) - not_nan.fill(True) - - # gh-14324: For each element in 'arr' and its corresponding element - # in 'b2', we check the sign of the element in 'b2'. If it is positive, - # we then check whether its sum with the element in 'arr' exceeds - # np.iinfo(np.int64).max. If so, we have an overflow error. If it - # it is negative, we then check whether its sum with the element in - # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow - # error as well. 
- i8max = lib.i8max - i8min = iNaT - - mask1 = b2 > 0 - mask2 = b2 < 0 - - if not mask1.any(): - to_raise = ((i8min - b2 > arr) & not_nan).any() - elif not mask2.any(): - to_raise = ((i8max - b2 < arr) & not_nan).any() - else: - to_raise = ((i8max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or ( - (i8min - b2[mask2] > arr[mask2]) & not_nan[mask2] - ).any() - - if to_raise: - raise OverflowError("Overflow in int64 addition") - - result = arr + b - if arr_mask is not None or b2_mask is not None: - np.putmask(result, ~not_nan, iNaT) - - return result - - # ---- # # take # # ---- # diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index d6f4dbfe7f549..cb8f802239146 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -13,10 +13,7 @@ from pandas._libs import lib from pandas._libs.arrays import NDArrayBacked -from pandas._libs.tslibs import ( - get_unit_from_dtype, - is_supported_unit, -) +from pandas._libs.tslibs import is_supported_dtype from pandas._typing import ( ArrayLike, AxisInt, @@ -141,16 +138,12 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike: cls = dtype.construct_array_type() # type: ignore[assignment] dt64_values = arr.view(f"M8[{dtype.unit}]") return cls(dt64_values, dtype=dtype) - elif lib.is_np_dtype(dtype, "M") and is_supported_unit( - get_unit_from_dtype(dtype) - ): + elif lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype): from pandas.core.arrays import DatetimeArray dt64_values = arr.view(dtype) return DatetimeArray(dt64_values, dtype=dtype) - elif lib.is_np_dtype(dtype, "m") and is_supported_unit( - get_unit_from_dtype(dtype) - ): + elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype): from pandas.core.arrays import TimedeltaArray td64_values = arr.view(dtype) @@ -430,6 +423,12 @@ def _where(self: Self, mask: npt.NDArray[np.bool_], value) -> Self: value = self._validate_setitem_value(value) res_values = np.where(mask, self._ndarray, value) + if res_values.dtype != self._ndarray.dtype: + raise AssertionError( + # GH#56410 + "Something has gone wrong, please report a bug at " + "github.com/pandas-dev/pandas/" + ) return self._from_backing_data(res_values) # ------------------------------------------------------------------------ diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 1609bf50a834a..d8b074fe61322 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -668,16 +668,22 @@ def _evaluate_op_method(self, other, op, arrow_funcs): pa_type = self._pa_array.type other = self._box_pa(other) - if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [ - operator.add, - roperator.radd, - ]: - sep = pa.scalar("", type=pa_type) - if op is operator.add: - result = pc.binary_join_element_wise(self._pa_array, other, sep) - else: - result = pc.binary_join_element_wise(other, self._pa_array, sep) - return type(self)(result) + if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type): + if op in [operator.add, roperator.radd, operator.mul, roperator.rmul]: + sep = pa.scalar("", type=pa_type) + if op is operator.add: + result = pc.binary_join_element_wise(self._pa_array, other, sep) + elif op is roperator.radd: + result = pc.binary_join_element_wise(other, self._pa_array, sep) + else: + if not ( + isinstance(other, pa.Scalar) and pa.types.is_integer(other.type) + ): + raise TypeError("Can only string multiply by an integer.") + result = pc.binary_join_element_wise( + *([self._pa_array] * other.as_py()), sep 
+ ) + return type(self)(result) if ( isinstance(other, pa.Scalar) @@ -1017,7 +1023,7 @@ def fillna( return super().fillna(value=value, method=method, limit=limit, copy=copy) - def isin(self, values) -> npt.NDArray[np.bool_]: + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: # short-circuit to return all False array. if not len(values): return np.zeros(len(self), dtype=bool) @@ -2149,7 +2155,15 @@ def _str_replace( ) func = pc.replace_substring_regex if regex else pc.replace_substring - result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n) + # https://github.com/apache/arrow/issues/39149 + # GH 56404, unexpected behavior with negative max_replacements with pyarrow. + pa_max_replacements = None if n < 0 else n + result = func( + self._pa_array, + pattern=pat, + replacement=repl, + max_replacements=pa_max_replacements, + ) return type(self)(result) def _str_repeat(self, repeats: int | Sequence[int]): @@ -2179,7 +2193,8 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) result = pc.find_substring(slices, sub) not_found = pc.equal(result, -1) - offset_result = pc.add(result, end - start) + start_offset = max(0, start) + offset_result = pc.add(result, start_offset) result = pc.if_else(not_found, result, offset_result) elif start == 0 and end is None: slices = self._pa_array diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e61e374009163..3272a594f4cf4 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1355,7 +1355,7 @@ def equals(self, other: object) -> bool: equal_na = self.isna() & other.isna() # type: ignore[operator] return bool((equal_values | equal_na).all()) - def isin(self, values) -> npt.NDArray[np.bool_]: + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: """ Pointwise comparison for set containment in the given values. @@ -1363,7 +1363,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]: Parameters ---------- - values : Sequence + values : np.ndarray or ExtensionArray Returns ------- diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index eec833c600177..20aec52b606b6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2570,7 +2570,7 @@ def describe(self) -> DataFrame: return result - def isin(self, values) -> npt.NDArray[np.bool_]: + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: """ Check whether `values` are contained in Categorical. @@ -2580,7 +2580,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]: Parameters ---------- - values : set or list-like + values : np.ndarray or ExtensionArray The sequence of values to test. Passing in a single string will raise a ``TypeError``. Instead, turn a single string into a list of one element. 
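A rough sketch of the user-visible effect of the `arrow/array.py` hunks above, assuming a pyarrow-backed build; `pd.ArrowDtype(pa.string())` is just one way to reach `ArrowExtensionArray`:

    import pandas as pd
    import pyarrow as pa

    ser = pd.Series(["ab", "cd"], dtype=pd.ArrowDtype(pa.string()))

    # str * int now repeats elementwise via pc.binary_join_element_wise
    print(ser * 3)  # ["ababab", "cdcdcd"]

    try:
        ser * "x"  # non-integer multipliers are rejected
    except TypeError as err:
        print(err)  # Can only string multiply by an integer.

    # negative n in str.replace now maps to max_replacements=None,
    # i.e. replace all occurrences (GH 56404)
    print(ser.str.replace("a", "b", n=-1))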
@@ -2611,13 +2611,6 @@ def isin(self, values) -> npt.NDArray[np.bool_]: >>> s.isin(['lama']) array([ True, False, True, False, True, False]) """ - if not is_list_like(values): - values_type = type(values).__name__ - raise TypeError( - "only list-like objects are allowed to be passed " - f"to isin(), you passed a `{values_type}`" - ) - values = sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) code_values = self.categories.get_indexer_for(values) code_values = code_values[null_mask | (code_values >= 0)] @@ -2626,6 +2619,8 @@ def isin(self, values) -> npt.NDArray[np.bool_]: def _replace(self, *, to_replace, value, inplace: bool = False): from pandas import Index + orig_dtype = self.dtype + inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() @@ -2656,6 +2651,17 @@ def _replace(self, *, to_replace, value, inplace: bool = False): new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered) NDArrayBacked.__init__(cat, new_codes, new_dtype) + if new_dtype != orig_dtype: + warnings.warn( + # GH#55147 + "The behavior of Series.replace (and DataFrame.replace) with " + "CategoricalDtype is deprecated. In a future version, replace " + "will only be used for cases that preserve the categories. " + "To change the categories, use ser.cat.rename_categories " + "instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) if not inplace: return cat diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a88f40013b3f6..91dd40c2deced 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -35,6 +35,7 @@ Tick, Timedelta, Timestamp, + add_overflowsafe, astype_overflowsafe, get_unit_from_dtype, iNaT, @@ -112,7 +113,6 @@ ops, ) from pandas.core.algorithms import ( - checked_add_with_arr, isin, map_array, unique1d, @@ -646,6 +646,9 @@ def _validation_error_message(self, value, allow_listlike: bool = False) -> str: def _validate_listlike(self, value, allow_object: bool = False): if isinstance(value, type(self)): + if self.dtype.kind in "mM" and not allow_object: + # error: "DatetimeLikeArrayMixin" has no attribute "as_unit" + value = value.as_unit(self.unit, round_ok=False) # type: ignore[attr-defined] return value if isinstance(value, list) and len(value) == 0: @@ -694,6 +697,9 @@ def _validate_listlike(self, value, allow_object: bool = False): msg = self._validation_error_message(value, True) raise TypeError(msg) + if self.dtype.kind in "mM" and not allow_object: + # error: "DatetimeLikeArrayMixin" has no attribute "as_unit" + value = value.as_unit(self.unit, round_ok=False) # type: ignore[attr-defined] return value def _validate_setitem_value(self, value): @@ -734,26 +740,25 @@ def map(self, mapper, na_action=None): else: return result.array - def isin(self, values) -> npt.NDArray[np.bool_]: + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: """ Compute boolean array of whether each value is found in the passed set of values. 
Parameters ---------- - values : set or sequence of values + values : np.ndarray or ExtensionArray Returns ------- ndarray[bool] """ - if not hasattr(values, "dtype"): - values = np.asarray(values) - if values.dtype.kind in "fiuc": # TODO: de-duplicate with equals, validate_comparison_value return np.zeros(self.shape, dtype=bool) + values = ensure_wrapped_if_datetimelike(values) + if not isinstance(values, type(self)): inferable = [ "timedelta", @@ -764,6 +769,14 @@ def isin(self, values) -> npt.NDArray[np.bool_]: "period", ] if values.dtype == object: + values = lib.maybe_convert_objects( + values, # type: ignore[arg-type] + convert_non_numeric=True, + dtype_if_all_nat=self.dtype, + ) + if values.dtype != object: + return self.isin(values) + inferred = lib.infer_dtype(values, skipna=False) if inferred not in inferable: if inferred == "string": @@ -778,18 +791,36 @@ def isin(self, values) -> npt.NDArray[np.bool_]: values = type(self)._from_sequence(values) except ValueError: return isin(self.astype(object), values) + else: + warnings.warn( + # GH#53111 + f"The behavior of 'isin' with dtype={self.dtype} and " + "castable values (e.g. strings) is deprecated. In a " + "future version, these will not be considered matching " + "by isin. Explicitly cast to the appropriate dtype before " + "calling isin instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) if self.dtype.kind in "mM": self = cast("DatetimeArray | TimedeltaArray", self) - values = values.as_unit(self.unit) + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "as_unit" + values = values.as_unit(self.unit) # type: ignore[union-attr] try: - self._check_compatible_with(values) + # error: Argument 1 to "_check_compatible_with" of "DatetimeLikeArrayMixin" + # has incompatible type "ExtensionArray | ndarray[Any, Any]"; expected + # "Period | Timestamp | Timedelta | NaTType" + self._check_compatible_with(values) # type: ignore[arg-type] except (TypeError, ValueError): # Includes tzawareness mismatch and IncompatibleFrequencyError return np.zeros(self.shape, dtype=bool) - return isin(self.asi8, values.asi8) + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "asi8" + return isin(self.asi8, values.asi8) # type: ignore[union-attr] # ------------------------------------------------------------------ # Null Handling @@ -1013,7 +1044,7 @@ def _get_i8_values_and_mask( self, other ) -> tuple[int | npt.NDArray[np.int64], None | npt.NDArray[np.bool_]]: """ - Get the int64 values and b_mask to pass to checked_add_with_arr. + Get the int64 values and b_mask to pass to add_overflowsafe. 
""" if isinstance(other, Period): i8values = other.ordinal @@ -1069,9 +1100,7 @@ def _add_datetimelike_scalar(self, other) -> DatetimeArray: self = cast("TimedeltaArray", self) other_i8, o_mask = self._get_i8_values_and_mask(other) - result = checked_add_with_arr( - self.asi8, other_i8, arr_mask=self._isnan, b_mask=o_mask - ) + result = add_overflowsafe(self.asi8, np.asarray(other_i8, dtype="i8")) res_values = result.view(f"M8[{self.unit}]") dtype = tz_to_dtype(tz=other.tz, unit=self.unit) @@ -1134,9 +1163,7 @@ def _sub_datetimelike(self, other: Timestamp | DatetimeArray) -> TimedeltaArray: raise type(err)(new_message) from err other_i8, o_mask = self._get_i8_values_and_mask(other) - res_values = checked_add_with_arr( - self.asi8, -other_i8, arr_mask=self._isnan, b_mask=o_mask - ) + res_values = add_overflowsafe(self.asi8, np.asarray(-other_i8, dtype="i8")) res_m8 = res_values.view(f"timedelta64[{self.unit}]") new_freq = self._get_arithmetic_result_freq(other) @@ -1202,9 +1229,7 @@ def _add_timedeltalike(self, other: Timedelta | TimedeltaArray): self = cast("DatetimeArray | TimedeltaArray", self) other_i8, o_mask = self._get_i8_values_and_mask(other) - new_values = checked_add_with_arr( - self.asi8, other_i8, arr_mask=self._isnan, b_mask=o_mask - ) + new_values = add_overflowsafe(self.asi8, np.asarray(other_i8, dtype="i8")) res_values = new_values.view(self._ndarray.dtype) new_freq = self._get_arithmetic_result_freq(other) @@ -1272,9 +1297,7 @@ def _sub_periodlike(self, other: Period | PeriodArray) -> npt.NDArray[np.object_ self._check_compatible_with(other) other_i8, o_mask = self._get_i8_values_and_mask(other) - new_i8_data = checked_add_with_arr( - self.asi8, -other_i8, arr_mask=self._isnan, b_mask=o_mask - ) + new_i8_data = add_overflowsafe(self.asi8, np.asarray(-other_i8, dtype="i8")) new_data = np.array([self.freq.base * x for x in new_i8_data]) if o_mask is None: @@ -2103,7 +2126,9 @@ def _validate_frequency(cls, index, freq: BaseOffset, **kwargs): ) from err @classmethod - def _generate_range(cls, start, end, periods, freq, *args, **kwargs) -> Self: + def _generate_range( + cls, start, end, periods: int | None, freq, *args, **kwargs + ) -> Self: raise AbstractMethodError(cls) # -------------------------------------------------------------- @@ -2119,12 +2144,12 @@ def unit(self) -> str: # "ExtensionDtype"; expected "Union[DatetimeTZDtype, dtype[Any]]" return dtype_to_unit(self.dtype) # type: ignore[arg-type] - def as_unit(self, unit: str) -> Self: + def as_unit(self, unit: str, round_ok: bool = True) -> Self: if unit not in ["s", "ms", "us", "ns"]: raise ValueError("Supported units are 's', 'ms', 'us', 'ns'") dtype = np.dtype(f"{self.dtype.kind}8[{unit}]") - new_values = astype_overflowsafe(self._ndarray, dtype, round_ok=True) + new_values = astype_overflowsafe(self._ndarray, dtype, round_ok=round_ok) if isinstance(self.dtype, np.dtype): new_dtype = new_values.dtype diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 496a6987c3264..0074645a482b2 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -27,14 +27,13 @@ astype_overflowsafe, fields, get_resolution, - get_supported_reso, + get_supported_dtype, get_unit_from_dtype, ints_to_pydatetime, is_date_array_normalized, - is_supported_unit, + is_supported_dtype, is_unitless, normalize_i8_timestamps, - npy_unit_to_abbrev, timezones, to_offset, tz_convert_from_utc, @@ -405,7 +404,7 @@ def _generate_range( # type: ignore[override] cls, start, end, - periods, + periods: int 
| None, freq, tz=None, normalize: bool = False, @@ -441,9 +440,9 @@ def _generate_range( # type: ignore[override] else: unit = "ns" - if start is not None and unit is not None: + if start is not None: start = start.as_unit(unit, round_ok=False) - if end is not None and unit is not None: + if end is not None: end = end.as_unit(unit, round_ok=False) left_inclusive, right_inclusive = validate_inclusive(inclusive) @@ -452,14 +451,8 @@ def _generate_range( # type: ignore[override] if tz is not None: # Localize the start and end arguments - start_tz = None if start is None else start.tz - end_tz = None if end is None else end.tz - start = _maybe_localize_point( - start, start_tz, start, freq, tz, ambiguous, nonexistent - ) - end = _maybe_localize_point( - end, end_tz, end, freq, tz, ambiguous, nonexistent - ) + start = _maybe_localize_point(start, freq, tz, ambiguous, nonexistent) + end = _maybe_localize_point(end, freq, tz, ambiguous, nonexistent) if freq is not None: # We break Day arithmetic (fixed 24 hour) here and opt for @@ -505,6 +498,7 @@ def _generate_range( # type: ignore[override] # Nanosecond-granularity timestamps aren't always correctly # representable with doubles, so we limit the range that we # pass to np.linspace as much as possible + periods = cast(int, periods) i8values = ( np.linspace(0, end._value - start._value, periods, dtype="int64") + start._value @@ -717,7 +711,7 @@ def astype(self, dtype, copy: bool = True): self.tz is None and lib.is_np_dtype(dtype, "M") and not is_unitless(dtype) - and is_supported_unit(get_unit_from_dtype(dtype)) + and is_supported_dtype(dtype) ): # unit conversion e.g. datetime64[s] res_values = astype_overflowsafe(self._ndarray, dtype, copy=True) @@ -2312,7 +2306,7 @@ def _sequence_to_dt64( assert isinstance(result, np.ndarray), type(result) assert result.dtype.kind == "M" assert result.dtype != "M8" - assert is_supported_unit(get_unit_from_dtype(result.dtype)) + assert is_supported_dtype(result.dtype) return result, tz @@ -2326,14 +2320,10 @@ def _construct_from_dt64_naive( # lib.is_np_dtype(data.dtype) new_dtype = data.dtype - data_unit = get_unit_from_dtype(new_dtype) - if not is_supported_unit(data_unit): + if not is_supported_dtype(new_dtype): # Cast to the nearest supported unit, generally "s" - new_reso = get_supported_reso(data_unit) - new_unit = npy_unit_to_abbrev(new_reso) - new_dtype = np.dtype(f"M8[{new_unit}]") + new_dtype = get_supported_dtype(new_dtype) data = astype_overflowsafe(data, dtype=new_dtype, copy=False) - data_unit = get_unit_from_dtype(new_dtype) copy = False if data.dtype.byteorder == ">": @@ -2351,6 +2341,7 @@ def _construct_from_dt64_naive( if data.ndim > 1: data = data.ravel() + data_unit = get_unit_from_dtype(new_dtype) data = tzconversion.tz_localize_to_utc( data.view("i8"), tz, ambiguous=ambiguous, creso=data_unit ) @@ -2557,7 +2548,7 @@ def _validate_dt64_dtype(dtype): if ( isinstance(dtype, np.dtype) - and (dtype.kind != "M" or not is_supported_unit(get_unit_from_dtype(dtype))) + and (dtype.kind != "M" or not is_supported_dtype(dtype)) ) or not isinstance(dtype, (np.dtype, DatetimeTZDtype)): raise ValueError( f"Unexpected value for 'dtype': '{dtype}'. 
" @@ -2688,7 +2679,9 @@ def _maybe_normalize_endpoints( return start, end -def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, nonexistent): +def _maybe_localize_point( + ts: Timestamp | None, freq, tz, ambiguous, nonexistent +) -> Timestamp | None: """ Localize a start or end Timestamp to the timezone of the corresponding start or end Timestamp @@ -2696,8 +2689,6 @@ def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, nonexis Parameters ---------- ts : start or end Timestamp to potentially localize - is_none : argument that should be None - is_not_none : argument that should not be None freq : Tick, DateOffset, or None tz : str, timezone object or None ambiguous: str, localization behavior for ambiguous times @@ -2710,7 +2701,7 @@ def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, nonexis # Make sure start and end are timezone localized if: # 1) freq = a Timedelta-like frequency (Tick) # 2) freq = None i.e. generating a linspaced range - if is_none is None and is_not_none is not None: + if ts is not None and ts.tzinfo is None: # Note: We can't ambiguous='infer' a singular ambiguous time; however, # we have historically defaulted ambiguous=False ambiguous = ambiguous if ambiguous != "infer" else False diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 126484ed4a2a0..383f8a49fd02c 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1789,12 +1789,8 @@ def contains(self, other): other < self._right if self.open_right else other <= self._right ) - def isin(self, values) -> npt.NDArray[np.bool_]: - if not hasattr(values, "dtype"): - values = np.array(values) - values = extract_array(values, extract_numpy=True) - - if isinstance(values.dtype, IntervalDtype): + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: + if isinstance(values, IntervalArray): if self.closed != values.closed: # not comparable -> no overlap return np.zeros(self.shape, dtype=bool) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 201ce44ed0163..b35c1033df384 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -15,10 +15,7 @@ lib, missing as libmissing, ) -from pandas._libs.tslibs import ( - get_unit_from_dtype, - is_supported_unit, -) +from pandas._libs.tslibs import is_supported_dtype from pandas._typing import ( ArrayLike, AstypeArg, @@ -876,9 +873,7 @@ def _maybe_mask_result( return BooleanArray(result, mask, copy=False) - elif lib.is_np_dtype(result.dtype, "m") and is_supported_unit( - get_unit_from_dtype(result.dtype) - ): + elif lib.is_np_dtype(result.dtype, "m") and is_supported_dtype(result.dtype): # e.g. 
test_numeric_arr_mul_tdscalar_numexpr_path from pandas.core.arrays import TimedeltaArray @@ -955,7 +950,7 @@ def take( # error: Return type "BooleanArray" of "isin" incompatible with return type # "ndarray" in supertype "ExtensionArray" - def isin(self, values) -> BooleanArray: # type: ignore[override] + def isin(self, values: ArrayLike) -> BooleanArray: # type: ignore[override] from pandas.core.arrays import BooleanArray # algorithms.isin will eventually convert values to an ndarray, so no extra diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index efe0c0df45e00..d83a37088daec 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -8,10 +8,7 @@ import numpy as np from pandas._libs import lib -from pandas._libs.tslibs import ( - get_unit_from_dtype, - is_supported_unit, -) +from pandas._libs.tslibs import is_supported_dtype from pandas.compat.numpy import function as nv from pandas.core.dtypes.astype import astype_array @@ -553,9 +550,7 @@ def _cmp_method(self, other, op): def _wrap_ndarray_result(self, result: np.ndarray): # If we have timedelta64[ns] result, return a TimedeltaArray instead # of a NumpyExtensionArray - if result.dtype.kind == "m" and is_supported_unit( - get_unit_from_dtype(result.dtype) - ): + if result.dtype.kind == "m" and is_supported_dtype(result.dtype): from pandas.core.arrays import TimedeltaArray return TimedeltaArray._simple_new(result, dtype=result.dtype) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index a8c21cfbb6e2f..e3492dd21ea57 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -25,6 +25,7 @@ NaT, NaTType, Timedelta, + add_overflowsafe, astype_overflowsafe, dt64arr_to_periodarr as c_dt64arr_to_periodarr, get_unit_from_dtype, @@ -35,6 +36,7 @@ ) from pandas._libs.tslibs.dtypes import ( FreqGroup, + PeriodDtypeBase, freq_to_period_freqstr, ) from pandas._libs.tslibs.fields import isleapyear_arr @@ -71,7 +73,6 @@ ) from pandas.core.dtypes.missing import isna -import pandas.core.algorithms as algos from pandas.core.arrays import datetimelike as dtl import pandas.core.common as com @@ -370,10 +371,15 @@ def _unbox_scalar( # type: ignore[override] def _scalar_from_string(self, value: str) -> Period: return Period(value, freq=self.freq) - def _check_compatible_with(self, other) -> None: + # error: Argument 1 of "_check_compatible_with" is incompatible with + # supertype "DatetimeLikeArrayMixin"; supertype defines the argument type + # as "Period | Timestamp | Timedelta | NaTType" + def _check_compatible_with(self, other: Period | NaTType | PeriodArray) -> None: # type: ignore[override] if other is NaT: return - self._require_matching_freq(other) + # error: Item "NaTType" of "Period | NaTType | PeriodArray" has no + # attribute "freq" + self._require_matching_freq(other.freq) # type: ignore[union-attr] # -------------------------------------------------------------------- # Data / Attributes @@ -647,8 +653,10 @@ def to_timestamp(self, freq=None, how: str = "start") -> DatetimeArray: return (self + self.freq).to_timestamp(how="start") - adjust if freq is None: - freq = self._dtype._get_to_timestamp_base() - base = freq + freq_code = self._dtype._get_to_timestamp_base() + dtype = PeriodDtypeBase(freq_code, 1) + freq = dtype._freqstr + base = freq_code else: freq = Period._maybe_convert_freq(freq) base = freq._period_dtype_code @@ -847,7 +855,7 @@ def _addsub_int_array_or_scalar( assert op in [operator.add, operator.sub] if op is operator.sub: other = -other - 
res_values = algos.checked_add_with_arr(self.asi8, other, arr_mask=self._isnan) + res_values = add_overflowsafe(self.asi8, np.asarray(other, dtype="i8")) return type(self)(res_values, dtype=self.dtype) def _add_offset(self, other: BaseOffset): @@ -912,12 +920,7 @@ def _add_timedelta_arraylike( "not an integer multiple of the PeriodArray's freq." ) from err - b_mask = np.isnat(delta) - - res_values = algos.checked_add_with_arr( - self.asi8, delta.view("i8"), arr_mask=self._isnan, b_mask=b_mask - ) - np.putmask(res_values, self._isnan | b_mask, iNaT) + res_values = add_overflowsafe(self.asi8, np.asarray(delta.view("i8"))) return type(self)(res_values, dtype=self.dtype) def _check_timedeltalike_freq_compat(self, other): diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 6eb1387c63a0a..c98fbb836cc6d 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -270,7 +270,7 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame: Examples -------- >>> import scipy.sparse - >>> mat = scipy.sparse.eye(3) + >>> mat = scipy.sparse.eye(3, dtype=float) >>> pd.DataFrame.sparse.from_spmatrix(mat) 0 1 2 0 1.0 0.0 0.0 diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 96ebb4901f797..21fe7cd8180ad 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,6 +1,7 @@ from __future__ import annotations from functools import partial +import operator import re from typing import ( TYPE_CHECKING, @@ -53,6 +54,7 @@ from collections.abc import Sequence from pandas._typing import ( + ArrayLike, AxisInt, Dtype, Scalar, @@ -211,7 +213,7 @@ def _maybe_convert_setitem_value(self, value): raise TypeError("Scalar must be NA or str") return super()._maybe_convert_setitem_value(value) - def isin(self, values) -> npt.NDArray[np.bool_]: + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: value_set = [ pa_scalar.as_py() for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values] @@ -663,7 +665,10 @@ def _convert_int_dtype(self, result): def _cmp_method(self, other, op): result = super()._cmp_method(other, op) - return result.to_numpy(np.bool_, na_value=False) + if op == operator.ne: + return result.to_numpy(np.bool_, na_value=True) + else: + return result.to_numpy(np.bool_, na_value=False) def value_counts(self, dropna: bool = True) -> Series: from pandas import Series diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index f55d3de8878ad..ccb63d6677b1a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -19,11 +19,9 @@ Tick, Timedelta, astype_overflowsafe, - get_supported_reso, - get_unit_from_dtype, + get_supported_dtype, iNaT, - is_supported_unit, - npy_unit_to_abbrev, + is_supported_dtype, periods_per_second, ) from pandas._libs.tslibs.conversion import cast_from_unit_vectorized @@ -352,7 +350,7 @@ def astype(self, dtype, copy: bool = True): return self.copy() return self - if is_supported_unit(get_unit_from_dtype(dtype)): + if is_supported_dtype(dtype): # unit conversion e.g. timedelta64[s] res_values = astype_overflowsafe(self._ndarray, dtype, copy=False) return type(self)._simple_new( @@ -1064,12 +1062,9 @@ def sequence_to_td64ns( copy = False elif lib.is_np_dtype(data.dtype, "m"): - data_unit = get_unit_from_dtype(data.dtype) - if not is_supported_unit(data_unit): + if not is_supported_dtype(data.dtype): # cast to closest supported unit, i.e. 
s or ns - new_reso = get_supported_reso(data_unit) - new_unit = npy_unit_to_abbrev(new_reso) - new_dtype = np.dtype(f"m8[{new_unit}]") + new_dtype = get_supported_dtype(data.dtype) data = astype_overflowsafe(data, dtype=new_dtype, copy=False) copy = False @@ -1173,7 +1168,7 @@ def _validate_td64_dtype(dtype) -> DtypeObj: if not lib.is_np_dtype(dtype, "m"): raise ValueError(f"dtype '{dtype}' is invalid, should be np.timedelta64 dtype") - elif not is_supported_unit(get_unit_from_dtype(dtype)): + elif not is_supported_dtype(dtype): raise ValueError("Supported timedelta64 resolutions are 's', 'ms', 'us', 'ns'") return dtype diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 85d412d044ba8..cd852ba9249cf 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -110,7 +110,7 @@ def _align_core(terms): ax, itm = axis, items if not axes[ax].is_(itm): - axes[ax] = axes[ax].join(itm, how="outer") + axes[ax] = axes[ax].union(itm) for i, ndim in ndims.items(): for axis, items in zip(range(ndim), axes): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index a0a92a99abe51..8cb76e57eba7e 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -24,8 +24,8 @@ from pandas._libs import lib from pandas._libs.tslibs import ( Period, - get_unit_from_dtype, - is_supported_unit, + get_supported_dtype, + is_supported_dtype, ) from pandas._typing import ( AnyArrayLike, @@ -370,9 +370,9 @@ def array( # 1. datetime64[ns,us,ms,s] # 2. timedelta64[ns,us,ms,s] # so that a DatetimeArray is returned. - if lib.is_np_dtype(dtype, "M") and is_supported_unit(get_unit_from_dtype(dtype)): + if lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype): return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy) - if lib.is_np_dtype(dtype, "m") and is_supported_unit(get_unit_from_dtype(dtype)): + if lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype): return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy) elif lib.is_np_dtype(dtype, "mM"): @@ -490,12 +490,14 @@ def ensure_wrapped_if_datetimelike(arr): if arr.dtype.kind == "M": from pandas.core.arrays import DatetimeArray - return DatetimeArray._from_sequence(arr) + dtype = get_supported_dtype(arr.dtype) + return DatetimeArray._from_sequence(arr, dtype=dtype) elif arr.dtype.kind == "m": from pandas.core.arrays import TimedeltaArray - return TimedeltaArray._from_sequence(arr) + dtype = get_supported_dtype(arr.dtype) + return TimedeltaArray._from_sequence(arr, dtype=dtype) return arr diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 320f028f4484c..d5144174d3c71 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -36,8 +36,7 @@ OutOfBoundsTimedelta, Timedelta, Timestamp, - get_unit_from_dtype, - is_supported_unit, + is_supported_dtype, ) from pandas._libs.tslibs.timedeltas import array_to_timedelta64 from pandas.errors import ( @@ -1266,8 +1265,7 @@ def _ensure_nanosecond_dtype(dtype: DtypeObj) -> None: pass elif dtype.kind in "mM": - reso = get_unit_from_dtype(dtype) - if not is_supported_unit(reso): + if not is_supported_dtype(dtype): # pre-2.0 we would silently swap in nanos for lower-resolutions, # raise for above-nano resolutions if dtype.name in ["datetime64", "timedelta64"]: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 3d12e334e7c0f..2245359fd8eac 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -211,8 +211,8 @@ def is_sparse(arr) -> 
bool: warnings.warn( "is_sparse is deprecated and will be removed in a future " "version. Check `isinstance(dtype, pd.SparseDtype)` instead.", - FutureWarning, - stacklevel=find_stack_level(), + DeprecationWarning, + stacklevel=2, ) dtype = getattr(arr, "dtype", arr) @@ -329,8 +329,8 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool: warnings.warn( "is_datetime64tz_dtype is deprecated and will be removed in a future " "version. Check `isinstance(dtype, pd.DatetimeTZDtype)` instead.", - FutureWarning, - stacklevel=find_stack_level(), + DeprecationWarning, + stacklevel=2, ) if isinstance(arr_or_dtype, DatetimeTZDtype): # GH#33400 fastpath for dtype object @@ -408,8 +408,8 @@ def is_period_dtype(arr_or_dtype) -> bool: warnings.warn( "is_period_dtype is deprecated and will be removed in a future version. " "Use `isinstance(dtype, pd.PeriodDtype)` instead", - FutureWarning, - stacklevel=find_stack_level(), + DeprecationWarning, + stacklevel=2, ) if isinstance(arr_or_dtype, ExtensionDtype): # GH#33400 fastpath for dtype object @@ -454,8 +454,8 @@ def is_interval_dtype(arr_or_dtype) -> bool: warnings.warn( "is_interval_dtype is deprecated and will be removed in a future version. " "Use `isinstance(dtype, pd.IntervalDtype)` instead", - FutureWarning, - stacklevel=find_stack_level(), + DeprecationWarning, + stacklevel=2, ) if isinstance(arr_or_dtype, ExtensionDtype): # GH#33400 fastpath for dtype object @@ -499,8 +499,8 @@ def is_categorical_dtype(arr_or_dtype) -> bool: warnings.warn( "is_categorical_dtype is deprecated and will be removed in a future " "version. Use isinstance(dtype, pd.CategoricalDtype) instead", - FutureWarning, - stacklevel=find_stack_level(), + DeprecationWarning, + stacklevel=2, ) if isinstance(arr_or_dtype, ExtensionDtype): # GH#33400 fastpath for dtype object @@ -838,8 +838,8 @@ def is_int64_dtype(arr_or_dtype) -> bool: warnings.warn( "is_int64_dtype is deprecated and will be removed in a future " "version. Use dtype == np.int64 instead.", - FutureWarning, - stacklevel=find_stack_level(), + DeprecationWarning, + stacklevel=2, ) return _is_dtype_type(arr_or_dtype, classes(np.int64)) @@ -1241,8 +1241,8 @@ def is_bool_dtype(arr_or_dtype) -> bool: "The behavior of is_bool_dtype with an object-dtype Index " "of bool objects is deprecated. In a future version, " "this will return False. 
Cast the Index to a bool dtype instead.", - FutureWarning, - stacklevel=find_stack_level(), + DeprecationWarning, + stacklevel=2, ) return True return False diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1630c2c31920d..8ba9926c054ba 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4895,7 +4895,6 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: inplace = validate_bool_kwarg(inplace, "inplace") kwargs["level"] = kwargs.pop("level", 0) + 1 - # TODO(CoW) those index/column resolvers create unnecessary refs to `self` index_resolvers = self._get_index_resolvers() column_resolvers = self._get_cleaned_column_resolvers() resolvers = column_resolvers, index_resolvers @@ -5220,7 +5219,21 @@ def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: if is_list_like(value): com.require_length_match(value, self.index) - return sanitize_array(value, self.index, copy=True, allow_2d=True), None + arr = sanitize_array(value, self.index, copy=True, allow_2d=True) + if ( + isinstance(value, Index) + and value.dtype == "object" + and arr.dtype != value.dtype + ): # + # TODO: Remove kludge in sanitize_array for string mode when enforcing + # this deprecation + warnings.warn( + "Setting an Index with object dtype into a DataFrame will no longer " + "infer another dtype. Cast the Index explicitly before setting.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return arr, None @property def _series(self): @@ -7492,8 +7505,8 @@ def nlargest( - ``first`` : prioritize the first occurrence(s) - ``last`` : prioritize the last occurrence(s) - - ``all`` : do not drop any duplicates, even it means - selecting more than `n` items. + - ``all`` : keep all the ties of the smallest item even if it means + selecting more than ``n`` items. Returns ------- @@ -7555,7 +7568,9 @@ def nlargest( Italy 59000000 1937894 IT Brunei 434000 12128 BN - When using ``keep='all'``, all duplicate items are maintained: + When using ``keep='all'``, the number of elements kept can go beyond ``n`` + if there are duplicate values for the smallest element; all the + ties are kept: >>> df.nlargest(3, 'population', keep='all') population GDP alpha-2 @@ -7565,6 +7580,16 @@ France 65000000 2583560 FR Italy 59000000 1937894 IT Malta 434000 12011 MT Maldives 434000 4520 MV Brunei 434000 12128 BN + However, ``nlargest`` does not keep ``n`` distinct largest elements: + + >>> df.nlargest(5, 'population', keep='all') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + To order by the largest values in column "population" and then "GDP", we can specify multiple columns like in the next example. @@ -7601,8 +7626,8 @@ def nsmallest( - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. - - ``all`` : do not drop any duplicates, even it means - selecting more than `n` items. + - ``all`` : keep all the ties of the largest item even if it means + selecting more than ``n`` items. Returns ------- @@ -7656,7 +7681,9 @@ def nsmallest( Tuvalu 11300 38 TV Nauru 337000 182 NR - When using ``keep='all'``, all duplicate items are maintained: + When using ``keep='all'``, the number of elements kept can go beyond ``n`` + if there are duplicate values for the largest element; all the + ties are kept.
>>> df.nsmallest(3, 'population', keep='all') population GDP alpha-2 @@ -7665,6 +7692,16 @@ def nsmallest( Iceland 337000 17036 IS Nauru 337000 182 NR + However, ``nsmallest`` does not keep ``n`` distinct + smallest elements: + + >>> df.nsmallest(4, 'population', keep='all') + population GDP alpha-2 + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + Iceland 337000 17036 IS + Nauru 337000 182 NR + To order by the smallest values in column "population" and then "GDP", we can specify multiple columns like in the next example. @@ -12485,7 +12522,7 @@ def isin_(x): # ---------------------------------------------------------------------- # Internal Interface Methods - def _to_dict_of_blocks(self, copy: bool = True): + def _to_dict_of_blocks(self): """ Return a dict of dtype -> Constructor Types that each is a homogeneous dtype. @@ -12497,7 +12534,7 @@ def _to_dict_of_blocks(self, copy: bool = True): mgr = cast(BlockManager, mgr_to_mgr(mgr, "block")) return { k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self) - for k, v, in mgr.to_dict(copy=copy).items() + for k, v, in mgr.to_dict().items() } @property diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 90946b8d9b5f4..ef10958ac1153 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3955,7 +3955,7 @@ def to_csv( >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'], ... 'mask': ['red', 'purple'], ... 'weapon': ['sai', 'bo staff']}}) - >>> df.to_csv('out.csv', index=False) # doctest: +SKIP + >>> df.to_csv('out.csv', index=False) # doctest: +SKIP Create 'out.zip' containing 'out.csv' @@ -8972,7 +8972,7 @@ def clip( Clips using specific lower and upper thresholds per column: - >>> df.clip([-2, -1], [4,5]) + >>> df.clip([-2, -1], [4, 5]) col_0 col_1 0 4 -1 1 -2 -1 diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5a2f8d8454526..c4cc30f9631ea 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -470,10 +470,9 @@ def _aggregate_named(self, func, *args, **kwargs): __examples_series_doc = dedent( """ - >>> ser = pd.Series( - ... [390.0, 350.0, 30.0, 20.0], - ... index=["Falcon", "Falcon", "Parrot", "Parrot"], - ... name="Max Speed") + >>> ser = pd.Series([390.0, 350.0, 30.0, 20.0], + ... index=["Falcon", "Falcon", "Parrot", "Parrot"], + ... name="Max Speed") >>> grouped = ser.groupby([1, 1, 2, 2]) >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) Falcon 0.707107 @@ -1331,14 +1330,10 @@ class DataFrameGroupBy(GroupBy[DataFrame]): """ Examples -------- - >>> df = pd.DataFrame( - ... { - ... "A": [1, 1, 2, 2], + >>> data = {"A": [1, 1, 2, 2], ... "B": [1, 2, 3, 4], - ... "C": [0.362838, 0.227877, 1.267767, -0.562860], - ... } - ... ) - + ... "C": [0.362838, 0.227877, 1.267767, -0.562860]} + >>> df = pd.DataFrame(data) >>> df A B C 0 1 1 0.362838 @@ -1393,7 +1388,8 @@ class DataFrameGroupBy(GroupBy[DataFrame]): >>> df.groupby("A").agg( ... b_min=pd.NamedAgg(column="B", aggfunc="min"), - ... c_sum=pd.NamedAgg(column="C", aggfunc="sum")) + ... c_sum=pd.NamedAgg(column="C", aggfunc="sum") + ... ) b_min c_sum A 1 1 0.590715 @@ -2012,7 +2008,7 @@ def _get_data_to_aggregate( mgr = obj._mgr if numeric_only: - mgr = mgr.get_numeric_data(copy=False) + mgr = mgr.get_numeric_data() return mgr def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: @@ -2154,7 +2150,7 @@ def idxmax( >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], ... 'co2_emissions': [37.2, 19.66, 1712]}, - ... index=['Pork', 'Wheat Products', 'Beef']) + ... 
index=['Pork', 'Wheat Products', 'Beef']) >>> df consumption co2_emissions @@ -2236,7 +2232,7 @@ def idxmin( >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], ... 'co2_emissions': [37.2, 19.66, 1712]}, - ... index=['Pork', 'Wheat Products', 'Beef']) + ... index=['Pork', 'Wheat Products', 'Beef']) >>> df consumption co2_emissions @@ -2319,9 +2315,9 @@ def value_counts( Examples -------- >>> df = pd.DataFrame({ - ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], - ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], - ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] ... }) >>> df diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7d284db4eba2c..e51983f0aabb7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -232,8 +232,8 @@ class providing the base-class of operations. """, "dataframe_examples": """ >>> df = pd.DataFrame({'A': 'a a b'.split(), - ... 'B': [1,2,3], - ... 'C': [4,6,5]}) + ... 'B': [1, 2, 3], + ... 'C': [4, 6, 5]}) >>> g1 = df.groupby('A', group_keys=False) >>> g2 = df.groupby('A', group_keys=True) @@ -313,7 +313,7 @@ class providing the base-class of operations. The resulting dtype will reflect the return value of the passed ``func``. - >>> g1.apply(lambda x: x*2 if x.name == 'a' else x/2) + >>> g1.apply(lambda x: x * 2 if x.name == 'a' else x / 2) a 0.0 a 2.0 b 1.0 @@ -322,7 +322,7 @@ class providing the base-class of operations. In the above, the groups are not part of the index. We can have them included by using ``g2`` where ``group_keys=True``: - >>> g2.apply(lambda x: x*2 if x.name == 'a' else x/2) + >>> g2.apply(lambda x: x * 2 if x.name == 'a' else x / 2) a a 0.0 a 2.0 b b 1.0 @@ -421,14 +421,18 @@ class providing the base-class of operations. functions that expect Series, DataFrames, GroupBy or Resampler objects. Instead of writing ->>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c) # doctest: +SKIP +>>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3 +>>> g = lambda x, arg1: x * 5 / arg1 +>>> f = lambda x: x ** 4 +>>> df = pd.DataFrame([["a", 4], ["b", 5]], columns=["group", "value"]) +>>> h(g(f(df.groupby('group')), arg1=1), arg2=2, arg3=3) # doctest: +SKIP You can write >>> (df.groupby('group') ... .pipe(f) -... .pipe(g, arg1=a) -... .pipe(h, arg2=b, arg3=c)) # doctest: +SKIP +... .pipe(g, arg1=1) +... .pipe(h, arg2=2, arg3=3)) # doctest: +SKIP which is much more readable. 
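The `pandas/core/dtypes/common.py` hunks above downgrade the `is_sparse`, `is_datetime64tz_dtype`, `is_period_dtype`, `is_interval_dtype`, `is_categorical_dtype` and `is_int64_dtype` deprecations (and the object-dtype branch of `is_bool_dtype`) from `FutureWarning` to `DeprecationWarning` with a fixed `stacklevel=2`, so plain user code no longer sees them by default and downstream test suites must opt in. A minimal check, assuming a build of this patch:

    import warnings

    import pandas as pd
    from pandas.api.types import is_categorical_dtype

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        is_categorical_dtype(pd.Categorical(["a", "b"]))

    assert issubclass(caught[-1].category, DeprecationWarning)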
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index fc914831b7a72..4703c12db602d 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -330,7 +330,6 @@ def _get_grouper( return grouper, obj - @final def _set_grouper( self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6c9f93d3482a7..5dc4a85ba9792 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -159,7 +159,10 @@ ExtensionArray, TimedeltaArray, ) -from pandas.core.arrays.string_ import StringArray +from pandas.core.arrays.string_ import ( + StringArray, + StringDtype, +) from pandas.core.base import ( IndexOpsMixin, PandasObject, @@ -365,9 +368,6 @@ class Index(IndexOpsMixin, PandasObject): Index([1, 2, 3], dtype='uint8') """ - # To hand over control to subclasses - _join_precedence = 1 - # similar to __array_priority__, positions Index after Series and DataFrame # but before ExtensionArray. Should NOT be overridden by subclasses. __pandas_priority__ = 2000 @@ -1012,6 +1012,16 @@ def view(self, cls=None): result = self._data.view(cls) else: + if cls is not None: + warnings.warn( + # GH#55709 + f"Passing a type in {type(self).__name__}.view is deprecated " + "and will raise in a future version. " + "Call view without any argument to retain the old behavior.", + FutureWarning, + stacklevel=find_stack_level(), + ) + result = self._view() if isinstance(result, Index): result._id = self._id @@ -4551,6 +4561,7 @@ def join( Index([1, 2, 3, 4, 5, 6], dtype='int64') """ other = ensure_index(other) + sort = sort or how == "outer" if isinstance(self, ABCDatetimeIndex) and isinstance(other, ABCDatetimeIndex): if (self.tz is None) ^ (other.tz is None): @@ -4565,9 +4576,6 @@ def join( pother, how=how, level=level, return_indexers=True, sort=sort ) - lindexer: np.ndarray | None - rindexer: np.ndarray | None - # try to figure out the join level # GH3662 if level is None and (self._is_multi or other._is_multi): @@ -4581,34 +4589,38 @@ def join( if level is not None and (self._is_multi or other._is_multi): return self._join_level(other, level, how=how) + lidx: np.ndarray | None + ridx: np.ndarray | None + if len(other) == 0: if how in ("left", "outer"): - join_index = self._view() - rindexer = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, None, rindexer + if sort and not self.is_monotonic_increasing: + lidx = self.argsort() + join_index = self.take(lidx) + else: + lidx = None + join_index = self._view() + ridx = np.broadcast_to(np.intp(-1), len(join_index)) + return join_index, lidx, ridx elif how in ("right", "inner", "cross"): join_index = other._view() - lindexer = np.array([]) - return join_index, lindexer, None + lidx = np.array([], dtype=np.intp) + return join_index, lidx, None if len(self) == 0: if how in ("right", "outer"): - join_index = other._view() - lindexer = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, lindexer, None + if sort and not other.is_monotonic_increasing: + ridx = other.argsort() + join_index = other.take(ridx) + else: + ridx = None + join_index = other._view() + lidx = np.broadcast_to(np.intp(-1), len(join_index)) + return join_index, lidx, ridx elif how in ("left", "inner", "cross"): join_index = self._view() - rindexer = np.array([]) - return join_index, None, rindexer - - if self._join_precedence < other._join_precedence: - flip: dict[JoinHow, 
JoinHow] = {"right": "left", "left": "right"} - how = flip.get(how, how) - join_index, lidx, ridx = other.join( - self, how=how, level=level, return_indexers=True - ) - lidx, ridx = ridx, lidx - return join_index, lidx, ridx + ridx = np.array([], dtype=np.intp) + return join_index, None, ridx if self.dtype != other.dtype: dtype = self._find_common_type_compat(other) @@ -4653,18 +4665,20 @@ def _join_via_get_indexer( # Note: at this point we have checked matching dtypes if how == "left": - join_index = self + join_index = self.sort_values() if sort else self elif how == "right": - join_index = other + join_index = other.sort_values() if sort else other elif how == "inner": join_index = self.intersection(other, sort=sort) elif how == "outer": - # TODO: sort=True here for backwards compat. It may - # be better to use the sort parameter passed into join - join_index = self.union(other) - - if sort and how in ["left", "right"]: - join_index = join_index.sort_values() + try: + join_index = self.union(other, sort=sort) + except TypeError: + join_index = self.union(other) + try: + join_index = _maybe_try_sort(join_index, sort) + except TypeError: + pass if join_index is self: lindexer = None @@ -5574,6 +5588,14 @@ def equals(self, other: Any) -> bool: # quickly return if the lengths are different return False + if ( + isinstance(self.dtype, StringDtype) + and self.dtype.storage == "pyarrow_numpy" + and other.dtype != self.dtype + ): + # special case for object behavior + return other.equals(self.astype(object)) + if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype): # if other is not object, use other's logic for coercion return other.equals(self) @@ -6523,18 +6545,6 @@ def isin(self, values, level=None) -> npt.NDArray[np.bool_]: >>> midx.isin([(1, 'red'), (3, 'red')]) array([ True, False, False]) - - For a DatetimeIndex, string values in `values` are converted to - Timestamps. - - >>> dates = ['2000-03-11', '2000-03-12', '2000-03-13'] - >>> dti = pd.to_datetime(dates) - >>> dti - DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'], - dtype='datetime64[ns]', freq=None) - - >>> dti.isin(['2000-03-11']) - array([ True, False, False]) """ if level is not None: self._validate_index_level(level) @@ -6939,14 +6949,24 @@ def insert(self, loc: int, item) -> Index: loc = loc if loc >= 0 else loc - 1 new_values[loc] = item - idx = Index._with_infer(new_values, name=self.name) + out = Index._with_infer(new_values, name=self.name) if ( using_pyarrow_string_dtype() - and is_string_dtype(idx.dtype) + and is_string_dtype(out.dtype) and new_values.dtype == object ): - idx = idx.astype(new_values.dtype) - return idx + out = out.astype(new_values.dtype) + if self.dtype == object and out.dtype != object: + # GH#51363 + warnings.warn( + "The behavior of Index.insert with object-dtype is deprecated, " + "in a future version this will return an object-dtype Index " + "instead of inferring a non-object dtype. 
To retain the old "
+                "behavior, do `idx.insert(loc, item).infer_objects(copy=False)`",
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
+        return out
     def drop(
         self,
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index 264ca8aa11495..2b03a64236128 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -442,8 +442,6 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, ABC):
     _is_monotonic_decreasing = Index.is_monotonic_decreasing
     _is_unique = Index.is_unique
-    _join_precedence = 10
-
     @property
     def unit(self) -> str:
         return self._data.unit
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 86693f241ddb1..46343b84afb43 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -862,7 +862,8 @@ def levels(self) -> FrozenList:
         Examples
         --------
         >>> index = pd.MultiIndex.from_product([['mammal'],
-        ...     ('goat', 'human', 'cat', 'dog')], names=['Category', 'Animals'])
+        ...                                     ('goat', 'human', 'cat', 'dog')],
+        ...                                     names=['Category', 'Animals'])
         >>> leg_num = pd.DataFrame(data=(4, 2, 4, 4), index=index, columns=['Legs'])
         >>> leg_num
                 Legs
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index e3928621a4e48..c233295b25700 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -1893,7 +1893,15 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"):
                 # just replacing the block manager here
                 # so the object is the same
                 index = self.obj._get_axis(i)
-                labels = index.insert(len(index), key)
+                with warnings.catch_warnings():
+                    # TODO: re-issue this with setitem-specific message?
+                    warnings.filterwarnings(
+                        "ignore",
+                        "The behavior of Index.insert with object-dtype "
+                        "is deprecated",
+                        category=FutureWarning,
+                    )
+                    labels = index.insert(len(index), key)
                 # We are expanding the Series/DataFrame values to match
                 # the length of the new index `labels`. GH#40096 ensure
@@ -2186,7 +2194,14 @@ def _setitem_with_indexer_missing(self, indexer, value):
             # and set inplace
             if self.ndim == 1:
                 index = self.obj.index
-                new_index = index.insert(len(index), indexer)
+                with warnings.catch_warnings():
+                    # TODO: re-issue this with setitem-specific message?
+                    warnings.filterwarnings(
+                        "ignore",
+                        "The behavior of Index.insert with object-dtype is deprecated",
+                        category=FutureWarning,
+                    )
+                    new_index = index.insert(len(index), indexer)
             # we have a coerced indexer, e.g. a float
             # that matches in an int64 Index, so
diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py
index b0b3937ca47ea..e5ef44d07061e 100644
--- a/pandas/core/internals/api.py
+++ b/pandas/core/internals/api.py
@@ -9,10 +9,12 @@
 from __future__ import annotations
 from typing import TYPE_CHECKING
+import warnings
 import numpy as np
 from pandas._libs.internals import BlockPlacement
+from pandas.util._exceptions import find_stack_level
 from pandas.core.dtypes.common import pandas_dtype
 from pandas.core.dtypes.dtypes import (
@@ -50,6 +52,14 @@ def make_block(
       - Block.make_block_same_class
       - Block.__init__
     """
+    warnings.warn(
+        # GH#40226
+        "make_block is deprecated and will be removed in a future version. 
" + "Use public APIs instead.", + DeprecationWarning, + stacklevel=find_stack_level(), + ) + if dtype is not None: dtype = pandas_dtype(dtype) @@ -113,7 +123,6 @@ def maybe_infer_ndim(values, placement: BlockPlacement, ndim: int | None) -> int def __getattr__(name: str): # GH#55139 - import warnings if name in [ "Block", diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 0fd91b163aba9..33f5b9feb1387 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -252,12 +252,16 @@ def replace(self, to_replace, value, inplace: bool) -> Self: value=value, inplace=inplace, using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), ) @final def replace_regex(self, **kwargs) -> Self: return self.apply_with_block( - "_replace_regex", **kwargs, using_cow=using_copy_on_write() + "_replace_regex", + **kwargs, + using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), ) @final @@ -278,13 +282,18 @@ def replace_list( inplace=inplace, regex=regex, using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), ) bm._consolidate_inplace() return bm def interpolate(self, inplace: bool, **kwargs) -> Self: return self.apply_with_block( - "interpolate", inplace=inplace, **kwargs, using_cow=using_copy_on_write() + "interpolate", + inplace=inplace, + **kwargs, + using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), ) def pad_or_backfill(self, inplace: bool, **kwargs) -> Self: @@ -293,6 +302,7 @@ def pad_or_backfill(self, inplace: bool, **kwargs) -> Self: inplace=inplace, **kwargs, using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), ) def shift(self, periods: int, fill_value) -> Self: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 85780bad1e403..1af2d9e739038 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -830,6 +830,7 @@ def replace( # mask may be pre-computed if we're called from replace_list mask: npt.NDArray[np.bool_] | None = None, using_cow: bool = False, + already_warned=None, ) -> list[Block]: """ replace the to_replace value with value, possible to create new @@ -874,6 +875,20 @@ def replace( # and rest? blk = self._maybe_copy(using_cow, inplace) putmask_inplace(blk.values, mask, value) + if ( + inplace + and warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + if not (self.is_object and value is None): # if the user *explicitly* gave None, we keep None, otherwise # may downcast to NaN @@ -934,6 +949,7 @@ def _replace_regex( inplace: bool = False, mask=None, using_cow: bool = False, + already_warned=None, ) -> list[Block]: """ Replace elements by the given value. 
@@ -968,6 +984,20 @@ def _replace_regex( replace_regex(block.values, rx, value, mask) + if ( + inplace + and warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + nbs = block.convert(copy=False, using_cow=using_cow) opt = get_option("future.no_silent_downcasting") if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt: @@ -992,6 +1022,7 @@ def replace_list( inplace: bool = False, regex: bool = False, using_cow: bool = False, + already_warned=None, ) -> list[Block]: """ See BlockManager.replace_list docstring. @@ -1048,6 +1079,20 @@ def replace_list( else: rb = [self if inplace else self.copy()] + if ( + inplace + and warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + opt = get_option("future.no_silent_downcasting") for i, ((src, dest), mask) in enumerate(zip(pairs, masks)): convert = i == src_len # only convert once at the end @@ -1657,6 +1702,7 @@ def pad_or_backfill( limit_area: Literal["inside", "outside"] | None = None, downcast: Literal["infer"] | None = None, using_cow: bool = False, + already_warned=None, ) -> list[Block]: if not self._can_hold_na: # If there are no NAs, then interpolate is a no-op @@ -1677,6 +1723,19 @@ def pad_or_backfill( limit_area=limit_area, copy=copy, ) + if ( + not copy + and warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True if axis == 1: new_values = new_values.T @@ -1697,6 +1756,7 @@ def interpolate( limit_area: Literal["inside", "outside"] | None = None, downcast: Literal["infer"] | None = None, using_cow: bool = False, + already_warned=None, **kwargs, ) -> list[Block]: inplace = validate_bool_kwarg(inplace, "inplace") @@ -1735,6 +1795,20 @@ def interpolate( ) data = extract_array(new_values, extract_numpy=True) + if ( + not copy + and warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + nb = self.make_block_same_class(data, refs=refs) return nb._maybe_downcast([nb], downcast, using_cow, caller="interpolate") @@ -2178,9 +2252,9 @@ def pad_or_backfill( limit_area: Literal["inside", "outside"] | None = None, downcast: Literal["infer"] | None = None, using_cow: bool = False, + already_warned=None, ) -> list[Block]: values = self.values - copy, refs = self._get_refs_and_copy(using_cow, inplace) if values.ndim == 2 and axis == 1: # NDArrayBackedExtensionArray.fillna assumes axis=0 diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a221d02b75bb2..14d05c59272e8 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -495,17 +495,12 @@ def is_view(self) -> bool: def _get_data_subset(self, predicate: Callable) -> Self: blocks = [blk for blk in self.blocks if predicate(blk.values)] - return self._combine(blocks, copy=False) + return 
self._combine(blocks) - def get_bool_data(self, copy: bool = False) -> Self: + def get_bool_data(self) -> Self: """ Select blocks that are bool-dtype and columns from object-dtype blocks that are all-bool. - - Parameters - ---------- - copy : bool, default False - Whether to copy the blocks """ new_blocks = [] @@ -518,26 +513,16 @@ def get_bool_data(self, copy: bool = False) -> Self: nbs = blk._split() new_blocks.extend(nb for nb in nbs if nb.is_bool) - return self._combine(new_blocks, copy) + return self._combine(new_blocks) - def get_numeric_data(self, copy: bool = False) -> Self: - """ - Parameters - ---------- - copy : bool, default False - Whether to copy the blocks - """ + def get_numeric_data(self) -> Self: numeric_blocks = [blk for blk in self.blocks if blk.is_numeric] if len(numeric_blocks) == len(self.blocks): # Avoid somewhat expensive _combine - if copy: - return self.copy(deep=True) return self - return self._combine(numeric_blocks, copy) + return self._combine(numeric_blocks) - def _combine( - self, blocks: list[Block], copy: bool = True, index: Index | None = None - ) -> Self: + def _combine(self, blocks: list[Block], index: Index | None = None) -> Self: """return a new manager with the blocks""" if len(blocks) == 0: if self.ndim == 2: @@ -554,11 +539,8 @@ def _combine( inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) new_blocks: list[Block] = [] - # TODO(CoW) we could optimize here if we know that the passed blocks - # are fully "owned" (eg created from an operation, not coming from - # an existing manager) for b in blocks: - nb = b.copy(deep=copy) + nb = b.copy(deep=False) nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer]) new_blocks.append(nb) @@ -1325,16 +1307,16 @@ def column_setitem( This is a method on the BlockManager level, to avoid creating an intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`) """ + needs_to_warn = False if warn_copy_on_write() and not self._has_no_reference(loc): if not isinstance( self.blocks[self.blknos[loc]].values, (ArrowExtensionArray, ArrowStringArray), ): - warnings.warn( - COW_WARNING_GENERAL_MSG, - FutureWarning, - stacklevel=find_stack_level(), - ) + # We might raise if we are in an expansion case, so defer + # warning till we actually updated + needs_to_warn = True + elif using_copy_on_write() and not self._has_no_reference(loc): blkno = self.blknos[loc] # Split blocks to only copy the column we want to modify @@ -1358,6 +1340,13 @@ def column_setitem( new_mgr = col_mgr.setitem((idx,), value) self.iset(loc, new_mgr._block.values, inplace=True) + if needs_to_warn: + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: """ Insert item at selected position. @@ -1369,8 +1358,14 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: value : np.ndarray or ExtensionArray refs : The reference tracking object of the value to set. """ - # insert to the axis; this could possibly raise a TypeError - new_axis = self.items.insert(loc, item) + with warnings.catch_warnings(): + # TODO: re-issue this with setitem-specific message? 
+ warnings.filterwarnings( + "ignore", + "The behavior of Index.insert with object-dtype is deprecated", + category=FutureWarning, + ) + new_axis = self.items.insert(loc, item) if value.ndim == 2: value = value.T @@ -1623,14 +1618,10 @@ def unstack(self, unstacker, fill_value) -> BlockManager: bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False) return bm - def to_dict(self, copy: bool = True) -> dict[str, Self]: + def to_dict(self) -> dict[str, Self]: """ Return a dict of str(dtype) -> BlockManager - Parameters - ---------- - copy : bool, default True - Returns ------- values : a dict of dtype -> BlockManager @@ -1641,7 +1632,7 @@ def to_dict(self, copy: bool = True) -> dict[str, Self]: bd.setdefault(str(b.dtype), []).append(b) # TODO(EA2D): the combine will be unnecessary with 2D EAs - return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()} + return {dtype: self._combine(blocks) for dtype, blocks in bd.items()} def as_array( self, @@ -2021,9 +2012,9 @@ def array_values(self) -> ExtensionArray: """The array that Series.array returns""" return self._block.array_values - def get_numeric_data(self, copy: bool = False) -> Self: + def get_numeric_data(self) -> Self: if self._block.is_numeric: - return self.copy(deep=copy) + return self.copy(deep=False) return self.make_empty() @property diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index b39930da9f711..d8a772aac6082 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -24,11 +24,9 @@ ) from pandas._libs.tslibs import ( BaseOffset, - get_supported_reso, - get_unit_from_dtype, - is_supported_unit, + get_supported_dtype, + is_supported_dtype, is_unitless, - npy_unit_to_abbrev, ) from pandas.util._exceptions import find_stack_level @@ -543,10 +541,9 @@ def maybe_prepare_scalar_for_op(obj, shape: Shape): # GH 52295 if is_unitless(obj.dtype): obj = obj.astype("datetime64[ns]") - elif not is_supported_unit(get_unit_from_dtype(obj.dtype)): - unit = get_unit_from_dtype(obj.dtype) - closest_unit = npy_unit_to_abbrev(get_supported_reso(unit)) - obj = obj.astype(f"datetime64[{closest_unit}]") + elif not is_supported_dtype(obj.dtype): + new_dtype = get_supported_dtype(obj.dtype) + obj = obj.astype(new_dtype) right = np.broadcast_to(obj, shape) return DatetimeArray(right) @@ -562,10 +559,9 @@ def maybe_prepare_scalar_for_op(obj, shape: Shape): # GH 52295 if is_unitless(obj.dtype): obj = obj.astype("timedelta64[ns]") - elif not is_supported_unit(get_unit_from_dtype(obj.dtype)): - unit = get_unit_from_dtype(obj.dtype) - closest_unit = npy_unit_to_abbrev(get_supported_reso(unit)) - obj = obj.astype(f"timedelta64[{closest_unit}]") + elif not is_supported_dtype(obj.dtype): + new_dtype = get_supported_dtype(obj.dtype) + obj = obj.astype(new_dtype) right = np.broadcast_to(obj, shape) return TimedeltaArray(right) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 8af81cd43d62e..d54f6d31f6144 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -38,6 +38,7 @@ rewrite_warning, ) +from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -48,6 +49,7 @@ ResamplerWindowApply, warn_alias_replacement, ) +from pandas.core.arrays import ArrowExtensionArray from pandas.core.base import ( PandasObject, SelectionMixin, @@ -68,6 +70,7 @@ from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.api import MultiIndex 
+from pandas.core.indexes.base import Index from pandas.core.indexes.datetimes import ( DatetimeIndex, date_range, @@ -109,7 +112,6 @@ from pandas import ( DataFrame, - Index, Series, ) @@ -511,6 +513,9 @@ def _wrap_result(self, result): result.index = _asfreq_compat(obj.index[:0], freq=self.freq) result.name = getattr(obj, "name", None) + if self._timegrouper._arrow_dtype is not None: + result.index = result.index.astype(self._timegrouper._arrow_dtype) + return result @final @@ -854,7 +859,7 @@ def fillna(self, method, limit: int | None = None): Missing values present before the upsampling are not affected. >>> sm = pd.Series([1, None, 3], - ... index=pd.date_range('20180101', periods=3, freq='h')) + ... index=pd.date_range('20180101', periods=3, freq='h')) >>> sm 2018-01-01 00:00:00 1.0 2018-01-01 01:00:00 NaN @@ -1023,13 +1028,8 @@ def interpolate( Examples -------- - >>> import datetime as dt - >>> timesteps = [ - ... dt.datetime(2023, 3, 1, 7, 0, 0), - ... dt.datetime(2023, 3, 1, 7, 0, 1), - ... dt.datetime(2023, 3, 1, 7, 0, 2), - ... dt.datetime(2023, 3, 1, 7, 0, 3), - ... dt.datetime(2023, 3, 1, 7, 0, 4)] + >>> start = "2023-03-01T07:00:00" + >>> timesteps = pd.date_range(start, periods=5, freq="s") >>> series = pd.Series(data=[1, -1, 2, 1, 3], index=timesteps) >>> series 2023-03-01 07:00:00 1 @@ -1037,7 +1037,7 @@ def interpolate( 2023-03-01 07:00:02 2 2023-03-01 07:00:03 1 2023-03-01 07:00:04 3 - dtype: int64 + Freq: s, dtype: int64 Upsample the dataframe to 0.5Hz by providing the period time of 2s. @@ -1061,7 +1061,7 @@ def interpolate( 2023-03-01 07:00:04.000 3.0 Freq: 500ms, dtype: float64 - Internal reindexing with ``as_freq()`` prior to interpolation leads to + Internal reindexing with ``asfreq()`` prior to interpolation leads to an interpolated timeseries on the basis the reindexed timestamps (anchors). 
Since not all datapoints from original series become anchors, it can lead to misleading interpolation results as in the following example: @@ -2163,6 +2163,7 @@ def __init__( self.fill_method = fill_method self.limit = limit self.group_keys = group_keys + self._arrow_dtype: ArrowDtype | None = None if origin in ("epoch", "start", "start_day", "end", "end_day"): # error: Incompatible types in assignment (expression has type "Union[Union[ @@ -2213,7 +2214,7 @@ def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler: TypeError if incompatible axis """ - _, ax, indexer = self._set_grouper(obj, gpr_index=None) + _, ax, _ = self._set_grouper(obj, gpr_index=None) if isinstance(ax, DatetimeIndex): return DatetimeIndexResampler( obj, @@ -2495,6 +2496,17 @@ def _get_period_bins(self, ax: PeriodIndex): return binner, bins, labels + def _set_grouper( + self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None + ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]: + obj, ax, indexer = super()._set_grouper(obj, sort, gpr_index=gpr_index) + if isinstance(ax.dtype, ArrowDtype) and ax.dtype.kind in "Mm": + self._arrow_dtype = ax.dtype + ax = Index( + cast(ArrowExtensionArray, ax.array)._maybe_convert_datelike_array() + ) + return obj, ax, indexer + def _take_new_index( obj: NDFrameT, indexer: npt.NDArray[np.intp], new_index: Index, axis: AxisInt = 0 diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 1bc548de91f01..d46348fff7a02 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -464,7 +464,7 @@ def __init__( # if we have mixed ndims, then convert to highest ndim # creating column numbers as needed if len(ndims) > 1: - objs, sample = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis) + objs = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis) self.objs = objs @@ -580,7 +580,7 @@ def _sanitize_mixed_ndim( sample: Series | DataFrame, ignore_index: bool, axis: AxisInt, - ) -> tuple[list[Series | DataFrame], Series | DataFrame]: + ) -> list[Series | DataFrame]: # if we have mixed ndims, then convert to highest ndim # creating column numbers as needed @@ -601,19 +601,21 @@ def _sanitize_mixed_ndim( else: name = getattr(obj, "name", None) if ignore_index or name is None: - name = current_column - current_column += 1 - - # doing a row-wise concatenation so need everything - # to line up - if self._is_frame and axis == 1: - name = 0 + if axis == 1: + # doing a row-wise concatenation so need everything + # to line up + name = 0 + else: + # doing a column-wise concatenation so need series + # to have unique names + name = current_column + current_column += 1 obj = sample._constructor({name: obj}, copy=False) new_objs.append(obj) - return new_objs, sample + return new_objs def get_result(self): cons: Callable[..., DataFrame | Series] diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 6963bf677bcfb..3ed67bb7b7c02 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -21,9 +21,14 @@ is_object_dtype, pandas_dtype, ) +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + CategoricalDtype, +) from pandas.core.arrays import SparseArray from pandas.core.arrays.categorical import factorize_from_iterable +from pandas.core.arrays.string_ import StringDtype from pandas.core.frame import DataFrame from pandas.core.indexes.api import ( Index, @@ -244,8 +249,25 @@ def _get_dummies_1d( # Series avoids inconsistent NaN handling codes, levels = 
factorize_from_iterable(Series(data, copy=False)) - if dtype is None: + if dtype is None and hasattr(data, "dtype"): + input_dtype = data.dtype + if isinstance(input_dtype, CategoricalDtype): + input_dtype = input_dtype.categories.dtype + + if isinstance(input_dtype, ArrowDtype): + import pyarrow as pa + + dtype = ArrowDtype(pa.bool_()) # type: ignore[assignment] + elif ( + isinstance(input_dtype, StringDtype) + and input_dtype.storage != "pyarrow_numpy" + ): + dtype = pandas_dtype("boolean") # type: ignore[assignment] + else: + dtype = np.dtype(bool) + elif dtype is None: dtype = np.dtype(bool) + _dtype = pandas_dtype(dtype) if is_object_dtype(_dtype): diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f8575b1b53908..c43c16cded852 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -718,7 +718,7 @@ class _MergeOperation: """ _merge_type = "merge" - how: MergeHow | Literal["asof"] + how: JoinHow | Literal["asof"] on: IndexLabel | None # left_on/right_on may be None when passed, but in validate_specification # get replaced with non-None. @@ -739,7 +739,7 @@ def __init__( self, left: DataFrame | Series, right: DataFrame | Series, - how: MergeHow | Literal["asof"] = "inner", + how: JoinHow | Literal["asof"] = "inner", on: IndexLabel | AnyArrayLike | None = None, left_on: IndexLabel | AnyArrayLike | None = None, right_on: IndexLabel | AnyArrayLike | None = None, @@ -759,7 +759,7 @@ def __init__( self.on = com.maybe_make_list(on) self.suffixes = suffixes - self.sort = sort + self.sort = sort or how == "outer" self.left_index = left_index self.right_index = right_index @@ -1106,6 +1106,8 @@ def _maybe_add_join_keys( def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: """return the join indexers""" + # make mypy happy + assert self.how != "asof" return get_join_indexers( self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how ) @@ -1114,8 +1116,6 @@ def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp] def _get_join_info( self, ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: - # make mypy happy - assert self.how != "cross" left_ax = self.left.index right_ax = self.right.index @@ -1658,7 +1658,7 @@ def get_join_indexers( left_keys: list[ArrayLike], right_keys: list[ArrayLike], sort: bool = False, - how: MergeHow | Literal["asof"] = "inner", + how: JoinHow = "inner", ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: """ @@ -1684,22 +1684,19 @@ def get_join_indexers( left_n = len(left_keys[0]) right_n = len(right_keys[0]) if left_n == 0: - if how in ["left", "inner", "cross"]: + if how in ["left", "inner"]: return _get_empty_indexer() elif not sort and how in ["right", "outer"]: return _get_no_sort_one_missing_indexer(right_n, True) elif right_n == 0: - if how in ["right", "inner", "cross"]: + if how in ["right", "inner"]: return _get_empty_indexer() elif not sort and how in ["left", "outer"]: return _get_no_sort_one_missing_indexer(left_n, False) - if not sort and how == "outer": - sort = True - # get left & right join labels and num. of levels at each location mapped = ( - _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how) + _factorize_keys(left_keys[n], right_keys[n], sort=sort) for n in range(len(left_keys)) ) zipped = zip(*mapped) @@ -1712,7 +1709,7 @@ def get_join_indexers( # `count` is the num. 
of unique keys # set(lkey) | set(rkey) == range(count) - lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how) + lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) # preserve left frame order if how == 'left' and sort == False kwargs = {} if how in ("inner", "left", "right"): @@ -1989,7 +1986,12 @@ def _validate_left_right_on(self, left_on, right_on): else: ro_dtype = self.right.index.dtype - if is_object_dtype(lo_dtype) or is_object_dtype(ro_dtype): + if ( + is_object_dtype(lo_dtype) + or is_object_dtype(ro_dtype) + or is_string_dtype(lo_dtype) + or is_string_dtype(ro_dtype) + ): raise MergeError( f"Incompatible merge dtype, {repr(ro_dtype)} and " f"{repr(lo_dtype)}, both sides must have numeric dtype" @@ -2166,7 +2168,6 @@ def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp] left_join_keys[n], right_join_keys[n], sort=False, - how="left", ) for n in range(len(left_join_keys)) ] @@ -2310,10 +2311,7 @@ def _left_join_on_index( def _factorize_keys( - lk: ArrayLike, - rk: ArrayLike, - sort: bool = True, - how: MergeHow | Literal["asof"] = "inner", + lk: ArrayLike, rk: ArrayLike, sort: bool = True ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: """ Encode left and right keys as enumerated types. @@ -2329,8 +2327,6 @@ def _factorize_keys( sort : bool, defaults to True If True, the encoding is done such that the unique elements in the keys are sorted. - how : {'left', 'right', 'outer', 'inner'}, default 'inner' - Type of merge. Returns ------- @@ -2419,8 +2415,6 @@ def _factorize_keys( ) if dc.null_count > 0: count += 1 - if how == "right": - return rlab, llab, count return llab, rlab, count if not isinstance(lk, BaseMaskedArray) and not ( @@ -2491,8 +2485,6 @@ def _factorize_keys( np.putmask(rlab, rmask, count) count += 1 - if how == "right": - return rlab, llab, count return llab, rlab, count diff --git a/pandas/core/series.py b/pandas/core/series.py index 464e066b4e86a..e4dca97bc645d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1719,7 +1719,7 @@ def reset_index( return new_ser.__finalize__(self, method="reset_index") else: return self._constructor( - self._values.copy(), index=new_index, copy=False + self._values.copy(), index=new_index, copy=False, dtype=self.dtype ).__finalize__(self, method="reset_index") elif inplace: raise TypeError( @@ -4740,7 +4740,11 @@ def transform( ) -> DataFrame | Series: # Validate axis argument self._get_axis_number(axis) - ser = self.copy(deep=False) if using_copy_on_write() else self + ser = ( + self.copy(deep=False) + if using_copy_on_write() or warn_copy_on_write() + else self + ) result = SeriesApply(ser, func=func, args=args, kwargs=kwargs).transform() return result diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index fdb8ef1cc5dad..25f7e7e9f832b 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -797,7 +797,7 @@ ... 'B': ['a', 'b', 'c', 'd', 'e'], ... 'C': ['f', 'g', 'h', 'i', 'j']}}) - >>> df.replace(to_replace='^[a-g]', value = 'e', regex=True) + >>> df.replace(to_replace='^[a-g]', value='e', regex=True) A B C 0 0 e e 1 1 e e @@ -808,7 +808,7 @@ If ``value`` is not ``None`` and `to_replace` is a dictionary, the dictionary keys will be the DataFrame columns that the replacement will be applied. 
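# --- editor's note: illustrative sketch, not part of the patch ----------------
# Net effect of ``self.sort = sort or how == "outer"`` and the join changes
# above: outer merges and joins now always sort the result keys, so
# sort=False no longer yields an unsorted outer result.
import pandas as pd

left = pd.DataFrame({"key": [2, 1], "a": [10, 20]})
right = pd.DataFrame({"key": [3, 1], "b": [30, 40]})
# With this patch the "key" column comes out as [1, 2, 3] even with sort=False.
print(left.merge(right, on="key", how="outer", sort=False))
# ------------------------------------------------------------------------------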
- >>> df.replace(to_replace={{'B': '^[a-c]', 'C': '^[h-j]'}}, value = 'e', regex=True) + >>> df.replace(to_replace={{'B': '^[a-c]', 'C': '^[h-j]'}}, value='e', regex=True) A B C 0 0 e f 1 1 e g diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 9fa6e9973291d..75866c6f6013a 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -44,6 +44,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core.arrays import ExtensionArray from pandas.core.base import NoNewAttributesMixin from pandas.core.construction import extract_array @@ -456,7 +457,7 @@ def _get_series_list(self, others): # in case of list-like `others`, all elements must be # either Series/Index/np.ndarray (1-dim)... if all( - isinstance(x, (ABCSeries, ABCIndex)) + isinstance(x, (ABCSeries, ABCIndex, ExtensionArray)) or (isinstance(x, np.ndarray) and x.ndim == 1) for x in others ): @@ -690,12 +691,15 @@ def cat( out: Index | Series if isinstance(self._orig, ABCIndex): # add dtype for case that result is all-NA + dtype = None + if isna(result).all(): + dtype = object - out = Index(result, dtype=object, name=self._orig.name) + out = Index(result, dtype=dtype, name=self._orig.name) else: # Series if isinstance(self._orig.dtype, CategoricalDtype): # We need to infer the new categories. - dtype = None + dtype = self._orig.dtype.categories.dtype # type: ignore[assignment] else: dtype = self._orig.dtype res_ser = Series( @@ -914,7 +918,13 @@ def split( if is_re(pat): regex = True result = self._data.array._str_split(pat, n, expand, regex) - return self._wrap_result(result, returns_string=expand, expand=expand) + if self._data.dtype == "category": + dtype = self._data.dtype.categories.dtype + else: + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) @Appender( _shared_docs["str_split"] @@ -932,7 +942,10 @@ def split( @forbid_nonstring_types(["bytes"]) def rsplit(self, pat=None, *, n=-1, expand: bool = False): result = self._data.array._str_rsplit(pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) _shared_docs[ "str_partition" @@ -1028,7 +1041,13 @@ def rsplit(self, pat=None, *, n=-1, expand: bool = False): @forbid_nonstring_types(["bytes"]) def partition(self, sep: str = " ", expand: bool = True): result = self._data.array._str_partition(sep, expand) - return self._wrap_result(result, expand=expand, returns_string=expand) + if self._data.dtype == "category": + dtype = self._data.dtype.categories.dtype + else: + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) @Appender( _shared_docs["str_partition"] @@ -1042,7 +1061,13 @@ def partition(self, sep: str = " ", expand: bool = True): @forbid_nonstring_types(["bytes"]) def rpartition(self, sep: str = " ", expand: bool = True): result = self._data.array._str_rpartition(sep, expand) - return self._wrap_result(result, expand=expand, returns_string=expand) + if self._data.dtype == "category": + dtype = self._data.dtype.categories.dtype + else: + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) def get(self, i): """ @@ -2748,7 +2773,7 @@ def extract( else: name = 
_get_single_group_name(regex) result = self._data.array._str_extract(pat, flags=flags, expand=returns_df) - return self._wrap_result(result, name=name) + return self._wrap_result(result, name=name, dtype=result_dtype) @forbid_nonstring_types(["bytes"]) def extractall(self, pat, flags: int = 0) -> DataFrame: @@ -3488,7 +3513,7 @@ def str_extractall(arr, pat, flags: int = 0) -> DataFrame: raise ValueError("pattern contains no capture groups") if isinstance(arr, ABCIndex): - arr = arr.to_series().reset_index(drop=True) + arr = arr.to_series().reset_index(drop=True).astype(arr.dtype) columns = _get_group_names(regex) match_list = [] diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 26cbc77e4e8ae..5ebf1e442733e 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -25,8 +25,7 @@ Timedelta, Timestamp, astype_overflowsafe, - get_unit_from_dtype, - is_supported_unit, + is_supported_dtype, timezones as libtimezones, ) from pandas._libs.tslibs.conversion import cast_from_unit_vectorized @@ -385,7 +384,7 @@ def _convert_listlike_datetimes( return arg elif lib.is_np_dtype(arg_dtype, "M"): - if not is_supported_unit(get_unit_from_dtype(arg_dtype)): + if not is_supported_dtype(arg_dtype): # We go to closest supported reso, i.e. "s" arg = astype_overflowsafe( # TODO: looks like we incorrectly raise with errors=="ignore" diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f90863a8ea1ef..f268d36d7fdc4 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2439,14 +2439,14 @@ def var( create_section_header("Examples"), dedent( """\ - >>> ser = pd.Series([1, 5, 2, 7, 12, 6]) + >>> ser = pd.Series([1, 5, 2, 7, 15, 6]) >>> ser.rolling(3).skew().round(6) 0 NaN 1 NaN 2 1.293343 3 -0.585583 - 4 0.000000 - 5 1.545393 + 4 0.670284 + 5 1.652317 dtype: float64 """ ), @@ -2794,12 +2794,12 @@ def cov( >>> v1 = [3, 3, 3, 5, 8] >>> v2 = [3, 4, 4, 4, 8] - >>> # numpy returns a 2X2 array, the correlation coefficient - >>> # is the number at entry [0][1] - >>> print(f"{{np.corrcoef(v1[:-1], v2[:-1])[0][1]:.6f}}") - 0.333333 - >>> print(f"{{np.corrcoef(v1[1:], v2[1:])[0][1]:.6f}}") - 0.916949 + >>> np.corrcoef(v1[:-1], v2[:-1]) + array([[1. , 0.33333333], + [0.33333333, 1. ]]) + >>> np.corrcoef(v1[1:], v2[1:]) + array([[1. , 0.9169493], + [0.9169493, 1. ]]) >>> s1 = pd.Series(v1) >>> s2 = pd.Series(v2) >>> s1.rolling(4).corr(s2) @@ -2813,15 +2813,18 @@ def cov( The below example shows a similar rolling calculation on a DataFrame using the pairwise option. - >>> matrix = np.array([[51., 35.], [49., 30.], [47., 32.],\ - [46., 31.], [50., 36.]]) - >>> print(np.corrcoef(matrix[:-1,0], matrix[:-1,1]).round(7)) - [[1. 0.6263001] - [0.6263001 1. ]] - >>> print(np.corrcoef(matrix[1:,0], matrix[1:,1]).round(7)) - [[1. 0.5553681] - [0.5553681 1. ]] - >>> df = pd.DataFrame(matrix, columns=['X','Y']) + >>> matrix = np.array([[51., 35.], + ... [49., 30.], + ... [47., 32.], + ... [46., 31.], + ... [50., 36.]]) + >>> np.corrcoef(matrix[:-1, 0], matrix[:-1, 1]) + array([[1. , 0.6263001], + [0.6263001, 1. ]]) + >>> np.corrcoef(matrix[1:, 0], matrix[1:, 1]) + array([[1. , 0.55536811], + [0.55536811, 1. 
]])
+    >>> df = pd.DataFrame(matrix, columns=['X', 'Y'])
     >>> df
           X     Y
     0  51.0  35.0
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index 6d66830ab1dfd..2884294377ec9 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -86,7 +86,7 @@
 )
 _read_excel_doc = (
     """
-Read an Excel file into a pandas DataFrame.
+Read an Excel file into a ``pandas`` ``DataFrame``.
 Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions
 read from a local filesystem or URL. Supports an option to read
@@ -112,7 +112,7 @@
     Strings are used for sheet names. Integers are used in zero-indexed
     sheet positions (chart sheets do not count as a sheet position).
     Lists of strings/integers are used to request multiple sheets.
-    Specify None to get all worksheets.
+    Specify ``None`` to get all worksheets.
     Available cases:
     * ``"Sheet1"``: Load sheet with name "Sheet1"
     * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"
       as a dict of `DataFrame`
-    * None: All worksheets.
+    * ``None``: All worksheets.
 header : int, list of int, default 0
     Row (0-indexed) to use for the column labels of the parsed
@@ -155,21 +155,21 @@
     Returns a subset of the columns according to behavior above.
 dtype : Type name or dict of column -> type, default None
     Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}}
-    Use `object` to preserve data as stored in Excel and not interpret dtype,
-    which will necessarily result in `object` dtype.
+    Use ``object`` to preserve data as stored in Excel and not interpret dtype,
+    which will necessarily result in ``object`` dtype.
     If converters are specified, they will be applied INSTEAD
     of dtype conversion.
-    If you use `None`, it will infer the dtype of each column based on the data.
+    If you use ``None``, it will infer the dtype of each column based on the data.
 engine : str, default None
     If io is not a buffer or path, this must be set to identify io.
     Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", "calamine".
     Engine compatibility :
-    - "xlrd" supports old-style Excel files (.xls).
-    - "openpyxl" supports newer Excel file formats.
-    - "odf" supports OpenDocument file formats (.odf, .ods, .odt).
-    - "pyxlsb" supports Binary Excel files.
-    - "calamine" supports Excel (.xls, .xlsx, .xlsm, .xlsb)
+    - ``xlrd`` supports old-style Excel files (.xls).
+    - ``openpyxl`` supports newer Excel file formats.
+    - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
+    - ``pyxlsb`` supports Binary Excel files.
+    - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb)
       and OpenDocument (.ods) file formats.
     .. versionchanged:: 1.2.0
@@ -215,34 +215,34 @@
     + """'.
 keep_default_na : bool, default True
     Whether or not to include the default NaN values when parsing the data.
-    Depending on whether `na_values` is passed in, the behavior is as follows:
+    Depending on whether ``na_values`` is passed in, the behavior is as follows:
-    * If `keep_default_na` is True, and `na_values` are specified, `na_values`
-      is appended to the default NaN values used for parsing.
-    * If `keep_default_na` is True, and `na_values` are not specified, only
+    * If ``keep_default_na`` is True, and ``na_values`` are specified,
+      ``na_values`` is appended to the default NaN values used for parsing.
+    * If ``keep_default_na`` is True, and ``na_values`` are not specified, only
      the default NaN values are used for parsing.
-    * If `keep_default_na` is False, and `na_values` are specified, only
-      the NaN values specified `na_values` are used for parsing. 
- * If `keep_default_na` is False, and `na_values` are not specified, no + * If ``keep_default_na`` is False, and ``na_values`` are specified, only + the NaN values specified ``na_values`` are used for parsing. + * If ``keep_default_na`` is False, and ``na_values`` are not specified, no strings will be parsed as NaN. - Note that if `na_filter` is passed in as False, the `keep_default_na` and - `na_values` parameters will be ignored. + Note that if `na_filter` is passed in as False, the ``keep_default_na`` and + ``na_values`` parameters will be ignored. na_filter : bool, default True Detect missing value markers (empty strings and the value of na_values). In - data without any NAs, passing na_filter=False can improve the performance - of reading a large file. + data without any NAs, passing ``na_filter=False`` can improve the + performance of reading a large file. verbose : bool, default False Indicate number of NA values placed in non-numeric columns. parse_dates : bool, list-like, or dict, default False The behavior is as follows: - * bool. If True -> try parsing the index. - * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + * ``bool``. If True -> try parsing the index. + * ``list`` of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column. - * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as + * ``list`` of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as a single date column. - * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call + * ``dict``, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call result 'foo' If a column or index contains an unparsable date, the entire column or @@ -372,7 +372,8 @@ 1 NaN 2 2 #Comment 3 -Comment lines in the excel input file can be skipped using the `comment` kwarg +Comment lines in the excel input file can be skipped using the +``comment`` kwarg. >>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP Name Value diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 39d5b45862a8f..7d5c354aef002 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -4082,8 +4082,9 @@ def css_calc(x, left: float, right: float, align: str, color: str | list | tuple return ret values = data.to_numpy() - left = np.nanmin(values) if vmin is None else vmin - right = np.nanmax(values) if vmax is None else vmax + # A tricky way to address the issue where np.nanmin/np.nanmax fail to handle pd.NA. 
+ left = np.nanmin(data.min(skipna=True)) if vmin is None else vmin + right = np.nanmax(data.max(skipna=True)) if vmax is None else vmax z: float = 0 # adjustment to translate data if align == "mid": diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 1c79392d54771..66a7ccacf675b 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -296,18 +296,8 @@ def read(self) -> DataFrame: dtype_mapping[pa.null()] = pd.Int64Dtype() frame = table.to_pandas(types_mapper=dtype_mapping.get) elif using_pyarrow_string_dtype(): - - def types_mapper(dtype): - dtype_dict = self.kwds["dtype"] - if dtype_dict is not None and dtype_dict.get(dtype, None) is not None: - return dtype_dict.get(dtype) - return arrow_string_types_mapper()(dtype) - - frame = table.to_pandas(types_mapper=types_mapper) + frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) else: - if isinstance(self.kwds.get("dtype"), dict): - frame = table.to_pandas(types_mapper=self.kwds["dtype"].get) - else: - frame = table.to_pandas() + frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 66990de6d3b89..2f9243c895ae8 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -5,7 +5,10 @@ """ from __future__ import annotations -from collections import abc +from collections import ( + abc, + defaultdict, +) import csv import sys from textwrap import fill @@ -23,6 +26,8 @@ import numpy as np +from pandas._config import using_copy_on_write + from pandas._libs import lib from pandas._libs.parsers import STR_NA_VALUES from pandas.errors import ( @@ -38,8 +43,10 @@ is_float, is_integer, is_list_like, + pandas_dtype, ) +from pandas import Series from pandas.core.frame import DataFrame from pandas.core.indexes.api import RangeIndex from pandas.core.shared_docs import _shared_docs @@ -1846,7 +1853,40 @@ def read(self, nrows: int | None = None) -> DataFrame: else: new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + if hasattr(self, "orig_options"): + dtype_arg = self.orig_options.get("dtype", None) + else: + dtype_arg = None + + if isinstance(dtype_arg, dict): + dtype = defaultdict(lambda: None) # type: ignore[var-annotated] + dtype.update(dtype_arg) + elif dtype_arg is not None and pandas_dtype(dtype_arg) in ( + np.str_, + np.object_, + ): + dtype = defaultdict(lambda: dtype_arg) + else: + dtype = None + + if dtype is not None: + new_col_dict = {} + for k, v in col_dict.items(): + d = ( + dtype[k] + if pandas_dtype(dtype[k]) in (np.str_, np.object_) + else None + ) + new_col_dict[k] = Series(v, index=index, dtype=d, copy=False) + else: + new_col_dict = col_dict + + df = DataFrame( + new_col_dict, + columns=columns, + index=index, + copy=not using_copy_on_write(), + ) self._currow += new_rows return df diff --git a/pandas/io/sql.py b/pandas/io/sql.py index d4b6602d9f0eb..a83c2bf241450 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -680,7 +680,7 @@ def read_sql( pandas now supports reading via ADBC drivers - >>> from adbc_driver_postgresql import dbapi + >>> from adbc_driver_postgresql import dbapi # doctest:+SKIP >>> with dbapi.connect('postgres:///db_name') as conn: # doctest:+SKIP ... 
pd.read_sql('SELECT int_column FROM test_data', conn) int_column diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index a017787f2dc2d..bd04e812e0be1 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -241,10 +241,10 @@ def hist_frame( .. plot:: :context: close-figs - >>> df = pd.DataFrame({ - ... 'length': [1.5, 0.5, 1.2, 0.9, 3], - ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1] - ... }, index=['pig', 'rabbit', 'duck', 'chicken', 'horse']) + >>> data = {'length': [1.5, 0.5, 1.2, 0.9, 3], + ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1]} + >>> index = ['pig', 'rabbit', 'duck', 'chicken', 'horse'] + >>> df = pd.DataFrame(data, index=index) >>> hist = df.hist(bins=3) """ plot_backend = _get_plot_backend(backend) @@ -607,10 +607,10 @@ def boxplot_frame_groupby( >>> import itertools >>> tuples = [t for t in itertools.product(range(1000), range(4))] >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1']) - >>> data = np.random.randn(len(index),4) + >>> data = np.random.randn(len(index), 4) >>> df = pd.DataFrame(data, columns=list('ABCD'), index=index) >>> grouped = df.groupby(level='lvl1') - >>> grouped.boxplot(rot=45, fontsize=12, figsize=(8,10)) # doctest: +SKIP + >>> grouped.boxplot(rot=45, fontsize=12, figsize=(8, 10)) # doctest: +SKIP The ``subplots=False`` option shows the boxplots in a single figure. @@ -1400,9 +1400,7 @@ def hist( .. plot:: :context: close-figs - >>> df = pd.DataFrame( - ... np.random.randint(1, 7, 6000), - ... columns = ['one']) + >>> df = pd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 5e5a55b4d0a98..18db460d388a4 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -439,7 +439,7 @@ def bootstrap_plot( :context: close-figs >>> s = pd.Series(np.random.uniform(size=100)) - >>> pd.plotting.bootstrap_plot(s) + >>> pd.plotting.bootstrap_plot(s) # doctest: +SKIP
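# --- editor's note: illustrative sketch, not part of the patch ----------------
# The readers.py change above: when the caller requests a str/object dtype
# (globally or per column), each parsed column is wrapped in a Series with
# that dtype before the DataFrame is assembled, avoiding a re-inference pass
# and, under copy-on-write, an extra copy.
from io import StringIO

import pandas as pd

df = pd.read_csv(StringIO("a,b\n1,x\n2,y"), dtype={"a": str})
print(df.dtypes)  # "a" keeps the requested string/object dtype; "b" is inferred
# ------------------------------------------------------------------------------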
""" plot_backend = _get_plot_backend("matplotlib") diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 7d27f940daa4c..4ffd76722286a 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -10,10 +10,13 @@ from pandas._config import using_pyarrow_string_dtype +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Series, Timestamp, + option_context, ) import pandas._testing as tm from pandas.core import ops @@ -33,20 +36,24 @@ def test_comparison_object_numeric_nas(self, comparison_op): expected = func(ser.astype(float), shifted.astype(float)) tm.assert_series_equal(result, expected) - def test_object_comparisons(self): - ser = Series(["a", "b", np.nan, "c", "a"]) + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_object_comparisons(self, infer_string): + with option_context("future.infer_string", infer_string): + ser = Series(["a", "b", np.nan, "c", "a"]) - result = ser == "a" - expected = Series([True, False, False, False, True]) - tm.assert_series_equal(result, expected) + result = ser == "a" + expected = Series([True, False, False, False, True]) + tm.assert_series_equal(result, expected) - result = ser < "a" - expected = Series([False, False, False, False, False]) - tm.assert_series_equal(result, expected) + result = ser < "a" + expected = Series([False, False, False, False, False]) + tm.assert_series_equal(result, expected) - result = ser != "a" - expected = -(ser == "a") - tm.assert_series_equal(result, expected) + result = ser != "a" + expected = -(ser == "a") + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", [None, object]) def test_more_na_comparisons(self, dtype): diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py index 197e83121567e..0c4fcf149eb20 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -90,9 +90,16 @@ def test_op_int8(left_array, right_array, opname): # ----------------------------------------------------------------------------- -def test_error_invalid_values(data, all_arithmetic_operators): +def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): # invalid ops + if using_infer_string: + import pyarrow as pa + + err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) + else: + err = TypeError + op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) @@ -110,9 +117,10 @@ def test_error_invalid_values(data, all_arithmetic_operators): [ r"unsupported operand type\(s\) for", "Concatenation operation is not implemented for NumPy arrays", + "has no kernel", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(err, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes @@ -123,7 +131,9 @@ def test_error_invalid_values(data, all_arithmetic_operators): r"unsupported operand type\(s\) for", "can only concatenate str", "not all arguments converted during string formatting", + "has no kernel", + "not implemented", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(err, match=msg): ops(pd.Series("foo", index=s.index)) diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index 7fba150c9113f..a2a53af6ab1ad 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ 
b/pandas/tests/arrays/categorical/test_astype.py
@@ -89,7 +89,7 @@ def test_astype(self, ordered):
         expected = np.array(cat)
         tm.assert_numpy_array_equal(result, expected)
-        msg = r"Cannot cast object dtype to float64"
+        msg = r"Cannot cast object|string dtype to float64"
         with pytest.raises(ValueError, match=msg):
             cat.astype(float)
diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
index e25e31e2f2e9e..50aaa42e09f22 100644
--- a/pandas/tests/arrays/categorical/test_constructors.py
+++ b/pandas/tests/arrays/categorical/test_constructors.py
@@ -6,6 +6,8 @@
 import numpy as np
 import pytest
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas.core.dtypes.common import (
     is_float_dtype,
     is_integer_dtype,
@@ -447,6 +449,7 @@ def test_constructor_str_unknown(self):
         with pytest.raises(ValueError, match="Unknown dtype"):
             Categorical([1, 2], dtype="foo")
+    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings")
     def test_constructor_np_strs(self):
         # GH#31499 Hashtable.map_locations needs to work on np.str_ objects
         cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])
diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py
index a1e50917fed98..16b941eab4830 100644
--- a/pandas/tests/arrays/categorical/test_operators.py
+++ b/pandas/tests/arrays/categorical/test_operators.py
@@ -92,7 +92,7 @@ def test_comparisons(self, factor):
             cat > cat_unordered
         # comparison (in both directions) with Series will raise
-        s = Series(["b", "b", "b"])
+        s = Series(["b", "b", "b"], dtype=object)
         msg = (
             "Cannot compare a Categorical for op __gt__ with type "
             r"<class 'numpy\.ndarray'>"
         )
@@ -108,7 +108,7 @@ def test_comparisons(self, factor):
         # comparison with numpy.array will raise in both direction, but only on
         # newer numpy versions
-        a = np.array(["b", "b", "b"])
+        a = np.array(["b", "b", "b"], dtype=object)
         with pytest.raises(TypeError, match=msg):
             cat > a
         with pytest.raises(TypeError, match=msg):
@@ -248,7 +248,7 @@ def test_comparisons(self, data, reverse, base):
         cat_base = Series(
             Categorical(base, categories=cat.cat.categories, ordered=True)
         )
-        s = Series(base)
+        s = Series(base, dtype=object if base == list("bbb") else None)
         a = np.array(base)
         # comparisons need to take categories ordering into account
diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py
index 0611d04d36d10..3c677142846d7 100644
--- a/pandas/tests/arrays/categorical/test_replace.py
+++ b/pandas/tests/arrays/categorical/test_replace.py
@@ -31,6 +31,9 @@
         ([1, 2, "3"], "5", ["5", "5", 3], True),
     ],
 )
+@pytest.mark.filterwarnings(
+    "ignore:.*with CategoricalDtype is deprecated:FutureWarning"
+)
 def test_replace_categorical_series(to_replace, value, expected, flip_categories):
     # GH 31720
@@ -60,7 +63,13 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg):
     # GH#26988
     cat = Categorical(["a", "b"])
     expected = Categorical(result)
-    result = pd.Series(cat, copy=False).replace(to_replace, value)._values
+    msg = (
+        r"The behavior of Series\.replace \(and DataFrame.replace\) "
+        "with CategoricalDtype"
+    )
+    warn = FutureWarning if expected_error_msg is not None else None
+    with tm.assert_produces_warning(warn, match=msg):
+        result = pd.Series(cat, copy=False).replace(to_replace, value)._values
     tm.assert_categorical_equal(result, expected)
     if to_replace == "b":  # the "c" test is supposed to be unchanged
@@ -69,14 +78,20 
@@ def test_replace_categorical(to_replace, value, result, expected_error_msg): tm.assert_categorical_equal(cat, expected) ser = pd.Series(cat, copy=False) - ser.replace(to_replace, value, inplace=True) + with tm.assert_produces_warning(warn, match=msg): + ser.replace(to_replace, value, inplace=True) tm.assert_categorical_equal(cat, expected) def test_replace_categorical_ea_dtype(): # GH49404 cat = Categorical(pd.array(["a", "b"], dtype="string")) - result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values expected = Categorical(pd.array(["c", pd.NA], dtype="string")) tm.assert_categorical_equal(result, expected) @@ -85,7 +100,12 @@ def test_replace_maintain_ordering(): # GH51016 dtype = pd.CategoricalDtype([0, 1, 2], ordered=True) ser = pd.Series([0, 1, 2], dtype=dtype) - result = ser.replace(0, 2) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.replace(0, 2) expected_dtype = pd.CategoricalDtype([1, 2], ordered=True) expected = pd.Series([2, 1, 2], dtype=expected_dtype) tm.assert_series_equal(expected, result, check_category_order=True) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index dca171bf81047..d6f93fbbd912f 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -1,9 +1,13 @@ import numpy as np +import pytest + +from pandas._config import using_pyarrow_string_dtype from pandas import ( Categorical, CategoricalDtype, CategoricalIndex, + Index, Series, date_range, option_context, @@ -13,11 +17,17 @@ class TestCategoricalReprWithFactor: - def test_print(self, factor): - expected = [ - "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", - "Categories (3, object): ['a' < 'b' < 'c']", - ] + def test_print(self, factor, using_infer_string): + if using_infer_string: + expected = [ + "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", + "Categories (3, string): [a < b < c]", + ] + else: + expected = [ + "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", + "Categories (3, object): ['a' < 'b' < 'c']", + ] expected = "\n".join(expected) actual = repr(factor) assert actual == expected @@ -26,7 +36,7 @@ def test_print(self, factor): class TestCategoricalRepr: def test_big_print(self): codes = np.array([0, 1, 2, 0, 1, 2] * 100) - dtype = CategoricalDtype(categories=["a", "b", "c"]) + dtype = CategoricalDtype(categories=Index(["a", "b", "c"], dtype=object)) factor = Categorical.from_codes(codes, dtype=dtype) expected = [ "['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']", @@ -40,13 +50,13 @@ def test_big_print(self): assert actual == expected def test_empty_print(self): - factor = Categorical([], ["a", "b", "c"]) + factor = Categorical([], Index(["a", "b", "c"], dtype=object)) expected = "[], Categories (3, object): ['a', 'b', 'c']" actual = repr(factor) assert actual == expected assert expected == actual - factor = Categorical([], ["a", "b", "c"], ordered=True) + factor = Categorical([], Index(["a", "b", "c"], dtype=object), ordered=True) expected = "[], Categories (3, object): ['a' < 'b' < 'c']" actual = repr(factor) assert expected == actual @@ -66,6 +76,10 @@ def test_print_none_width(self): with option_context("display.width", 
diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py
index dca171bf81047..d6f93fbbd912f 100644
--- a/pandas/tests/arrays/categorical/test_repr.py
+++ b/pandas/tests/arrays/categorical/test_repr.py
@@ -1,9 +1,13 @@
 import numpy as np
+import pytest
+
+from pandas._config import using_pyarrow_string_dtype
 
 from pandas import (
     Categorical,
     CategoricalDtype,
     CategoricalIndex,
+    Index,
     Series,
     date_range,
     option_context,
@@ -13,11 +17,17 @@
 
 
 class TestCategoricalReprWithFactor:
-    def test_print(self, factor):
-        expected = [
-            "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
-            "Categories (3, object): ['a' < 'b' < 'c']",
-        ]
+    def test_print(self, factor, using_infer_string):
+        if using_infer_string:
+            expected = [
+                "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
+                "Categories (3, string): [a < b < c]",
+            ]
+        else:
+            expected = [
+                "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
+                "Categories (3, object): ['a' < 'b' < 'c']",
+            ]
         expected = "\n".join(expected)
         actual = repr(factor)
         assert actual == expected
@@ -26,7 +36,7 @@ def test_print(self, factor):
 class TestCategoricalRepr:
     def test_big_print(self):
         codes = np.array([0, 1, 2, 0, 1, 2] * 100)
-        dtype = CategoricalDtype(categories=["a", "b", "c"])
+        dtype = CategoricalDtype(categories=Index(["a", "b", "c"], dtype=object))
         factor = Categorical.from_codes(codes, dtype=dtype)
         expected = [
             "['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']",
@@ -40,13 +50,13 @@ def test_big_print(self):
         assert actual == expected
 
     def test_empty_print(self):
-        factor = Categorical([], ["a", "b", "c"])
+        factor = Categorical([], Index(["a", "b", "c"], dtype=object))
         expected = "[], Categories (3, object): ['a', 'b', 'c']"
         actual = repr(factor)
         assert actual == expected
         assert expected == actual
 
-        factor = Categorical([], ["a", "b", "c"], ordered=True)
+        factor = Categorical([], Index(["a", "b", "c"], dtype=object), ordered=True)
         expected = "[], Categories (3, object): ['a' < 'b' < 'c']"
         actual = repr(factor)
         assert expected == actual
@@ -66,6 +76,10 @@ def test_print_none_width(self):
         with option_context("display.width", None):
             assert exp == repr(a)
 
+    @pytest.mark.skipif(
+        using_pyarrow_string_dtype(),
+        reason="Change once infer_string is set to True by default",
+    )
     def test_unicode_print(self):
         c = Categorical(["aaaaa", "bb", "cccc"] * 20)
         expected = """\
diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py
index 056c22d8c1131..ba081bd01062a 100644
--- a/pandas/tests/arrays/floating/test_arithmetic.py
+++ b/pandas/tests/arrays/floating/test_arithmetic.py
@@ -122,11 +122,18 @@ def test_arith_zero_dim_ndarray(other):
 # -----------------------------------------------------------------------------
 
 
-def test_error_invalid_values(data, all_arithmetic_operators):
+def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
     op = all_arithmetic_operators
     s = pd.Series(data)
     ops = getattr(s, op)
 
+    if using_infer_string:
+        import pyarrow as pa
+
+        errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
+    else:
+        errs = TypeError
+
     # invalid scalars
     msg = "|".join(
         [
@@ -140,15 +147,17 @@ def test_error_invalid_values(data, all_arithmetic_operators):
             "ufunc '.*' not supported for the input types, and the inputs could not",
             "ufunc '.*' did not contain a loop with signature matching types",
             "Concatenation operation is not implemented for NumPy arrays",
+            "has no kernel",
+            "not implemented",
         ]
     )
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops("foo")
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops(pd.Timestamp("20180101"))
 
     # invalid array-likes
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops(pd.Series("foo", index=s.index))
 
     msg = "|".join(
@@ -167,9 +176,11 @@ def test_error_invalid_values(data, all_arithmetic_operators):
             ),
             r"ufunc 'add' cannot use operands with types dtype\('float\d{2}'\)",
             "cannot subtract DatetimeArray from ndarray",
+            "has no kernel",
+            "not implemented",
         ]
     )
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops(pd.Series(pd.date_range("20180101", periods=len(s))))
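The widened `errs` tuple above reflects that, once `future.infer_string` is enabled, the string operand is pyarrow-backed, so invalid arithmetic can surface from the Arrow layer rather than as a plain TypeError. A hedged sketch of the idea (assumes pandas >= 2.1 with pyarrow installed; the exact exception type depends on the operator):

import pandas as pd
import pyarrow as pa

pd.set_option("future.infer_string", True)
s = pd.Series([1.0, 2.0], dtype="Float64")
try:
    s - pd.Series(["foo", "bar"])  # the string column is Arrow-backed now
except (TypeError, NotImplementedError, pa.lib.ArrowNotImplementedError) as err:
    print(type(err).__name__)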
diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py
index ce6c245cd0f37..d979dd445a61a 100644
--- a/pandas/tests/arrays/integer/test_arithmetic.py
+++ b/pandas/tests/arrays/integer/test_arithmetic.py
@@ -172,11 +172,18 @@ def test_numpy_zero_dim_ndarray(other):
 # -----------------------------------------------------------------------------
 
 
-def test_error_invalid_values(data, all_arithmetic_operators):
+def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
     op = all_arithmetic_operators
     s = pd.Series(data)
     ops = getattr(s, op)
 
+    if using_infer_string:
+        import pyarrow as pa
+
+        errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
+    else:
+        errs = TypeError
+
     # invalid scalars
     msg = "|".join(
         [
@@ -188,20 +195,26 @@ def test_error_invalid_values(data, all_arithmetic_operators):
             "ufunc '.*' not supported for the input types, and the inputs could not",
             "ufunc '.*' did not contain a loop with signature matching types",
             "Addition/subtraction of integers and integer-arrays with Timestamp",
+            "has no kernel",
+            "not implemented",
         ]
     )
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops("foo")
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops(pd.Timestamp("20180101"))
 
     # invalid array-likes
     str_ser = pd.Series("foo", index=s.index)
     # with pytest.raises(TypeError, match=msg):
-    if all_arithmetic_operators in [
-        "__mul__",
-        "__rmul__",
-    ]:  # (data[~data.isna()] >= 0).all():
+    if (
+        all_arithmetic_operators
+        in [
+            "__mul__",
+            "__rmul__",
+        ]
+        and not using_infer_string
+    ):  # (data[~data.isna()] >= 0).all():
         res = ops(str_ser)
         expected = pd.Series(["foo" * x for x in data], index=s.index)
         expected = expected.fillna(np.nan)
@@ -210,7 +223,7 @@ def test_error_invalid_values(data, all_arithmetic_operators):
         # more-correct than np.nan here.
         tm.assert_series_equal(res, expected)
     else:
-        with pytest.raises(TypeError, match=msg):
+        with pytest.raises(errs, match=msg):
             ops(str_ser)
 
     msg = "|".join(
@@ -223,9 +236,11 @@ def test_error_invalid_values(data, all_arithmetic_operators):
             r"can only concatenate str \(not \"int\"\) to str",
             "not all arguments converted during string",
             "cannot subtract DatetimeArray from ndarray",
+            "has no kernel",
+            "not implemented",
         ]
    )
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops(pd.Series(pd.date_range("20180101", periods=len(s))))
diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py
index 1c91cd25ba69c..db04862e4ea07 100644
--- a/pandas/tests/arrays/integer/test_reduction.py
+++ b/pandas/tests/arrays/integer/test_reduction.py
@@ -102,7 +102,9 @@ def test_groupby_reductions(op, expected):
         ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")],
     ],
 )
-def test_mixed_reductions(op, expected):
+def test_mixed_reductions(op, expected, using_infer_string):
+    if op in ["any", "all"] and using_infer_string:
+        expected = expected.astype("bool")
     df = DataFrame(
         {
             "A": ["a", "b", "b"],
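Why `__mul__`/`__rmul__` are special-cased above: with NumPy object-dtype strings, integer-by-string multiplication falls back to Python's `str.__mul__`, which the test keeps exercising only when infer_string is off. Illustrative only (plain pandas, no PR code):

import pandas as pd

ints = pd.Series([1, 2, 3], dtype="Int64")
strs = pd.Series(["foo"] * 3, dtype=object)
# elementwise repetition via Python's str * int
print((ints * strs).tolist())  # ['foo', 'foofoo', 'foofoofoo']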
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 524a6632e5544..d015e899c4231 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -2,6 +2,8 @@
 This module tests the functionality of StringArray and ArrowStringArray.
 Tests for the str accessors are in pandas/tests/strings/test_string_array.py
 """
+import operator
+
 import numpy as np
 import pytest
 
@@ -176,12 +178,7 @@ def test_add_sequence(dtype):
     tm.assert_extension_array_equal(result, expected)
 
 
-def test_mul(dtype, request, arrow_string_storage):
-    if dtype.storage in arrow_string_storage:
-        reason = "unsupported operand type(s) for *: 'ArrowStringArray' and 'int'"
-        mark = pytest.mark.xfail(raises=NotImplementedError, reason=reason)
-        request.applymarker(mark)
-
+def test_mul(dtype):
     a = pd.array(["a", "b", None], dtype=dtype)
     result = a * 2
     expected = pd.array(["aa", "bb", None], dtype=dtype)
@@ -194,7 +191,7 @@ def test_mul(dtype, request, arrow_string_storage):
 @pytest.mark.xfail(reason="GH-28527")
 def test_add_strings(dtype):
     arr = pd.array(["a", "b", "c", "d"], dtype=dtype)
-    df = pd.DataFrame([["t", "y", "v", "w"]])
+    df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object)
     assert arr.__add__(df) is NotImplemented
 
     result = arr + df
@@ -229,7 +226,10 @@ def test_comparison_methods_scalar(comparison_op, dtype):
     result = getattr(a, op_name)(other)
     if dtype.storage == "pyarrow_numpy":
         expected = np.array([getattr(item, op_name)(other) for item in a])
-        expected[1] = False
+        if comparison_op == operator.ne:
+            expected[1] = True
+        else:
+            expected[1] = False
         tm.assert_numpy_array_equal(result, expected.astype(np.bool_))
     else:
         expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
@@ -244,7 +244,10 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype):
     result = getattr(a, op_name)(pd.NA)
 
     if dtype.storage == "pyarrow_numpy":
-        expected = np.array([False, False, False])
+        if operator.ne == comparison_op:
+            expected = np.array([True, True, True])
+        else:
+            expected = np.array([False, False, False])
         tm.assert_numpy_array_equal(result, expected)
     else:
         expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
@@ -270,7 +273,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype):
     if dtype.storage == "pyarrow_numpy":
         expected_data = {
             "__eq__": [False, False, False],
-            "__ne__": [True, False, True],
+            "__ne__": [True, True, True],
         }[op_name]
         expected = np.array(expected_data)
         tm.assert_numpy_array_equal(result, expected)
@@ -290,12 +293,18 @@ def test_comparison_methods_array(comparison_op, dtype):
     other = [None, None, "c"]
     result = getattr(a, op_name)(other)
     if dtype.storage == "pyarrow_numpy":
-        expected = np.array([False, False, False])
-        expected[-1] = getattr(other[-1], op_name)(a[-1])
+        if operator.ne == comparison_op:
+            expected = np.array([True, True, False])
+        else:
+            expected = np.array([False, False, False])
+            expected[-1] = getattr(other[-1], op_name)(a[-1])
         tm.assert_numpy_array_equal(result, expected)
 
         result = getattr(a, op_name)(pd.NA)
-        expected = np.array([False, False, False])
+        if operator.ne == comparison_op:
+            expected = np.array([True, True, True])
+        else:
+            expected = np.array([False, False, False])
         tm.assert_numpy_array_equal(result, expected)
 
     else:
@@ -489,10 +498,17 @@ def test_arrow_array(dtype):
 
 
 @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
-def test_arrow_roundtrip(dtype, string_storage2):
+def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
     # roundtrip possible from arrow 1.0.0
     pa = pytest.importorskip("pyarrow")
 
+    if using_infer_string and string_storage2 != "pyarrow_numpy":
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="infer_string takes precedence over string storage"
+            )
+        )
+
     data = pd.array(["a", "b", None], dtype=dtype)
     df = pd.DataFrame({"a": data})
     table = pa.table(df)
@@ -507,10 +523,19 @@ def test_arrow_roundtrip(dtype, string_storage2):
 
 
 @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
-def test_arrow_load_from_zero_chunks(dtype, string_storage2):
+def test_arrow_load_from_zero_chunks(
+    dtype, string_storage2, request, using_infer_string
+):
     # GH-41040
     pa = pytest.importorskip("pyarrow")
 
+    if using_infer_string and string_storage2 != "pyarrow_numpy":
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="infer_string takes precedence over string storage"
+            )
+        )
+
     data = pd.array([], dtype=dtype)
     df = pd.DataFrame({"a": data})
     table = pa.table(df)
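The `operator.ne` branches above encode the NaN-style comparison semantics of the "pyarrow_numpy" storage: a missing value compares unequal under `!=` and equal-to-nothing under `==`, instead of propagating pd.NA. A small demonstration (assumes pandas >= 2.1 where this storage exists):

import pandas as pd

s = pd.Series(["a", None, "c"], dtype="string[pyarrow_numpy]")
print((s == "a").tolist())  # [True, False, False]
print((s != "a").tolist())  # [False, True, True] -- the missing slot is True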
data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -507,10 +523,19 @@ def test_arrow_roundtrip(dtype, string_storage2): @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_load_from_zero_chunks(dtype, string_storage2): +def test_arrow_load_from_zero_chunks( + dtype, string_storage2, request, using_infer_string +): # GH-41040 pa = pytest.importorskip("pyarrow") + if using_infer_string and string_storage2 != "pyarrow_numpy": + request.applymarker( + pytest.mark.xfail( + reason="infer_string takes precedence over string storage" + ) + ) + data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index a801a845bc7be..a022dfffbdd2b 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -26,7 +26,9 @@ def test_eq_all_na(): tm.assert_extension_array_equal(result, expected) -def test_config(string_storage): +def test_config(string_storage, request, using_infer_string): + if using_infer_string and string_storage != "pyarrow_numpy": + request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) @@ -101,7 +103,7 @@ def test_constructor_from_list(): assert result.dtype.storage == "pyarrow" -def test_from_sequence_wrong_dtype_raises(): +def test_from_sequence_wrong_dtype_raises(using_infer_string): pytest.importorskip("pyarrow") with pd.option_context("string_storage", "python"): ArrowStringArray._from_sequence(["a", None, "c"], dtype="string") @@ -114,15 +116,19 @@ def test_from_sequence_wrong_dtype_raises(): ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]") - with pytest.raises(AssertionError, match=None): - with pd.option_context("string_storage", "python"): - ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + if not using_infer_string: + with pytest.raises(AssertionError, match=None): + with pd.option_context("string_storage", "python"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) with pd.option_context("string_storage", "pyarrow"): ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) - with pytest.raises(AssertionError, match=None): - ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python")) + if not using_infer_string: + with pytest.raises(AssertionError, match=None): + ArrowStringArray._from_sequence( + ["a", None, "c"], dtype=StringDtype("python") + ) ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow")) @@ -137,13 +143,15 @@ def test_from_sequence_wrong_dtype_raises(): with pytest.raises(AssertionError, match=None): StringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]") - with pd.option_context("string_storage", "python"): - StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) - - with pytest.raises(AssertionError, match=None): - with pd.option_context("string_storage", "pyarrow"): + if not using_infer_string: + with pd.option_context("string_storage", "python"): StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + if not using_infer_string: + with pytest.raises(AssertionError, match=None): + with pd.option_context("string_storage", "pyarrow"): + 
StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python")) with pytest.raises(AssertionError, match=None): diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index e2b8ebcb79a3b..b0ec2787097f0 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -440,7 +440,7 @@ def test_array_unboxes(index_or_series): def test_array_to_numpy_na(): # GH#40638 - arr = pd.array([pd.NA, 1], dtype="string") + arr = pd.array([pd.NA, 1], dtype="string[python]") result = arr.to_numpy(na_value=True, dtype=bool) expected = np.array([True, True]) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/copy_view/test_chained_assignment_deprecation.py b/pandas/tests/copy_view/test_chained_assignment_deprecation.py index b51a5920917d6..80e38380ed27c 100644 --- a/pandas/tests/copy_view/test_chained_assignment_deprecation.py +++ b/pandas/tests/copy_view/test_chained_assignment_deprecation.py @@ -35,7 +35,7 @@ def test_methods_iloc_warn(using_copy_on_write): @pytest.mark.parametrize( "func, args", [ - ("replace", (1, 5)), + ("replace", (4, 5)), ("fillna", (1,)), ("interpolate", ()), ("bfill", ()), @@ -43,26 +43,30 @@ def test_methods_iloc_warn(using_copy_on_write): ], ) def test_methods_iloc_getitem_item_cache(func, args, using_copy_on_write): - df = DataFrame({"a": [1, 2, 3], "b": 1}) + # ensure we don't incorrectly raise chained assignment warning because + # of the item cache / iloc not setting the item cache + df_orig = DataFrame({"a": [1, 2, 3], "b": 1}) + + df = df_orig.copy() ser = df.iloc[:, 0] - # TODO(CoW-warn) should warn about updating a view getattr(ser, func)(*args, inplace=True) # parent that holds item_cache is dead, so don't increase ref count + df = df_orig.copy() ser = df.copy()["a"] getattr(ser, func)(*args, inplace=True) - df = df.copy() - + df = df_orig.copy() df["a"] # populate the item_cache ser = df.iloc[:, 0] # iloc creates a new object - ser.fillna(0, inplace=True) + getattr(ser, func)(*args, inplace=True) + df = df_orig.copy() df["a"] # populate the item_cache ser = df["a"] - ser.fillna(0, inplace=True) + getattr(ser, func)(*args, inplace=True) - df = df.copy() + df = df_orig.copy() df["a"] # populate the item_cache if using_copy_on_write: with tm.raises_chained_assignment_error(): diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 33b8ce218f029..6f3850ab64daa 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -1145,13 +1145,12 @@ def test_set_value_copy_only_necessary_column( view = df[:] if val == "a" and indexer[0] != slice(None): - # TODO(CoW-warn) assert the FutureWarning for CoW is also raised with tm.assert_produces_warning( FutureWarning, match="Setting an item of incompatible dtype is deprecated" ): indexer_func(df)[indexer] = val else: - with tm.assert_cow_warning(warn_copy_on_write): + with tm.assert_cow_warning(warn_copy_on_write and val == 100): indexer_func(df)[indexer] = val if using_copy_on_write: diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index 7849799eb2cc4..ddc5879a56d54 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -91,12 +91,13 @@ def test_interpolate_inplace_no_reference_no_copy(using_copy_on_write, vals): @pytest.mark.parametrize( "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), 
NaT, Timestamp("2020-12-31")]] ) -def test_interpolate_inplace_with_refs(using_copy_on_write, vals): +def test_interpolate_inplace_with_refs(using_copy_on_write, vals, warn_copy_on_write): df = DataFrame({"a": [1, np.nan, 2]}) df_orig = df.copy() arr = get_array(df, "a") view = df[:] - df.interpolate(method="linear", inplace=True) + with tm.assert_cow_warning(warn_copy_on_write): + df.interpolate(method="linear", inplace=True) if using_copy_on_write: # Check that copy was triggered in interpolate and that we don't @@ -109,6 +110,31 @@ def test_interpolate_inplace_with_refs(using_copy_on_write, vals): assert np.shares_memory(arr, get_array(df, "a")) +@pytest.mark.parametrize("func", ["ffill", "bfill"]) +@pytest.mark.parametrize("dtype", ["float64", "Float64"]) +def test_interp_fill_functions_inplace( + using_copy_on_write, func, warn_copy_on_write, dtype +): + # Check that these takes the same code paths as interpolate + df = DataFrame({"a": [1, np.nan, 2]}, dtype=dtype) + df_orig = df.copy() + arr = get_array(df, "a") + view = df[:] + + with tm.assert_cow_warning(warn_copy_on_write and dtype == "float64"): + getattr(df, func)(inplace=True) + + if using_copy_on_write: + # Check that copy was triggered in interpolate and that we don't + # have any references left + assert not np.shares_memory(arr, get_array(df, "a")) + tm.assert_frame_equal(df_orig, view) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) + else: + assert np.shares_memory(arr, get_array(df, "a")) is (dtype == "float64") + + def test_interpolate_cleaned_fill_method(using_copy_on_write): # Check that "method is set to None" case works correctly df = DataFrame({"a": ["a", np.nan, "c"], "b": 1}) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 1e3c95dbc1ca3..862aebdc70a9d 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1607,7 +1607,8 @@ def test_interpolate_creates_copy(using_copy_on_write, warn_copy_on_write): view = df[:] expected = df.copy() - df.ffill(inplace=True) + with tm.assert_cow_warning(warn_copy_on_write): + df.ffill(inplace=True) with tm.assert_cow_warning(warn_copy_on_write): df.iloc[0, 0] = 100.5 @@ -1938,8 +1939,8 @@ def func(ser): ser.iloc[0] = 100 return ser - # TODO(CoW-warn) should warn? 
diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py
index eb3b1a5ef68e8..6d16bc3083883 100644
--- a/pandas/tests/copy_view/test_replace.py
+++ b/pandas/tests/copy_view/test_replace.py
@@ -48,12 +48,13 @@ def test_replace(using_copy_on_write, replace_kwargs):
     tm.assert_frame_equal(df, df_orig)
 
 
-def test_replace_regex_inplace_refs(using_copy_on_write):
+def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write):
     df = DataFrame({"a": ["aaa", "bbb"]})
     df_orig = df.copy()
     view = df[:]
     arr = get_array(df, "a")
-    df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
+    with tm.assert_cow_warning(warn_copy_on_write):
+        df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
     if using_copy_on_write:
         assert not np.shares_memory(arr, get_array(df, "a"))
         assert df._mgr._has_no_reference(0)
@@ -161,13 +162,19 @@ def test_replace_to_replace_wrong_dtype(using_copy_on_write):
 def test_replace_list_categorical(using_copy_on_write):
     df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
     arr = get_array(df, "a")
-    df.replace(["c"], value="a", inplace=True)
+    msg = (
+        r"The behavior of Series\.replace \(and DataFrame.replace\) "
+        "with CategoricalDtype"
+    )
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        df.replace(["c"], value="a", inplace=True)
     assert np.shares_memory(arr.codes, get_array(df, "a").codes)
     if using_copy_on_write:
         assert df._mgr._has_no_reference(0)
 
     df_orig = df.copy()
-    df2 = df.replace(["b"], value="a")
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        df2 = df.replace(["b"], value="a")
     assert not np.shares_memory(arr.codes, get_array(df2, "a").codes)
 
     tm.assert_frame_equal(df, df_orig)
@@ -177,7 +184,12 @@ def test_replace_list_inplace_refs_categorical(using_copy_on_write):
     df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
     view = df[:]
     df_orig = df.copy()
-    df.replace(["c"], value="a", inplace=True)
+    msg = (
+        r"The behavior of Series\.replace \(and DataFrame.replace\) "
+        "with CategoricalDtype"
+    )
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        df.replace(["c"], value="a", inplace=True)
     if using_copy_on_write:
         assert not np.shares_memory(
             get_array(view, "a").codes, get_array(df, "a").codes
@@ -202,11 +214,12 @@ def test_replace_inplace(using_copy_on_write, to_replace):
 
 
 @pytest.mark.parametrize("to_replace", [1.5, [1.5]])
-def test_replace_inplace_reference(using_copy_on_write, to_replace):
+def test_replace_inplace_reference(using_copy_on_write, to_replace, warn_copy_on_write):
     df = DataFrame({"a": [1.5, 2, 3]})
     arr_a = get_array(df, "a")
     view = df[:]
-    df.replace(to_replace=to_replace, value=15.5, inplace=True)
+    with tm.assert_cow_warning(warn_copy_on_write):
+        df.replace(to_replace=to_replace, value=15.5, inplace=True)
 
     if using_copy_on_write:
         assert not np.shares_memory(get_array(df, "a"), arr_a)
@@ -236,7 +249,13 @@ def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_repl
     df_orig = df.copy()
     arr_a = get_array(df, "a")
     view = df[:]
-    df.replace(to_replace=to_replace, value=val, inplace=True)
+    msg = (
+        r"The behavior of Series\.replace \(and DataFrame.replace\) "
+        "with CategoricalDtype"
+    )
+    warn = FutureWarning if val == 1.5 else None
+    with tm.assert_produces_warning(warn, match=msg):
+        df.replace(to_replace=to_replace, value=val, inplace=True)
 
     if using_copy_on_write:
         assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes)
@@ -251,7 +270,13 @@ def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_repl
 def test_replace_categorical_inplace(using_copy_on_write, val):
     df = DataFrame({"a": Categorical([1, 2, 3])})
     arr_a = get_array(df, "a")
-    df.replace(to_replace=1, value=val, inplace=True)
+    msg = (
+        r"The behavior of Series\.replace \(and DataFrame.replace\) "
+        "with CategoricalDtype"
+    )
+    warn = FutureWarning if val == 1.5 else None
+    with tm.assert_produces_warning(warn, match=msg):
+        df.replace(to_replace=1, value=val, inplace=True)
 
     assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
     if using_copy_on_write:
@@ -265,7 +290,13 @@ def test_replace_categorical_inplace(using_copy_on_write, val):
 def test_replace_categorical(using_copy_on_write, val):
     df = DataFrame({"a": Categorical([1, 2, 3])})
     df_orig = df.copy()
-    df2 = df.replace(to_replace=1, value=val)
+    msg = (
+        r"The behavior of Series\.replace \(and DataFrame.replace\) "
+        "with CategoricalDtype"
+    )
+    warn = FutureWarning if val == 1.5 else None
+    with tm.assert_produces_warning(warn, match=msg):
+        df2 = df.replace(to_replace=1, value=val)
 
     if using_copy_on_write:
         assert df._mgr._has_no_reference(0)
@@ -354,12 +385,13 @@ def test_replace_list_none(using_copy_on_write):
     assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
 
 
-def test_replace_list_none_inplace_refs(using_copy_on_write):
+def test_replace_list_none_inplace_refs(using_copy_on_write, warn_copy_on_write):
     df = DataFrame({"a": ["a", "b", "c"]})
     arr = get_array(df, "a")
     df_orig = df.copy()
     view = df[:]
-    df.replace(["a"], value=None, inplace=True)
+    with tm.assert_cow_warning(warn_copy_on_write):
+        df.replace(["a"], value=None, inplace=True)
     if using_copy_on_write:
         assert df._mgr._has_no_reference(0)
         assert not np.shares_memory(arr, get_array(df, "a"))
@@ -431,7 +463,7 @@ def test_replace_listlike(using_copy_on_write):
     tm.assert_frame_equal(df, df_orig)
 
 
-def test_replace_listlike_inplace(using_copy_on_write):
+def test_replace_listlike_inplace(using_copy_on_write, warn_copy_on_write):
     df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
     arr = get_array(df, "a")
     df.replace([200, 2], [10, 11], inplace=True)
@@ -439,7 +471,8 @@ def test_replace_listlike_inplace(using_copy_on_write):
 
     view = df[:]
     df_orig = df.copy()
-    df.replace([200, 3], [10, 11], inplace=True)
+    with tm.assert_cow_warning(warn_copy_on_write):
+        df.replace([200, 3], [10, 11], inplace=True)
     if using_copy_on_write:
         assert not np.shares_memory(get_array(df, "a"), arr)
         tm.assert_frame_equal(view, df_orig)
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index 42043f95a7ace..c34c97b6e4f04 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -164,7 +164,9 @@ def get_is_dtype_funcs():
     return [getattr(com, fname) for fname in fnames]
 
 
-@pytest.mark.filterwarnings("ignore:is_categorical_dtype is deprecated:FutureWarning")
+@pytest.mark.filterwarnings(
+    "ignore:is_categorical_dtype is deprecated:DeprecationWarning"
+)
 @pytest.mark.parametrize("func", get_is_dtype_funcs(), ids=lambda x: x.__name__)
 def test_get_dtype_error_catch(func):
     # see gh-15941
@@ -180,7 +182,7 @@ def test_get_dtype_error_catch(func):
         or func is com.is_categorical_dtype
         or func is com.is_period_dtype
     ):
-        warn = FutureWarning
+        warn = DeprecationWarning
 
     with tm.assert_produces_warning(warn, match=msg):
         assert not func(None)
@@ -200,7 +202,7 @@ def test_is_object():
 )
 def test_is_sparse(check_scipy):
     msg = "is_sparse is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
         assert com.is_sparse(SparseArray([1, 2, 3]))
 
         assert not com.is_sparse(np.array([1, 2, 3]))
@@ -230,7 +232,7 @@ def test_is_datetime64_dtype():
 
 def test_is_datetime64tz_dtype():
     msg = "is_datetime64tz_dtype is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
         assert not com.is_datetime64tz_dtype(object)
         assert not com.is_datetime64tz_dtype([1, 2, 3])
         assert not com.is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3]))
@@ -246,7 +248,7 @@ def kind(self) -> str:
     not_tz_dtype = NotTZDtype()
 
     msg = "is_datetime64tz_dtype is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
         assert not com.is_datetime64tz_dtype(not_tz_dtype)
         assert not com.needs_i8_conversion(not_tz_dtype)
 
@@ -268,7 +270,7 @@ def test_is_timedelta64_dtype():
 
 def test_is_period_dtype():
     msg = "is_period_dtype is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
         assert not com.is_period_dtype(object)
         assert not com.is_period_dtype([1, 2, 3])
         assert not com.is_period_dtype(pd.Period("2017-01-01"))
@@ -279,7 +281,7 @@ def test_is_period_dtype():
 
 def test_is_interval_dtype():
     msg = "is_interval_dtype is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
         assert not com.is_interval_dtype(object)
         assert not com.is_interval_dtype([1, 2, 3])
 
@@ -292,7 +294,7 @@ def test_is_interval_dtype():
 
 def test_is_categorical_dtype():
     msg = "is_categorical_dtype is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
         assert not com.is_categorical_dtype(object)
         assert not com.is_categorical_dtype([1, 2, 3])
 
@@ -442,7 +444,7 @@ def test_is_not_unsigned_integer_dtype(dtype):
 )
 def test_is_int64_dtype(dtype):
     msg = "is_int64_dtype is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
         assert com.is_int64_dtype(dtype)
 
 
@@ -480,7 +482,7 @@ def test_type_comparison_with_signed_int_ea_dtype_and_signed_int_numpy_dtype(
 )
 def test_is_not_int64_dtype(dtype):
     msg = "is_int64_dtype is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
         assert not com.is_int64_dtype(dtype)
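The `FutureWarning` → `DeprecationWarning` swap running through these dtype-check hunks matters for end users: DeprecationWarning is hidden by default outside of tests and `__main__`. A quick way to observe the category (assumes pandas 2.2, where these `is_*_dtype` helpers changed their warning class):

import warnings

from pandas.api.types import is_int64_dtype

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    is_int64_dtype("int64")
print(caught[0].category.__name__)  # DeprecationWarning, no longer FutureWarning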
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
index 4e8f375b31674..0dad0b05303ad 100644
--- a/pandas/tests/dtypes/test_dtypes.py
+++ b/pandas/tests/dtypes/test_dtypes.py
@@ -166,7 +166,7 @@ def test_is_dtype(self, dtype):
 
     def test_basic(self, dtype):
         msg = "is_categorical_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_categorical_dtype(dtype)
 
             factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])
@@ -292,7 +292,7 @@ def test_subclass(self):
 
     def test_compat(self, dtype):
         msg = "is_datetime64tz_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_datetime64tz_dtype(dtype)
             assert is_datetime64tz_dtype("datetime64[ns, US/Eastern]")
         assert is_datetime64_any_dtype(dtype)
@@ -353,14 +353,14 @@ def test_equality(self, dtype):
 
     def test_basic(self, dtype):
         msg = "is_datetime64tz_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_datetime64tz_dtype(dtype)
 
         dr = date_range("20130101", periods=3, tz="US/Eastern")
         s = Series(dr, name="A")
 
         # dtypes
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_datetime64tz_dtype(s.dtype)
         assert is_datetime64tz_dtype(s)
         assert not is_datetime64tz_dtype(np.dtype("float64"))
@@ -531,7 +531,7 @@ def test_equality(self, dtype):
 
     def test_basic(self, dtype):
         msg = "is_period_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_period_dtype(dtype)
 
             pidx = pd.period_range("2013-01-01 09:00", periods=5, freq="h")
@@ -619,7 +619,7 @@ def test_construction(self, subtype):
         i = IntervalDtype(subtype, closed="right")
         assert i.subtype == np.dtype("int64")
         msg = "is_interval_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
            assert is_interval_dtype(i)
 
     @pytest.mark.parametrize(
@@ -642,7 +642,7 @@ def test_construction_generic(self, subtype):
         i = IntervalDtype(subtype)
         assert i.subtype is None
         msg = "is_interval_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_interval_dtype(i)
 
     @pytest.mark.parametrize(
@@ -815,7 +815,7 @@ def test_name_repr_generic(self, subtype):
 
     def test_basic(self, dtype):
         msg = "is_interval_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_interval_dtype(dtype)
 
             ii = IntervalIndex.from_breaks(range(3))
@@ -830,7 +830,7 @@ def test_basic(self, dtype):
 
     def test_basic_dtype(self):
         msg = "is_interval_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_interval_dtype("interval[int64, both]")
             assert is_interval_dtype(IntervalIndex.from_tuples([(0, 1)]))
             assert is_interval_dtype(IntervalIndex.from_breaks(np.arange(4)))
@@ -1178,7 +1178,7 @@ def test_is_dtype_no_warning(check):
         or check is is_datetime64tz_dtype
         or check is is_period_dtype
     ):
-        warn = FutureWarning
+        warn = DeprecationWarning
 
     with tm.assert_produces_warning(warn, match=msg):
         check(data)
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index 32c8def669c21..ff2cfc1278331 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -1833,7 +1833,7 @@ def test_is_datetime_dtypes(self):
         assert is_datetime64_any_dtype(ts)
         assert is_datetime64_any_dtype(tsa)
 
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert not is_datetime64tz_dtype("datetime64")
             assert not is_datetime64tz_dtype("datetime64[ns]")
             assert not is_datetime64tz_dtype(ts)
@@ -1845,7 +1845,7 @@ def test_is_datetime_dtypes_with_tz(self, tz):
         assert not is_datetime64_dtype(dtype)
 
         msg = "is_datetime64tz_dtype is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
             assert is_datetime64tz_dtype(dtype)
         assert is_datetime64_ns_dtype(dtype)
         assert is_datetime64_any_dtype(dtype)
is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): assert is_datetime64tz_dtype(dtype) assert is_datetime64_ns_dtype(dtype) assert is_datetime64_any_dtype(dtype) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 5ba65ceaeeada..c7b768f6e3c88 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -59,7 +59,12 @@ def test_check_dtype(self, data): # check equivalency for using .dtypes df = pd.DataFrame( - {"A": pd.Series(data, dtype=dtype), "B": data, "C": "foo", "D": 1} + { + "A": pd.Series(data, dtype=dtype), + "B": data, + "C": pd.Series(["foo"] * len(data), dtype=object), + "D": 1, + } ) result = df.dtypes == str(dtype) assert np.dtype("int64") != "Int64" diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 5c21c4f7137a5..4e8221f67a74d 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -21,7 +21,12 @@ class BaseGroupbyTests: def test_grouping_grouper(self, data_for_grouping): df = pd.DataFrame( - {"A": ["B", "B", None, None, "A", "A", "B", "C"], "B": data_for_grouping} + { + "A": pd.Series( + ["B", "B", None, None, "A", "A", "B", "C"], dtype=object + ), + "B": data_for_grouping, + } ) gr1 = df.groupby("A").grouper.groupings[0] gr2 = df.groupby("B").grouper.groupings[0] diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 40cc952d44200..ffb7a24b4b390 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -44,7 +44,7 @@ def test_dropna_series(self, data_missing): tm.assert_series_equal(result, expected) def test_dropna_frame(self, data_missing): - df = pd.DataFrame({"A": data_missing}) + df = pd.DataFrame({"A": data_missing}, columns=pd.Index(["A"], dtype=object)) # defaults result = df.dropna() diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 40fab5ec11d7d..5cd66d8a874c7 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.core.dtypes.common import is_string_dtype import pandas as pd @@ -27,13 +29,23 @@ def _get_expected_exception( # The self.obj_bar_exc pattern isn't great in part because it can depend # on op_name or dtypes, but we use it here for backward-compatibility. 
if op_name in ["__divmod__", "__rdivmod__"]: - return self.divmod_exc - if isinstance(obj, pd.Series) and isinstance(other, pd.Series): - return self.series_array_exc + result = self.divmod_exc + elif isinstance(obj, pd.Series) and isinstance(other, pd.Series): + result = self.series_array_exc elif isinstance(obj, pd.Series): - return self.series_scalar_exc + result = self.series_scalar_exc else: - return self.frame_scalar_exc + result = self.frame_scalar_exc + + if using_pyarrow_string_dtype() and result is not None: + import pyarrow as pa + + result = ( # type: ignore[assignment] + result, + pa.lib.ArrowNotImplementedError, + NotImplementedError, + ) + return result def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): # In _check_op we check that the result of a pointwise operation diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 067b401ce2f23..187da89729f0e 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -351,11 +351,11 @@ def test_setitem_preserves_views(self, data): def test_setitem_with_expansion_dataframe_column(self, data, full_indexer): # https://github.com/pandas-dev/pandas/issues/32395 - df = expected = pd.DataFrame({"data": pd.Series(data)}) + df = expected = pd.DataFrame({0: pd.Series(data)}) result = pd.DataFrame(index=df.index) key = full_indexer(df) - result.loc[key, "data"] = df["data"] + result.loc[key, 0] = df[0] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9e70a59932701..75e5fb00586e6 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -965,8 +965,16 @@ def _get_arith_xfail_marker(self, opname, pa_dtype): def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): pa_dtype = data.dtype.pyarrow_dtype - if all_arithmetic_operators == "__rmod__" and (pa.types.is_binary(pa_dtype)): + if all_arithmetic_operators == "__rmod__" and pa.types.is_binary(pa_dtype): pytest.skip("Skip testing Python string formatting") + elif all_arithmetic_operators in ("__rmul__", "__mul__") and ( + pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype) + ): + request.applymarker( + pytest.mark.xfail( + raises=TypeError, reason="Can only string multiply by an integer." + ) + ) mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: @@ -981,6 +989,14 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) ): pytest.skip("Skip testing Python string formatting") + elif all_arithmetic_operators in ("__rmul__", "__mul__") and ( + pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype) + ): + request.applymarker( + pytest.mark.xfail( + raises=TypeError, reason="Can only string multiply by an integer." + ) + ) mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: @@ -1004,6 +1020,14 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators, request): ), ) ) + elif all_arithmetic_operators in ("__rmul__", "__mul__") and ( + pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype) + ): + request.applymarker( + pytest.mark.xfail( + raises=TypeError, reason="Can only string multiply by an integer." 
diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
index 5cde5df4bc007..6f33b18b19c51 100644
--- a/pandas/tests/extension/test_categorical.py
+++ b/pandas/tests/extension/test_categorical.py
@@ -18,6 +18,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_pyarrow_string_dtype
+
 import pandas as pd
 from pandas import Categorical
 import pandas._testing as tm
@@ -100,7 +102,9 @@ def test_contains(self, data, data_missing):
             if na_value_obj is na_value:
                 continue
             assert na_value_obj not in data
-            assert na_value_obj in data_missing  # this line differs from super method
+            # this section differs from the super method
+            if not using_pyarrow_string_dtype():
+                assert na_value_obj in data_missing
 
     def test_empty(self, dtype):
         cls = dtype.construct_array_type()
diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
index f1939ea174841..c0692064cfaec 100644
--- a/pandas/tests/extension/test_numpy.py
+++ b/pandas/tests/extension/test_numpy.py
@@ -196,7 +196,7 @@ def test_series_constructor_scalar_with_index(self, data, dtype):
 
 class TestDtype(BaseNumPyTests, base.BaseDtypeTests):
-    def test_check_dtype(self, data, request):
+    def test_check_dtype(self, data, request, using_infer_string):
         if data.dtype.numpy_dtype == "object":
             request.applymarker(
                 pytest.mark.xfail(
@@ -429,7 +429,7 @@ def test_setitem_with_expansion_dataframe_column(self, data, full_indexer):
         if data.dtype.numpy_dtype != object:
             if not isinstance(key, slice) or key != slice(None):
                 expected = pd.DataFrame({"data": data.to_numpy()})
-        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected, check_column_type=False)
 
 
 @skip_nested
diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py
index 845174bbf600e..60a8e688b3b8a 100644
--- a/pandas/tests/frame/constructors/test_from_dict.py
+++ b/pandas/tests/frame/constructors/test_from_dict.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas import (
     DataFrame,
     Index,
@@ -42,6 +44,9 @@ def test_constructor_single_row(self):
         )
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.skipif(
+        using_pyarrow_string_dtype(), reason="columns inferring logic broken"
+    )
     def test_constructor_list_of_series(self):
         data = [
             OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),
diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py
index bcf4e8fb0e64a..3622571f1365d 100644
--- a/pandas/tests/frame/constructors/test_from_records.py
+++ b/pandas/tests/frame/constructors/test_from_records.py
@@ -6,6 +6,8 @@
 import pytest
 import pytz
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas.compat import is_platform_little_endian
 
 from pandas import (
@@ -56,6 +58,9 @@ def test_from_records_with_datetimes(self):
         expected["EXPIRY"] = expected["EXPIRY"].astype("M8[s]")
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.skipif(
+        using_pyarrow_string_dtype(), reason="dtype checking logic doesn't work"
+    )
     def test_from_records_sequencelike(self):
         df = DataFrame(
             {
@@ -80,7 +85,7 @@ def test_from_records_sequencelike(self):
 
         # this is actually tricky to create the recordlike arrays and
         # have the dtypes be intact
-        blocks = df._to_dict_of_blocks(copy=False)
+        blocks = df._to_dict_of_blocks()
         tuples = []
         columns = []
         dtypes = []
@@ -169,7 +174,7 @@ def test_from_records_dictlike(self):
 
         # columns is in a different order here than the actual items iterated
         # from the dict
-        blocks = df._to_dict_of_blocks(copy=False)
+        blocks = df._to_dict_of_blocks()
         columns = []
         for b in blocks.values():
             columns.extend(b.columns)
diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py
index 8502f98df5962..a36b0c0e850b3 100644
--- a/pandas/tests/frame/indexing/test_getitem.py
+++ b/pandas/tests/frame/indexing/test_getitem.py
@@ -103,7 +103,7 @@ def test_getitem_list_duplicates(self):
 
     def test_getitem_dupe_cols(self):
         df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])
-        msg = "\"None of [Index(['baf'], dtype='object')] are in the [columns]\""
+        msg = "\"None of [Index(['baf'], dtype="
         with pytest.raises(KeyError, match=re.escape(msg)):
             df[["baf"]]
 
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
index dfb4a3092789a..4be5be77b015c 100644
--- a/pandas/tests/frame/indexing/test_indexing.py
+++ b/pandas/tests/frame/indexing/test_indexing.py
@@ -288,7 +288,9 @@ def test_setattr_column(self):
         df.foobar = 5
         assert (df.foobar == 5).all()
 
-    def test_setitem(self, float_frame, using_copy_on_write, warn_copy_on_write):
+    def test_setitem(
+        self, float_frame, using_copy_on_write, warn_copy_on_write, using_infer_string
+    ):
         # not sure what else to do here
         series = float_frame["A"][::2]
         float_frame["col5"] = series
@@ -331,7 +333,10 @@ def test_setitem(self, float_frame, using_copy_on_write, warn_copy_on_write):
 
         with pytest.raises(SettingWithCopyError, match=msg):
             smaller["col10"] = ["1", "2"]
-        assert smaller["col10"].dtype == np.object_
+        if using_infer_string:
+            assert smaller["col10"].dtype == "string"
+        else:
+            assert smaller["col10"].dtype == np.object_
         assert (smaller["col10"] == ["1", "2"]).all()
 
     def test_setitem2(self):
@@ -426,7 +431,7 @@ def test_setitem_cast(self, float_frame):
         float_frame["something"] = 2.5
         assert float_frame["something"].dtype == np.float64
 
-    def test_setitem_corner(self, float_frame):
+    def test_setitem_corner(self, float_frame, using_infer_string):
         # corner case
         df = DataFrame({"B": [1.0, 2.0, 3.0], "C": ["a", "b", "c"]}, index=np.arange(3))
         del df["B"]
@@ -463,10 +468,16 @@ def test_setitem_corner(self, float_frame):
         dm["foo"] = "bar"
         del dm["foo"]
         dm["foo"] = "bar"
-        assert dm["foo"].dtype == np.object_
+        if using_infer_string:
+            assert dm["foo"].dtype == "string"
+        else:
+            assert dm["foo"].dtype == np.object_
 
         dm["coercible"] = ["1", "2", "3"]
-        assert dm["coercible"].dtype == np.object_
+        if using_infer_string:
+            assert dm["coercible"].dtype == "string"
+        else:
+            assert dm["coercible"].dtype == np.object_
 
     def test_setitem_corner2(self):
         data = {
@@ -483,7 +494,7 @@ def test_setitem_corner2(self):
         assert df.loc[1, "title"] == "foobar"
         assert df.loc[1, "cruft"] == 0
 
-    def test_setitem_ambig(self):
+    def test_setitem_ambig(self, using_infer_string):
         # Difficulties with mixed-type data
         # Created as float type
         dm = DataFrame(index=range(3), columns=range(3))
@@ -499,18 +510,22 @@ def test_setitem_ambig(self):
 
         dm[2] = uncoercable_series
         assert len(dm.columns) == 3
-        assert dm[2].dtype == np.object_
+        if using_infer_string:
+            assert dm[2].dtype == "string"
+        else:
+            assert dm[2].dtype == np.object_
 
-    def test_setitem_None(self, float_frame):
+    def test_setitem_None(self, float_frame, using_infer_string):
         # GH #766
         float_frame[None] = float_frame["A"]
+        key = None if not using_infer_string else np.nan
         tm.assert_series_equal(
             float_frame.iloc[:, -1], float_frame["A"], check_names=False
         )
         tm.assert_series_equal(
-            float_frame.loc[:, None], float_frame["A"], check_names=False
+            float_frame.loc[:, key], float_frame["A"], check_names=False
         )
-        tm.assert_series_equal(float_frame[None], float_frame["A"], check_names=False)
+        tm.assert_series_equal(float_frame[key], float_frame["A"], check_names=False)
 
     def test_loc_setitem_boolean_mask_allfalse(self):
         # GH 9596
@@ -1397,9 +1412,7 @@ def test_iloc_setitem_enlarge_no_warning(self, warn_copy_on_write):
         df = DataFrame(columns=["a", "b"])
         expected = df.copy()
         view = df[:]
-        # TODO(CoW-warn) false positive: shouldn't warn in case of enlargement?
-        with tm.assert_produces_warning(FutureWarning if warn_copy_on_write else None):
-            df.iloc[:, 0] = np.array([1, 2], dtype=np.float64)
+        df.iloc[:, 0] = np.array([1, 2], dtype=np.float64)
         tm.assert_frame_equal(view, expected)
 
     def test_loc_internals_not_updated_correctly(self):
diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py
index 32312868adacb..1e3c793c8449f 100644
--- a/pandas/tests/frame/indexing/test_set_value.py
+++ b/pandas/tests/frame/indexing/test_set_value.py
@@ -16,7 +16,7 @@ def test_set_value(self, float_frame):
                 float_frame._set_value(idx, col, 1)
                 assert float_frame[col][idx] == 1
 
-    def test_set_value_resize(self, float_frame):
+    def test_set_value_resize(self, float_frame, using_infer_string):
         res = float_frame._set_value("foobar", "B", 0)
         assert res is None
         assert float_frame.index[-1] == "foobar"
@@ -27,8 +27,10 @@ def test_set_value_resize(self, float_frame):
 
         res = float_frame.copy()
         res._set_value("foobar", "baz", "sam")
-        assert res["baz"].dtype == np.object_
-
+        if using_infer_string:
+            assert res["baz"].dtype == "string"
+        else:
+            assert res["baz"].dtype == np.object_
         res = float_frame.copy()
         with tm.assert_produces_warning(
             FutureWarning, match="Setting an item of incompatible dtype"
diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
index c0ba2f245efed..e802a56ecbc81 100644
--- a/pandas/tests/frame/indexing/test_setitem.py
+++ b/pandas/tests/frame/indexing/test_setitem.py
@@ -786,6 +786,24 @@ def test_loc_setitem_ea_dtype(self):
         df.iloc[:, 0] = Series([11], dtype="Int64")
         tm.assert_frame_equal(df, expected)
 
+    def test_setitem_object_inferring(self):
+        # GH#56102
+        idx = Index([Timestamp("2019-12-31")], dtype=object)
+        df = DataFrame({"a": [1]})
+        with tm.assert_produces_warning(FutureWarning, match="infer"):
+            df.loc[:, "b"] = idx
+        with tm.assert_produces_warning(FutureWarning, match="infer"):
+            df["c"] = idx
+
+        expected = DataFrame(
+            {
+                "a": [1],
+                "b": Series([Timestamp("2019-12-31")], dtype="datetime64[ns]"),
+                "c": Series([Timestamp("2019-12-31")], dtype="datetime64[ns]"),
+            }
+        )
+        tm.assert_frame_equal(df, expected)
+
 
 class TestSetitemTZAwareValues:
     @pytest.fixture
@@ -1319,7 +1337,7 @@ def test_setitem_column_frame_as_category(self):
         df["col2"] = Series([1, 2, 3], dtype="category")
 
         expected_types = Series(
-            ["int64", "category", "category"], index=[0, "col1", "col2"]
+            ["int64", "category", "category"], index=[0, "col1", "col2"], dtype=object
        )
         tm.assert_series_equal(df.dtypes, expected_types)
 
diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py
index 103ec67951a01..3d36d0471f02f 100644
--- a/pandas/tests/frame/indexing/test_where.py
+++ b/pandas/tests/frame/indexing/test_where.py
@@ -1077,9 +1077,13 @@ def test_where_producing_ea_cond_for_np_dtype():
 @pytest.mark.parametrize(
     "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)]
 )
-def test_where_int_overflow(replacement):
+def test_where_int_overflow(replacement, using_infer_string, request):
     # GH 31687
     df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]])
+    if using_infer_string and replacement not in (None, "snake"):
+        request.node.add_marker(
+            pytest.mark.xfail(reason="Can't set non-string into string column")
+        )
     result = df.where(pd.notnull(df), replacement)
     expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]])
 
diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py
index 312d6f6d37dde..5a9c47866dae8 100644
--- a/pandas/tests/frame/methods/test_align.py
+++ b/pandas/tests/frame/methods/test_align.py
@@ -107,7 +107,7 @@ def test_align_float(self, float_frame, using_copy_on_write):
         af, bf = float_frame.align(
             other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None
         )
-        tm.assert_index_equal(bf.index, Index([]))
+        tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype))
 
         msg = (
             "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
@@ -117,7 +117,7 @@ def test_align_float(self, float_frame, using_copy_on_write):
         af, bf = float_frame.align(
             other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0
         )
-        tm.assert_index_equal(bf.index, Index([]))
+        tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype))
 
         # Try to align DataFrame to Series along bad axis
         msg = "No axis named 2 for object type DataFrame"
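The new `test_setitem_object_inferring` covers the GH#56102 deprecation: setting an object-dtype array into a frame currently infers a better dtype, and pandas 2.2 warns that this inference will stop. Sketch of the behavior (message text is only matched on "infer" above, so treat the exact wording as unverified):

import pandas as pd

idx = pd.Index([pd.Timestamp("2019-12-31")], dtype=object)
df = pd.DataFrame({"a": [1]})
df["c"] = idx  # pandas 2.2 emits a FutureWarning about dtype inference
print(df["c"].dtype)  # datetime64[ns] today; will stay object in the future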
diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
index 2578dfb622fbf..5a1e3cd786f84 100644
--- a/pandas/tests/frame/methods/test_astype.py
+++ b/pandas/tests/frame/methods/test_astype.py
@@ -166,7 +166,8 @@ def test_astype_str(self):
                 "c": [Timedelta(x)._repr_base() for x in c._values],
                 "d": list(map(str, d._values)),
                 "e": list(map(str, e._values)),
-            }
+            },
+            dtype="object",
         )
 
         tm.assert_frame_equal(result, expected)
@@ -174,13 +175,13 @@ def test_astype_str(self):
     def test_astype_str_float(self):
         # see GH#11302
         result = DataFrame([np.nan]).astype(str)
-        expected = DataFrame(["nan"])
+        expected = DataFrame(["nan"], dtype="object")
 
         tm.assert_frame_equal(result, expected)
         result = DataFrame([1.12345678901234567890]).astype(str)
 
         val = "1.1234567890123457"
-        expected = DataFrame([val])
+        expected = DataFrame([val], dtype="object")
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("dtype_class", [dict, Series])
@@ -199,7 +200,7 @@ def test_astype_dict_like(self, dtype_class):
         expected = DataFrame(
             {
                 "a": a,
-                "b": Series(["0", "1", "2", "3", "4"]),
+                "b": Series(["0", "1", "2", "3", "4"], dtype="object"),
                 "c": c,
                 "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
             }
@@ -282,7 +283,7 @@ def test_astype_duplicate_col_series_arg(self):
         result = df.astype(dtypes)
         expected = DataFrame(
             {
                0: Series(vals[:, 0].astype(str), dtype=object),
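All of these `dtype="object"` pins exist because `astype(str)` classically yields object-dtype columns, while under infer_string the constructor used to build `expected` would otherwise produce string dtype and the comparison would fail spuriously. Illustration (default options):

import numpy as np
import pandas as pd

result = pd.DataFrame([np.nan]).astype(str)
print(result[0].dtype)  # object today; pinning dtype="object" keeps the
                        # expected frame stable once infer_string is default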
                1: vals[:, 1],
                 2: pd.array(vals[:, 2], dtype="Float64"),
                 3: vals[:, 3],
@@ -620,6 +621,7 @@ def test_astype_arg_for_errors_dictlist(self):
                 {"a": 2.2, "b": "15.3", "c": "another_test"},
             ]
         )
+        expected["c"] = expected["c"].astype("object")
         type_dict = {"a": "float64", "b": "float64", "c": "object"}
 
         result = df.astype(dtype=type_dict, errors="ignore")
@@ -680,6 +682,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame):
                 ],
             ],
             columns=timezone_frame.columns,
+            dtype="object",
         )
         tm.assert_frame_equal(result, expected)
 
@@ -754,7 +757,9 @@ def test_astype_tz_object_conversion(self, tz):
         result = result.astype({"tz": "datetime64[ns, Europe/London]"})
         tm.assert_frame_equal(result, expected)
 
-    def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture):
+    def test_astype_dt64_to_string(
+        self, frame_or_series, tz_naive_fixture, using_infer_string
+    ):
         # GH#41409
         tz = tz_naive_fixture
 
@@ -772,7 +777,10 @@ def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture):
         item = result.iloc[0]
         if frame_or_series is DataFrame:
             item = item.iloc[0]
-        assert item is pd.NA
+        if using_infer_string:
+            assert item is np.nan
+        else:
+            assert item is pd.NA
 
         # For non-NA values, we should match what we get for non-EA str
         alt = obj.astype(str)
diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py
index 0335279b3a123..8aeab5dacd8b4 100644
--- a/pandas/tests/frame/methods/test_combine_first.py
+++ b/pandas/tests/frame/methods/test_combine_first.py
@@ -30,7 +30,7 @@ def test_combine_first_mixed(self):
         combined = f.combine_first(g)
         tm.assert_frame_equal(combined, exp)
 
-    def test_combine_first(self, float_frame):
+    def test_combine_first(self, float_frame, using_infer_string):
         # disjoint
         head, tail = float_frame[:5], float_frame[5:]
 
@@ -76,11 +76,13 @@ def test_combine_first(self, float_frame):
         tm.assert_series_equal(combined["A"].reindex(g.index), g["A"])
 
         # corner cases
-        comb = float_frame.combine_first(DataFrame())
+        warning = FutureWarning if using_infer_string else None
+        with tm.assert_produces_warning(warning, match="empty entries"):
+            comb = float_frame.combine_first(DataFrame())
         tm.assert_frame_equal(comb, float_frame)
 
         comb = DataFrame().combine_first(float_frame)
-        tm.assert_frame_equal(comb, float_frame)
+        tm.assert_frame_equal(comb, float_frame.sort_index())
 
         comb = float_frame.combine_first(DataFrame(index=["faz", "boo"]))
         assert "faz" in comb.index
diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py
index 4c371afcc4e00..a181a271181ca 100644
--- a/pandas/tests/frame/methods/test_convert_dtypes.py
+++ b/pandas/tests/frame/methods/test_convert_dtypes.py
@@ -11,9 +11,13 @@ class TestConvertDtypes:
     @pytest.mark.parametrize(
         "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
     )
-    def test_convert_dtypes(self, convert_integer, expected, string_storage):
+    def test_convert_dtypes(
+        self, convert_integer, expected, string_storage, using_infer_string
+    ):
         # Specific types are tested in tests/series/test_dtypes.py
         # Just check that it works for DataFrame here
+        if using_infer_string:
+            string_storage = "pyarrow_numpy"
         df = pd.DataFrame(
             {
                 "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
index 108816697ef3e..04a08c8b9bc52 100644
--- a/pandas/tests/frame/methods/test_cov_corr.py
+++ b/pandas/tests/frame/methods/test_cov_corr.py
@@ -326,7 +326,7 @@ def test_corrwith(self, datetime_frame, dtype):
         for row in index[:4]:
             tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row]))
 
-    def test_corrwith_with_objects(self):
+    def test_corrwith_with_objects(self, using_infer_string):
         df1 = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
             columns=Index(list("ABCD"), dtype=object),
@@ -338,8 +338,14 @@ def test_corrwith_with_objects(self):
         df1["obj"] = "foo"
         df2["obj"] = "bar"
 
-        with pytest.raises(TypeError, match="Could not convert"):
-            df1.corrwith(df2)
+        if using_infer_string:
+            import pyarrow as pa
+
+            with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"):
+                df1.corrwith(df2)
+        else:
+            with pytest.raises(TypeError, match="Could not convert"):
+                df1.corrwith(df2)
         result = df1.corrwith(df2, numeric_only=True)
         expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
         tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py
index f72c0594fa1f7..06cd51b43a0aa 100644
--- a/pandas/tests/frame/methods/test_drop.py
+++ b/pandas/tests/frame/methods/test_drop.py
@@ -510,7 +510,7 @@ def test_drop_with_duplicate_columns2(self):
 
     def test_drop_inplace_no_leftover_column_reference(self):
         # GH 13934
-        df = DataFrame({"a": [1, 2, 3]})
+        df = DataFrame({"a": [1, 2, 3]}, columns=Index(["a"], dtype="object"))
         a = df.a
         df.drop(["a"], axis=1, inplace=True)
         tm.assert_index_equal(df.columns, Index([], dtype="object"))
diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py
index df12139258a6d..6bea97b2cf189 100644
--- a/pandas/tests/frame/methods/test_drop_duplicates.py
+++ b/pandas/tests/frame/methods/test_drop_duplicates.py
@@ -16,7 +16,7 @@ def test_drop_duplicates_with_misspelled_column_name(subset):
     # GH 19730
     df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
-    msg = re.escape("Index(['a'], dtype='object')")
+    msg = re.escape("Index(['a'], dtype=")
 
     with pytest.raises(KeyError, match=msg):
         df.drop_duplicates(subset)
diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py
index 4bdf16977dae6..ab632ac17318e 100644
--- a/pandas/tests/frame/methods/test_dtypes.py
+++ b/pandas/tests/frame/methods/test_dtypes.py
@@ -142,9 +142,12 @@ def test_dtypes_timedeltas(self):
         )
         tm.assert_series_equal(result, expected)
 
-    def test_frame_apply_np_array_return_type(self):
+    def test_frame_apply_np_array_return_type(self, using_infer_string):
         # GH 35517
         df = DataFrame([["foo"]])
         result = df.apply(lambda col: np.array("bar"))
-        expected = Series(["bar"])
+        if using_infer_string:
+            expected = Series([np.array(["bar"])])
+        else:
+            expected = Series(["bar"])
         tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_duplicated.py b/pandas/tests/frame/methods/test_duplicated.py
index 788aede805110..6052b61ea8db5 100644
--- a/pandas/tests/frame/methods/test_duplicated.py
+++ b/pandas/tests/frame/methods/test_duplicated.py
@@ -16,7 +16,7 @@ def test_duplicated_with_misspelled_column_name(subset):
     # GH 19730
     df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
-    msg = re.escape("Index(['a'], dtype='object')")
+    msg = re.escape("Index(['a'], dtype=")
 
     with pytest.raises(KeyError, match=msg):
         df.duplicated(subset)
diff --git a/pandas/tests/frame/methods/test_equals.py b/pandas/tests/frame/methods/test_equals.py
index 6fcf670f96ef0..d0b9d96cafa0d 100644
--- a/pandas/tests/frame/methods/test_equals.py
+++ b/pandas/tests/frame/methods/test_equals.py
@@ -14,11 +14,11 @@ def test_dataframe_not_equal(self):
         df2 = DataFrame({"a": ["s", "d"], "b": [1, 2]})
         assert df1.equals(df2) is False
 
-    def test_equals_different_blocks(self, using_array_manager):
+    def test_equals_different_blocks(self, using_array_manager, using_infer_string):
         # GH#9330
         df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]})
         df1 = df0.reset_index()[["A", "B", "C"]]
-        if not using_array_manager:
+        if not using_array_manager and not using_infer_string:
             # this assert verifies that the above operations have
             # induced a block rearrangement
             assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype
diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py
index d1e4a603c5710..5cd54db62d783 100644
--- a/pandas/tests/frame/methods/test_explode.py
+++ b/pandas/tests/frame/methods/test_explode.py
@@ -203,7 +203,7 @@ def test_usecase():
 )
 def test_duplicate_index(input_dict, input_index, expected_dict, expected_index):
     # GH 28005
-    df = pd.DataFrame(input_dict, index=input_index)
+    df = pd.DataFrame(input_dict, index=input_index, dtype=object)
     result = df.explode("col1")
     expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object)
     tm.assert_frame_equal(result, expected)
pd.DataFrame(expected_dict, index=expected_index, dtype=object) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 960f05a6457a4..1403a45a5cccd 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -89,6 +91,7 @@ def test_fillna_datetime(self, datetime_frame): with pytest.raises(ValueError, match=msg): datetime_frame.fillna(5, method="ffill") + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") def test_fillna_mixed_type(self, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan @@ -122,19 +125,27 @@ def test_fillna_empty(self, using_copy_on_write): df.x.fillna(method=m, inplace=True) df.x.fillna(method=m) - def test_fillna_different_dtype(self): + def test_fillna_different_dtype(self, using_infer_string): # with different dtype (GH#3386) df = DataFrame( [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] ) - result = df.fillna({2: "foo"}) + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.fillna({2: "foo"}) + else: + result = df.fillna({2: "foo"}) expected = DataFrame( [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] ) tm.assert_frame_equal(result, expected) - return_value = df.fillna({2: "foo"}, inplace=True) + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + return_value = df.fillna({2: "foo"}, inplace=True) + else: + return_value = df.fillna({2: "foo"}, inplace=True) tm.assert_frame_equal(df, expected) assert return_value is None @@ -358,7 +369,7 @@ def test_fillna_dictlike_value_duplicate_colnames(self, columns): expected["A"] = 0.0 tm.assert_frame_equal(result, expected) - def test_fillna_dtype_conversion(self): + def test_fillna_dtype_conversion(self, using_infer_string): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) result = df.dtypes @@ -373,7 +384,11 @@ def test_fillna_dtype_conversion(self): # empty block df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") - result = df.fillna("nan") + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.fillna("nan") + else: + result = df.fillna("nan") expected = DataFrame("nan", index=range(3), columns=["A", "B"]) tm.assert_frame_equal(result, expected) @@ -649,6 +664,7 @@ def test_fillna_col_reordering(self): filled = df.fillna(method="ffill") assert df.columns.tolist() == filled.columns.tolist() + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") def test_fill_corner(self, float_frame, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan diff --git a/pandas/tests/frame/methods/test_get_numeric_data.py b/pandas/tests/frame/methods/test_get_numeric_data.py index ec1c768603a59..c5d32d56d03c1 100644 --- a/pandas/tests/frame/methods/test_get_numeric_data.py +++ b/pandas/tests/frame/methods/test_get_numeric_data.py @@ -15,12 +15,12 @@ class TestGetNumericData: def test_get_numeric_data_preserve_dtype(self): # get the numeric data - obj = DataFrame({"A": [1, "2", 3.0]}) + obj = DataFrame({"A": [1, "2", 3.0]}, columns=Index(["A"], 
dtype="object")) result = obj._get_numeric_data() expected = DataFrame(dtype=object, index=pd.RangeIndex(3), columns=[]) tm.assert_frame_equal(result, expected) - def test_get_numeric_data(self): + def test_get_numeric_data(self, using_infer_string): datetime64name = np.dtype("M8[s]").name objectname = np.dtype(np.object_).name @@ -33,7 +33,7 @@ def test_get_numeric_data(self): [ np.dtype("float64"), np.dtype("int64"), - np.dtype(objectname), + np.dtype(objectname) if not using_infer_string else "string", np.dtype(datetime64name), ], index=["a", "b", "c", "f"], diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 5f37ed6d9e18a..e0641fcb65bd3 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.errors import ChainedAssignmentError import pandas.util._test_decorators as td @@ -67,6 +69,9 @@ def test_interpolate_inplace(self, frame_or_series, using_array_manager, request assert np.shares_memory(orig, obj.values) assert orig.squeeze()[1] == 1.5 + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + ) def test_interp_basic(self, using_copy_on_write): df = DataFrame( { @@ -108,7 +113,10 @@ def test_interp_basic(self, using_copy_on_write): assert np.shares_memory(df["C"]._values, cvalues) assert np.shares_memory(df["D"]._values, dvalues) - def test_interp_basic_with_non_range_index(self): + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + ) + def test_interp_basic_with_non_range_index(self, using_infer_string): df = DataFrame( { "A": [1, 2, np.nan, 4], @@ -119,7 +127,8 @@ def test_interp_basic_with_non_range_index(self): ) msg = "DataFrame.interpolate with object dtype" - with tm.assert_produces_warning(FutureWarning, match=msg): + warning = FutureWarning if not using_infer_string else None + with tm.assert_produces_warning(warning, match=msg): result = df.set_index("C").interpolate() expected = df.set_index("C") expected.loc[3, "A"] = 3 diff --git a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py index a5f285d31301b..1fe28cb8eb856 100644 --- a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py +++ b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py @@ -25,7 +25,8 @@ { "A": np.array([1, 2], dtype=object), "B": np.array(["a", "b"], dtype=object), - } + }, + dtype="object", ), True, ), diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 1196f8cd3886a..3ba893501914a 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -86,7 +86,7 @@ def test_nlargest_n(self, df_strings, nselect_method, n, order): df = df_strings if "b" in order: error_msg = ( - f"Column 'b' has dtype object, " + f"Column 'b' has dtype (object|string), " f"cannot use method '{nselect_method}' with this dtype" ) with pytest.raises(TypeError, match=error_msg): diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index b5b5e42691e59..8d7a0b373f5f8 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -13,6 +13,7 @@ from pandas import ( DataFrame, + Index, Series, ) import pandas._testing as tm @@ -469,21 +470,28 @@ def 
test_rank_inf_nans_na_option( ("top", False, [2.0, 3.0, 1.0, 4.0]), ], ) - def test_rank_object_first(self, frame_or_series, na_option, ascending, expected): + def test_rank_object_first( + self, frame_or_series, na_option, ascending, expected, using_infer_string + ): obj = frame_or_series(["foo", "foo", None, "foo"]) result = obj.rank(method="first", na_option=na_option, ascending=ascending) expected = frame_or_series(expected) + if using_infer_string and isinstance(obj, Series): + expected = expected.astype("uint64") tm.assert_equal(result, expected) @pytest.mark.parametrize( "data,expected", [ - ({"a": [1, 2, "a"], "b": [4, 5, 6]}, DataFrame({"b": [1.0, 2.0, 3.0]})), + ( + {"a": [1, 2, "a"], "b": [4, 5, 6]}, + DataFrame({"b": [1.0, 2.0, 3.0]}, columns=Index(["b"], dtype=object)), + ), ({"a": [1, 2, "a"]}, DataFrame(index=range(3), columns=[])), ], ) def test_rank_mixed_axis_zero(self, data, expected): - df = DataFrame(data) + df = DataFrame(data, columns=Index(list(data.keys()), dtype=object)) with pytest.raises(TypeError, match="'<' not supported between instances of"): df.rank() result = df.rank(numeric_only=True) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index d0d971e29204a..d862e14ce86cb 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -120,7 +120,7 @@ def test_reindex_timestamp_with_fold(self, timezone, year, month, day, hour): exp = DataFrame({"index": ["1", "2"], "vals": [np.nan, np.nan]}).set_index( "index" ) - exp = exp.astype(object) + exp = exp.astype(df.vals.dtype) tm.assert_frame_equal( df, exp, @@ -840,8 +840,8 @@ def test_reindex_fill_value(self): # other dtypes df["foo"] = "foo" - result = df.reindex(range(15), fill_value=0) - expected = df.reindex(range(15)).fillna(0) + result = df.reindex(range(15), fill_value="0") + expected = df.reindex(range(15)).fillna("0") tm.assert_frame_equal(result, expected) def test_reindex_uint_dtypes_fill_value(self, any_unsigned_int_numpy_dtype): diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 13e2c1a249ac2..8bfa98042eb07 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -28,6 +30,9 @@ def mix_abc() -> dict[str, list[float | str]]: class TestDataFrameReplace: + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_inplace(self, datetime_frame, float_string_frame): datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan @@ -278,14 +283,25 @@ def test_regex_replace_dict_nested(self, mix_abc): tm.assert_frame_equal(res3, expec) tm.assert_frame_equal(res4, expec) - def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype): + def test_regex_replace_dict_nested_non_first_character( + self, any_string_dtype, using_infer_string + ): # GH 25259 dtype = any_string_dtype df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) - result = df.replace({"a": "."}, regex=True) + if using_infer_string and any_string_dtype == "object": + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.replace({"a": "."}, regex=True) + expected 
= DataFrame({"first": [".bc", "bc.", "c.b"]}) + + else: + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_dict_nested_gh4115(self): df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) @@ -294,6 +310,9 @@ def test_regex_replace_dict_nested_gh4115(self): result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_list_to_scalar(self, mix_abc): df = DataFrame(mix_abc) expec = DataFrame( @@ -322,6 +341,9 @@ def test_regex_replace_list_to_scalar(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_str_to_numeric(self, mix_abc): # what happens when you try to replace a numeric value with a regex? df = DataFrame(mix_abc) @@ -337,6 +359,9 @@ def test_regex_replace_str_to_numeric(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) @@ -415,12 +440,31 @@ def test_replace_regex_metachar(self, metachar): ], ) def test_regex_replace_string_types( - self, data, to_replace, expected, frame_or_series, any_string_dtype + self, + data, + to_replace, + expected, + frame_or_series, + any_string_dtype, + using_infer_string, + request, ): # GH-41333, GH-35977 dtype = any_string_dtype obj = frame_or_series(data, dtype=dtype) - result = obj.replace(to_replace, regex=True) + if using_infer_string and any_string_dtype == "object": + if len(to_replace) > 1 and isinstance(obj, DataFrame): + request.node.add_marker( + pytest.mark.xfail( + reason="object input array that gets downcasted raises on " + "second pass" + ) + ) + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = obj.replace(to_replace, regex=True) + dtype = "string[pyarrow_numpy]" + else: + result = obj.replace(to_replace, regex=True) expected = frame_or_series(expected, dtype=dtype) tm.assert_equal(result, expected) @@ -522,6 +566,9 @@ def test_replace_series_dict(self): result = df.replace(s, df.mean()) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_convert(self): # gh 3907 df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) @@ -533,6 +580,9 @@ def test_replace_convert(self): res = rep.dtypes tm.assert_series_equal(expec, res) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_mixed(self, float_string_frame): mf = float_string_frame mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan @@ -588,7 +638,7 @@ def test_replace_mixed_int_block_splitting(self): result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) - def test_replace_mixed2(self): + def test_replace_mixed2(self, using_infer_string): # to object block upcasting df = DataFrame( { @@ -607,11 +657,15 @@ def test_replace_mixed2(self): expected = 
DataFrame( { - "A": Series(["foo", "bar"], dtype="object"), + "A": Series(["foo", "bar"]), "B": Series([0, "foo"], dtype="object"), } ) - result = df.replace([1, 2], ["foo", "bar"]) + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.replace([1, 2], ["foo", "bar"]) + else: + result = df.replace([1, 2], ["foo", "bar"]) tm.assert_frame_equal(result, expected) def test_replace_mixed3(self): @@ -892,6 +946,9 @@ def test_replace_input_formats_listlike(self): with pytest.raises(ValueError, match=msg): df.replace(to_rep, values[1:]) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_input_formats_scalar(self): df = DataFrame( {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} @@ -920,6 +977,9 @@ def test_replace_limit(self): # TODO pass + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_dict_no_regex(self): answer = Series( { @@ -943,6 +1003,9 @@ def test_replace_dict_no_regex(self): result = answer.replace(weights) tm.assert_series_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_series_no_regex(self): answer = Series( { @@ -1049,7 +1112,10 @@ def test_nested_dict_overlapping_keys_replace_str(self): expected = df.replace({"a": dict(zip(astr, bstr))}) tm.assert_frame_equal(result, expected) - def test_replace_swapping_bug(self): + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def test_replace_swapping_bug(self, using_infer_string): df = DataFrame({"a": [True, False, True]}) res = df.replace({"a": {True: "Y", False: "N"}}) expect = DataFrame({"a": ["Y", "N", "Y"]}) @@ -1060,6 +1126,9 @@ def test_replace_swapping_bug(self): expect = DataFrame({"a": ["Y", "N", "Y"]}) tm.assert_frame_equal(res, expect) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_period(self): d = { "fname": { @@ -1096,6 +1165,9 @@ def test_replace_period(self): result = df.replace(d) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_datetime(self): d = { "fname": { @@ -1279,7 +1351,9 @@ def test_categorical_replace_with_dict(self, replace_dict, final_data): b = pd.Categorical(final_data[:, 1], categories=ex_cat) expected = DataFrame({"a": a, "b": b}) - result = df.replace(replace_dict, 3) + msg2 = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg2): + result = df.replace(replace_dict, 3) tm.assert_frame_equal(result, expected) msg = ( r"Attributes of DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " @@ -1288,7 +1362,8 @@ def test_categorical_replace_with_dict(self, replace_dict, final_data): with pytest.raises(AssertionError, match=msg): # ensure non-inplace call does not affect original tm.assert_frame_equal(df, expected) - return_value = df.replace(replace_dict, 3, inplace=True) + with tm.assert_produces_warning(FutureWarning, match=msg2): + return_value = df.replace(replace_dict, 3, inplace=True) assert return_value is None tm.assert_frame_equal(df, expected) @@ -1318,6 +1393,9 @@ def test_replace_commutative(self, df, to_replace, exp): result = df.replace(to_replace) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into 
string" + ) @pytest.mark.parametrize( "replacer", [ @@ -1438,9 +1516,14 @@ def test_replace_value_category_type(self): ) # replace values in input dataframe - input_df = input_df.replace("d", "z") - input_df = input_df.replace("obj1", "obj9") - result = input_df.replace("cat2", "catX") + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + input_df = input_df.replace("d", "z") + input_df = input_df.replace("obj1", "obj9") + result = input_df.replace("cat2", "catX") tm.assert_frame_equal(result, expected) @@ -1466,7 +1549,12 @@ def test_replace_dict_category_type(self): ) # replace values in input dataframe using a dict - result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) tm.assert_frame_equal(result, expected) @@ -1478,10 +1566,12 @@ def test_replace_with_compiled_regex(self): expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) - def test_replace_intervals(self): + def test_replace_intervals(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/35931 df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) - result = df.replace({"a": {pd.Interval(0, 1): "x"}}) + warning = FutureWarning if using_infer_string else None + with tm.assert_produces_warning(warning, match="Downcasting"): + result = df.replace({"a": {pd.Interval(0, 1): "x"}}) expected = DataFrame({"a": ["x", "x"]}) tm.assert_frame_equal(result, expected) @@ -1582,6 +1672,9 @@ def test_regex_replace_scalar( expected.loc[expected["a"] == ".", "a"] = expected_replace_val tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_frame(self, regex): # GH-48644 @@ -1619,9 +1712,15 @@ def test_replace_categorical_no_replacement(self): result = df.replace(to_replace=[".", "def"], value=["_", None]) tm.assert_frame_equal(result, expected) - def test_replace_object_splitting(self): + def test_replace_object_splitting(self, using_infer_string): # GH#53977 df = DataFrame({"a": ["a"], "b": "b"}) - assert len(df._mgr.blocks) == 1 + if using_infer_string: + assert len(df._mgr.blocks) == 2 + else: + assert len(df._mgr.blocks) == 1 df.replace(to_replace=r"^\s*$", value="", inplace=True, regex=True) - assert len(df._mgr.blocks) == 1 + if using_infer_string: + assert len(df._mgr.blocks) == 2 + else: + assert len(df._mgr.blocks) == 1 diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 20f0dcc816408..fbf36dbc4fb02 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -654,10 +654,14 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes): ), ], ) -def test_reset_index_dtypes_on_empty_frame_with_multiindex(array, dtype): +def test_reset_index_dtypes_on_empty_frame_with_multiindex( + array, dtype, using_infer_string +): # GH 19602 - Preserve dtype on empty DataFrame with MultiIndex idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = DataFrame(index=idx)[:0].reset_index().dtypes + if using_infer_string and dtype == object: + dtype 
= "string" expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) tm.assert_series_equal(result, expected) @@ -676,7 +680,9 @@ def test_reset_index_empty_frame_with_datetime64_multiindex(): tm.assert_frame_equal(result, expected) -def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(): +def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby( + using_infer_string, +): # https://github.com/pandas-dev/pandas/issues/35657 dti = pd.DatetimeIndex(["2020-01-01"], dtype="M8[ns]") df = DataFrame({"c1": [10.0], "c2": ["a"], "c3": dti}) @@ -687,6 +693,8 @@ def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(): ) expected["c3"] = expected["c3"].astype("datetime64[ns]") expected["c1"] = expected["c1"].astype("float64") + if using_infer_string: + expected["c2"] = expected["c2"].astype("string[pyarrow_numpy]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index e2759c5d5b7b7..47c479faed1ef 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -282,7 +282,7 @@ def test_select_dtypes_duplicate_columns(self): result = df.select_dtypes(include=[np.number], exclude=["floating"]) tm.assert_frame_equal(result, expected) - def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): + def test_select_dtypes_not_an_attr_but_still_valid_dtype(self, using_infer_string): df = DataFrame( { "a": list("abc"), @@ -296,11 +296,17 @@ def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): df["g"] = df.f.diff() assert not hasattr(np, "u8") r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"]) - e = df[["a", "b"]] + if using_infer_string: + e = df[["b"]] + else: + e = df[["a", "b"]] tm.assert_frame_equal(r, e) r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"]) - e = df[["a", "b", "g"]] + if using_infer_string: + e = df[["b", "g"]] + else: + e = df[["a", "b", "g"]] tm.assert_frame_equal(r, e) def test_select_dtypes_empty(self): diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 97fbe597d1dab..250567eafc670 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -682,7 +682,7 @@ def _make_frame(names=None): tm.assert_index_equal(recons.columns, exp.columns) assert len(recons) == 0 - def test_to_csv_interval_index(self): + def test_to_csv_interval_index(self, using_infer_string): # GH 28210 df = DataFrame({"A": list("abc"), "B": range(3)}, index=pd.interval_range(0, 3)) @@ -692,7 +692,10 @@ def test_to_csv_interval_index(self): # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) expected = df.copy() - expected.index = expected.index.astype(str) + if using_infer_string: + expected.index = expected.index.astype("string[pyarrow_numpy]") + else: + expected.index = expected.index.astype(str) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index 28973fe0d7900..f64cfd5fe6a2d 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -14,22 +14,6 @@ class TestToDictOfBlocks: - def test_copy_blocks(self, float_frame): - # GH#9607 - df = DataFrame(float_frame, copy=True) - column = df.columns[0] - - # use the default copy=True, change a column - 
_last_df = None - blocks = df._to_dict_of_blocks(copy=True) - for _df in blocks.values(): - _last_df = _df - if column in _df: - _df.loc[:, column] = _df[column] + 1 - - # make sure we did not change the original DataFrame - assert _last_df is not None and not _last_df[column].equals(df[column]) - @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_no_copy_blocks(self, float_frame, using_copy_on_write): # GH#9607 @@ -38,7 +22,7 @@ def test_no_copy_blocks(self, float_frame, using_copy_on_write): _last_df = None # use the copy=False, change a column - blocks = df._to_dict_of_blocks(copy=False) + blocks = df._to_dict_of_blocks() for _df in blocks.values(): _last_df = _df if column in _df: @@ -51,9 +35,7 @@ def test_no_copy_blocks(self, float_frame, using_copy_on_write): assert _last_df is not None and not _last_df[column].equals(df[column]) -def test_to_dict_of_blocks_item_cache(request, using_copy_on_write, warn_copy_on_write): - if using_copy_on_write: - request.applymarker(pytest.mark.xfail(reason="CoW - not yet implemented")) +def test_to_dict_of_blocks_item_cache(using_copy_on_write, warn_copy_on_write): # Calling to_dict_of_blocks should not poison item_cache df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) df["c"] = NumpyExtensionArray(np.array([1, 2, None, 3], dtype=object)) @@ -65,10 +47,8 @@ def test_to_dict_of_blocks_item_cache(request, using_copy_on_write, warn_copy_on df._to_dict_of_blocks() if using_copy_on_write: - # TODO(CoW) we should disallow this, so `df` doesn't get updated, - # this currently still updates df, so this test fails - ser.values[0] = "foo" - assert df.loc[0, "b"] == "a" + with pytest.raises(ValueError, match="read-only"): + ser.values[0] = "foo" elif warn_copy_on_write: ser.values[0] = "foo" assert df.loc[0, "b"] == "foo" diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index c79a37b5b30f0..7c7a0d23ff75f 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -166,11 +166,19 @@ def test_update_with_different_dtype(self, using_copy_on_write): with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): df["c"].update(Series(["foo"], index=[0])) - expected = DataFrame({"a": [1, 3], "b": [np.nan, 2], "c": ["foo", np.nan]}) + expected = DataFrame( + { + "a": [1, 3], + "b": [np.nan, 2], + "c": Series(["foo", np.nan], dtype="object"), + } + ) tm.assert_frame_equal(df, expected) @td.skip_array_manager_invalid_test - def test_update_modify_view(self, using_copy_on_write, warn_copy_on_write): + def test_update_modify_view( + self, using_copy_on_write, warn_copy_on_write, using_infer_string + ): # GH#47188 df = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]}) df2 = DataFrame({"A": ["a", "x"], "B": ["100", "200"]}) @@ -181,7 +189,7 @@ def test_update_modify_view(self, using_copy_on_write, warn_copy_on_write): df2.update(df) expected = DataFrame({"A": ["1", "x"], "B": ["100", "200"]}) tm.assert_frame_equal(df2, expected) - if using_copy_on_write: + if using_copy_on_write or using_infer_string: tm.assert_frame_equal(result_view, df2_orig) else: tm.assert_frame_equal(result_view, expected) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 2b392ddcfb44d..c7b444045a0f2 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype from pandas._config.config 
import option_context import pandas as pd @@ -112,6 +113,7 @@ def test_not_hashable(self): with pytest.raises(TypeError, match=msg): hash(empty_frame) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="surrogates not allowed") def test_column_name_contains_unicode_surrogate(self): # GH 25509 colname = "\ud83d" diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index a4825c80ee815..42ce658701355 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,6 +11,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -251,6 +253,9 @@ def test_timestamp_compare(self, left, right): with pytest.raises(TypeError, match=msg): right_f(pd.Timestamp("nat"), df) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't compare string and int" + ) def test_mixed_comparison(self): # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, # not raise TypeError @@ -432,8 +437,8 @@ def test_bool_flex_frame_complex_dtype(self): def test_bool_flex_frame_object_dtype(self): # corner, dtype=object - df1 = DataFrame({"col": ["foo", np.nan, "bar"]}) - df2 = DataFrame({"col": ["foo", datetime.now(), "bar"]}) + df1 = DataFrame({"col": ["foo", np.nan, "bar"]}, dtype=object) + df2 = DataFrame({"col": ["foo", datetime.now(), "bar"]}, dtype=object) result = df1.ne(df2) exp = DataFrame({"col": [False, True, False]}) tm.assert_frame_equal(result, exp) @@ -624,10 +629,12 @@ def test_arith_flex_frame_corner(self, float_frame): # corner cases result = float_frame.add(float_frame[:0]) - tm.assert_frame_equal(result, float_frame * np.nan) + expected = float_frame.sort_index() * np.nan + tm.assert_frame_equal(result, expected) result = float_frame[:0].add(float_frame) - tm.assert_frame_equal(result, float_frame * np.nan) + expected = float_frame.sort_index() * np.nan + tm.assert_frame_equal(result, expected) with pytest.raises(NotImplementedError, match="fill_value"): float_frame.add(float_frame.iloc[0], fill_value=3) @@ -1976,7 +1983,12 @@ def test_dataframe_blockwise_slicelike(): "df, col_dtype", [ (DataFrame([[1.0, 2.0], [4.0, 5.0]], columns=list("ab")), "float64"), - (DataFrame([[1.0, "b"], [4.0, "b"]], columns=list("ab")), "object"), + ( + DataFrame([[1.0, "b"], [4.0, "b"]], columns=list("ab")).astype( + {"b": object} + ), + "object", + ), ], ) def test_dataframe_operation_with_non_numeric_types(df, col_dtype): diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index b132f136e9741..712494ef15f97 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -183,7 +183,7 @@ def test_constructor_with_convert(self): ) tm.assert_series_equal(result, expected) - def test_construction_with_mixed(self, float_string_frame): + def test_construction_with_mixed(self, float_string_frame, using_infer_string): # test construction edge cases with mixed types # f7u12, this does not work without extensive workaround @@ -206,7 +206,7 @@ def test_construction_with_mixed(self, float_string_frame): expected = Series( [np.dtype("float64")] * 4 + [ - np.dtype("object"), + np.dtype("object") if not using_infer_string else "string", np.dtype("datetime64[us]"), np.dtype("timedelta64[us]"), ], diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index c6fe3a154905c..e1abd0344e356 100644 --- 
a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -21,6 +21,8 @@ import pytest import pytz +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -79,7 +81,7 @@ def test_constructor_from_ndarray_with_str_dtype(self): # with an array of strings each of which is e.g. "[0 1 2]" arr = np.arange(12).reshape(4, 3) df = DataFrame(arr, dtype=str) - expected = DataFrame(arr.astype(str)) + expected = DataFrame(arr.astype(str), dtype=object) tm.assert_frame_equal(df, expected) def test_constructor_from_2d_datetimearray(self, using_array_manager): @@ -261,8 +263,9 @@ def test_emptylike_constructor(self, emptylike, expected_index, expected_columns result = DataFrame(emptylike) tm.assert_frame_equal(result, expected) - def test_constructor_mixed(self, float_string_frame): - assert float_string_frame["foo"].dtype == np.object_ + def test_constructor_mixed(self, float_string_frame, using_infer_string): + dtype = "string" if using_infer_string else np.object_ + assert float_string_frame["foo"].dtype == dtype def test_constructor_cast_failure(self): # as of 2.0, we raise if we can't respect "dtype", previously we @@ -323,6 +326,7 @@ def test_constructor_dtype_nocast_view_2d_array( assert df2._mgr.arrays[0].flags.c_contiguous @td.skip_array_manager_invalid_test + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") def test_1d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") @@ -330,6 +334,7 @@ def test_1d_object_array_does_not_copy(self): assert np.shares_memory(df.values, arr) @td.skip_array_manager_invalid_test + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") def test_2d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") @@ -773,7 +778,7 @@ def test_constructor_dict_block(self): ) tm.assert_numpy_array_equal(df.values, expected) - def test_constructor_dict_cast(self): + def test_constructor_dict_cast(self, using_infer_string): # cast float tests test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} frame = DataFrame(test_data, dtype=float) @@ -783,7 +788,7 @@ def test_constructor_dict_cast(self): frame = DataFrame(test_data) assert len(frame) == 3 - assert frame["B"].dtype == np.object_ + assert frame["B"].dtype == (np.object_ if not using_infer_string else "string") assert frame["A"].dtype == np.float64 def test_constructor_dict_cast2(self): @@ -1195,7 +1200,7 @@ def test_constructor_dtype_nullable_extension_arrays( df = DataFrame({"a": data}, dtype=input_dtype) assert df["a"].dtype == expected_dtype() - def test_constructor_scalar_inference(self): + def test_constructor_scalar_inference(self, using_infer_string): data = {"int": 1, "bool": True, "float": 3.0, "complex": 4j, "object": "foo"} df = DataFrame(data, index=np.arange(10)) @@ -1203,7 +1208,7 @@ def test_constructor_scalar_inference(self): assert df["bool"].dtype == np.bool_ assert df["float"].dtype == np.float64 assert df["complex"].dtype == np.complex128 - assert df["object"].dtype == np.object_ + assert df["object"].dtype == (np.object_ if not using_infer_string else "string") def test_constructor_arrays_and_scalars(self): df = DataFrame({"a": np.random.default_rng(2).standard_normal(10), "b": True}) @@ -1282,11 +1287,11 @@ def empty_gen(): df =
DataFrame(empty_gen(), columns=["A", "B"]) tm.assert_frame_equal(df, expected) - def test_constructor_list_of_lists(self): + def test_constructor_list_of_lists(self, using_infer_string): # GH #484 df = DataFrame(data=[[1, "a"], [2, "b"]], columns=["num", "str"]) assert is_integer_dtype(df["num"]) - assert df["str"].dtype == np.object_ + assert df["str"].dtype == (np.object_ if not using_infer_string else "string") # GH 4851 # list of 0-dim ndarrays @@ -1835,7 +1840,7 @@ def test_constructor_single_value(self): with pytest.raises(TypeError, match=msg): DataFrame("a", [1, 2], ["a", "c"], float) - def test_constructor_with_datetimes(self): + def test_constructor_with_datetimes(self, using_infer_string): intname = np.dtype(int).name floatname = np.dtype(np.float64).name objectname = np.dtype(np.object_).name @@ -1854,7 +1859,7 @@ def test_constructor_with_datetimes(self): result = df.dtypes expected = Series( [np.dtype("int64")] - + [np.dtype(objectname)] * 2 + + [np.dtype(objectname) if not using_infer_string else "string"] * 2 + [np.dtype("M8[s]"), np.dtype("M8[us]")], index=list("ABCDE"), ) @@ -1876,7 +1881,7 @@ def test_constructor_with_datetimes(self): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object")] + + [np.dtype("object") if not using_infer_string else "string"] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -1898,7 +1903,7 @@ def test_constructor_with_datetimes(self): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object")] + + [np.dtype("object") if not using_infer_string else "string"] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -1935,13 +1940,13 @@ def test_constructor_with_datetimes3(self): df = DataFrame({"End Date": dt}, index=[0]) assert df.iat[0, 0] == dt tm.assert_series_equal( - df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"}) + df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"}, dtype=object) ) df = DataFrame([{"End Date": dt}]) assert df.iat[0, 0] == dt tm.assert_series_equal( - df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}) + df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}, dtype=object) ) def test_constructor_with_datetimes4(self): @@ -2066,7 +2071,7 @@ def test_constructor_timedelta_non_ns(self, order, unit): # dtype=exp_dtype.
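A note on the dtype assertions in the constructor tests above: a conditional expression binds more loosely than ==, so the expected dtype has to be parenthesized or the assert degenerates into a truthy constant that can never fail. A minimal standalone sketch, with a hypothetical flag standing in for the using_infer_string fixture:

import numpy as np
import pandas as pd

flag = True  # hypothetical stand-in for the using_infer_string fixture
ser = pd.Series(["a", "b"], dtype=object)  # deliberately plain object dtype

# Unparenthesized, this parses as
#   assert ((ser.dtype == np.object_) if not flag else "string")
# and the fallback "string" is truthy, so the assert passes vacuously:
assert ser.dtype == np.object_ if not flag else "string"

# Parenthesized, the comparison actually runs, and here it correctly fails
# because an object-dtype Series is not the inferred string dtype:
try:
    assert ser.dtype == (np.object_ if not flag else "string")
except AssertionError:
    print("object dtype != string dtype, as expected")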
tm.assert_frame_equal(df, expected) - def test_constructor_for_list_with_dtypes(self): + def test_constructor_for_list_with_dtypes(self, using_infer_string): # test list of lists/ndarrays df = DataFrame([np.arange(5) for x in range(5)]) result = df.dtypes @@ -2117,7 +2122,7 @@ def test_constructor_for_list_with_dtypes(self): [ np.dtype("int64"), np.dtype("float64"), - np.dtype("object"), + np.dtype("object") if not using_infer_string else "string", np.dtype("datetime64[ns]"), np.dtype("float64"), ], diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py index a15d7d7f93f01..16ca3a202f1e0 100644 --- a/pandas/tests/frame/test_logical_ops.py +++ b/pandas/tests/frame/test_logical_ops.py @@ -96,7 +96,7 @@ def test_logical_ops_int_frame(self): res_ser = df1a_int["A"] | df1a_bool["A"] tm.assert_series_equal(res_ser, df1a_bool["A"]) - def test_logical_ops_invalid(self): + def test_logical_ops_invalid(self, using_infer_string): # GH#5808 df1 = DataFrame(1.0, index=[1], columns=["A"]) @@ -108,8 +108,14 @@ def test_logical_ops_invalid(self): df1 = DataFrame("foo", index=[1], columns=["A"]) df2 = DataFrame(True, index=[1], columns=["A"]) msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'") - with pytest.raises(TypeError, match=msg): - df1 | df2 + if using_infer_string: + import pyarrow as pa + + with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): + df1 | df2 + else: + with pytest.raises(TypeError, match=msg): + df1 | df2 def test_logical_operators(self): def _check_bin_op(op): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 6353546648156..a498296e09c52 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1035,7 +1035,7 @@ def test_query_with_string_columns(self, parser, engine): with pytest.raises(NotImplementedError, match=msg): df.query("a in b and c < d", parser=parser, engine=engine) - def test_object_array_eq_ne(self, parser, engine): + def test_object_array_eq_ne(self, parser, engine, using_infer_string): df = DataFrame( { "a": list("aaaabbbbcccc"), "b": list("aabbccddeeff"), "c": np.random.default_rng(2).integers(5, size=12), "d": np.random.default_rng(2).integers(9, size=12), } ) - res = df.query("a == b", parser=parser, engine=engine) + warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None + with tm.assert_produces_warning(warning): + res = df.query("a == b", parser=parser, engine=engine) exp = df[df.a == df.b] tm.assert_frame_equal(res, exp) - res = df.query("a != b", parser=parser, engine=engine) + with tm.assert_produces_warning(warning): + res = df.query("a != b", parser=parser, engine=engine) exp = df[df.a != df.b] tm.assert_frame_equal(res, exp) @@ -1087,12 +1090,16 @@ def test_query_with_nested_special_character(self, parser, engine): [">=", operator.ge], ], ) - def test_query_lex_compare_strings(self, parser, engine, op, func): + def test_query_lex_compare_strings( + self, parser, engine, op, func, using_infer_string + ): a = Series(np.random.default_rng(2).choice(list("abcde"), 20)) b = Series(np.arange(a.size)) df = DataFrame({"X": a, "Y": b}) - res = df.query(f'X {op} "d"', engine=engine, parser=parser) + warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None + with tm.assert_produces_warning(warning): + res = df.query(f'X {op} "d"', engine=engine, parser=parser) expected = df[func(df.X, "d")] tm.assert_frame_equal(res, expected) @@ -1166,7 +1173,7 @@ def
test_bool_arith_expr(self, frame, parser, engine): @pytest.mark.parametrize("op", ["+", "-", "*", "/"]) def test_invalid_type_for_operator_raises(self, parser, engine, op): df = DataFrame({"a": [1, 2], "b": ["c", "d"]}) - msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'" + msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'|Cannot" with pytest.raises(TypeError, match=msg): df.eval(f"a {op} b", engine=engine, parser=parser) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 3b1a751a738f9..66145c32c18d7 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.compat import ( IS64, is_platform_windows, @@ -243,11 +245,17 @@ class TestDataFrameAnalytics: pytest.param("kurt", marks=td.skip_if_no("scipy")), ], ) - def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): - if (opname in ("sum", "min", "max") and axis == 0) or opname in ( - "count", - "nunique", - ): + def test_stat_op_api_float_string_frame( + self, float_string_frame, axis, opname, using_infer_string + ): + if ( + (opname in ("sum", "min", "max") and axis == 0) + or opname + in ( + "count", + "nunique", + ) + ) and not (using_infer_string and opname == "sum"): getattr(float_string_frame, opname)(axis=axis) else: if opname in ["var", "std", "sem", "skew", "kurt"]: @@ -273,7 +281,11 @@ def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): elif opname in ["min", "max"]: msg = "'[><]=' not supported between instances of 'float' and 'str'" elif opname == "median": - msg = re.compile(r"Cannot convert \[.*\] to numeric", flags=re.S) + msg = re.compile( + r"Cannot convert \[.*\] to numeric|does not support", flags=re.S + ) + if not isinstance(msg, re.Pattern): + msg = msg + "|does not support" with pytest.raises(TypeError, match=msg): getattr(float_string_frame, opname)(axis=axis) if opname != "nunique": @@ -434,6 +446,7 @@ def test_mixed_ops(self, op): "Could not convert", "could not convert", "can't multiply sequence by non-int", + "does not support", ] ) with pytest.raises(TypeError, match=msg): @@ -445,11 +458,15 @@ def test_mixed_ops(self, op): "Could not convert", "could not convert", "can't multiply sequence by non-int", + "does not support", ] ) with pytest.raises(TypeError, match=msg): getattr(df, op)() + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="sum doesn't work for arrow strings" + ) def test_reduce_mixed_frame(self): # GH 6806 df = DataFrame( @@ -516,7 +533,9 @@ def test_mean_mixed_string_decimal(self): df = DataFrame(d) - with pytest.raises(TypeError, match="unsupported operand type"): + with pytest.raises( + TypeError, match="unsupported operand type|does not support" + ): df.mean() result = df[["A", "C"]].mean() expected = Series([2.7, 681.6], index=["A", "C"], dtype=object) @@ -652,7 +671,7 @@ def test_mode_dropna(self, dropna, expected): "A": [12, 12, 19, 11], "B": [10, 10, np.nan, 3], "C": [1, np.nan, np.nan, np.nan], - "D": [np.nan, np.nan, "a", np.nan], + "D": Series([np.nan, np.nan, "a", np.nan], dtype=object), "E": Categorical([np.nan, np.nan, "a", np.nan]), "F": DatetimeIndex(["NaT", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"), "G": to_timedelta(["1 days", "nan", "nan", "nan"]), @@ -672,14 +691,15 @@ def test_mode_dropna(self, dropna, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) - def 
test_mode_sortwarning(self): + def test_mode_sortwarning(self, using_infer_string): # Check for the warning that is raised when the mode # results cannot be sorted df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) expected = DataFrame({"A": ["a", np.nan]}) - with tm.assert_produces_warning(UserWarning): + warning = None if using_infer_string else UserWarning + with tm.assert_produces_warning(warning): result = df.mode(dropna=False) result = result.sort_values(by="A").reset_index(drop=True) @@ -969,7 +989,8 @@ def test_sum_mixed_datetime(self): def test_mean_corner(self, float_frame, float_string_frame): # unit test when have object data - with pytest.raises(TypeError, match="Could not convert"): + msg = "Could not convert|does not support" + with pytest.raises(TypeError, match=msg): float_string_frame.mean(axis=0) # xs sum mixed type, just want to know it works... @@ -1341,7 +1362,9 @@ def test_any_all_extra(self): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) @pytest.mark.parametrize("skipna", [True, False]) - def test_any_all_object_dtype(self, axis, bool_agg_func, skipna): + def test_any_all_object_dtype( + self, axis, bool_agg_func, skipna, using_infer_string ): # GH#35450 df = DataFrame( data=[ [np.nan, np.nan, "5", np.nan], ] ) + if using_infer_string: + # NaN is truthy in object dtype but falsey under string[pyarrow_numpy] + val = axis == 1 and not skipna and bool_agg_func == "all" + else: + val = True result = getattr(df, bool_agg_func)(axis=axis, skipna=skipna) - expected = Series([True, True, True, True]) + expected = Series([True, True, val, True]) tm.assert_series_equal(result, expected) # GH#50947 deprecates this but it is not emitting a warning in some builds.
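A note on the widened match= patterns in the hunks above: pytest.raises applies match as a regex via re.search on the rendered exception message, so a single alternation accepts both the legacy object-dtype wording and the pyarrow-backed wording. A standalone sketch of the mechanism (the error text here is made up):

import re

import pytest

def fail(msg: str) -> None:
    raise TypeError(msg)

# Either backend's phrasing satisfies the same alternation, which is what
# lets one test body cover both dtype paths:
with pytest.raises(TypeError, match="Could not convert|does not support"):
    fail("Could not convert 'foo' to numeric")
with pytest.raises(TypeError, match="Could not convert|does not support"):
    fail("string addition does not support these dtypes")

# One pitfall: an empty alternative matches everywhere, so a pattern such
# as "|has no kernel" would accept any message at all:
assert re.search("|has no kernel", "totally unrelated text")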
@@ -1378,7 +1406,8 @@ def test_any_datetime(self): def test_any_all_bool_only(self): # GH 25101 df = DataFrame( - {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]} + {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]}, + columns=Index(["col1", "col2", "col3"], dtype=object), ) result = df.all(bool_only=True) @@ -1931,6 +1960,9 @@ def test_sum_timedelta64_skipna_false(using_array_manager, request): tm.assert_series_equal(result, expected) +@pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="sum doesn't work with arrow strings" +) def test_mixed_frame_with_integer_sum(): # https://github.com/pandas-dev/pandas/issues/34520 df = DataFrame([["a", 1]], columns=list("ab")) @@ -1951,7 +1983,7 @@ def test_minmax_extensionarray(method, numeric_only): expected = Series( [getattr(int64_info, method)], dtype="Int64", - index=Index(["Int64"], dtype="object"), + index=Index(["Int64"]), ) tm.assert_series_equal(result, expected) @@ -1969,7 +2001,7 @@ def test_prod_sum_min_count_mixed_object(): df = DataFrame([1, "a", True]) result = df.prod(axis=0, min_count=1, numeric_only=False) - expected = Series(["a"]) + expected = Series(["a"], dtype=object) tm.assert_series_equal(result, expected) msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'") diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py index 6184e791cab5d..776007fb9691d 100644 --- a/pandas/tests/frame/test_repr.py +++ b/pandas/tests/frame/test_repr.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas import ( NA, Categorical, @@ -174,6 +176,7 @@ def test_repr_mixed_big(self): repr(biggie) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="/r in") def test_repr(self): # columns but no index no_index = DataFrame(columns=[0, 1, 3]) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 2e7e8eba270c0..554a9d4ce2d5d 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -619,7 +619,7 @@ def test_unstack_to_series(self, float_frame): data = data.unstack() tm.assert_frame_equal(old_data, data) - def test_unstack_dtypes(self): + def test_unstack_dtypes(self, using_infer_string): # GH 2929 rows = [[1, 1, 3, 4], [1, 2, 3, 4], [2, 1, 3, 4], [2, 2, 3, 4]] @@ -655,8 +655,9 @@ def test_unstack_dtypes(self): df2["D"] = "foo" df3 = df2.unstack("B") result = df3.dtypes + dtype = "string" if using_infer_string else np.dtype("object") expected = Series( - [np.dtype("float64")] * 2 + [np.dtype("object")] * 2, + [np.dtype("float64")] * 2 + [dtype] * 2, index=MultiIndex.from_arrays( [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B") ), @@ -1359,14 +1360,16 @@ def test_unstack_fill_frame_object(): # By default missing values will be NaN result = data.unstack() expected = DataFrame( - {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]}, index=list("xyz") + {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]}, + index=list("xyz"), + dtype=object, ) tm.assert_frame_equal(result, expected) # Fill with any value replaces missing values as expected result = data.unstack(fill_value="d") expected = DataFrame( - {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz") + {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz"), dtype=object ) tm.assert_frame_equal(result, expected) @@ -2083,7 +2086,7 @@ def test_stack_multiple_bug(self, future_stack): multi = df.set_index(["DATE", "ID"]) multi.columns.name = 
"Params" unst = multi.unstack("ID") - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): unst.resample("W-THU").mean() down = unst.resample("W-THU").mean(numeric_only=True) @@ -2298,7 +2301,7 @@ def test_stack_unstack_unordered_multiindex(self, future_stack): tm.assert_frame_equal(result, expected) def test_unstack_preserve_types( - self, multiindex_year_month_day_dataframe_random_data + self, multiindex_year_month_day_dataframe_random_data, using_infer_string ): # GH#403 ymd = multiindex_year_month_day_dataframe_random_data @@ -2307,7 +2310,11 @@ def test_unstack_preserve_types( unstacked = ymd.unstack("month") assert unstacked["A", 1].dtype == np.float64 - assert unstacked["E", 1].dtype == np.object_ + assert ( + unstacked["E", 1].dtype == np.object_ + if not using_infer_string + else "string" + ) assert unstacked["F", 1].dtype == np.float64 def test_unstack_group_index_overflow(self, future_stack): @@ -2367,7 +2374,7 @@ def test_unstack_with_missing_int_cast_to_float(self, using_array_manager): expected = DataFrame( [[10.0, 10.0, 1.0, 1.0], [np.nan, 10.0, 0.0, 1.0]], - index=Index(["A", "B"], dtype="object", name="a"), + index=Index(["A", "B"], name="a"), columns=MultiIndex.from_tuples( [("v", "ca"), ("v", "cb"), ("is_", "ca"), ("is_", "cb")], names=[None, "b"], diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index f19e31002c877..ef78ae62cb4d6 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -524,8 +524,6 @@ def test_subclassed_wide_to_long(self): tm.assert_frame_equal(long_frame, expected) - # TODO(CoW-warn) should not need to warn - @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_subclassed_apply(self): # GH 19822 diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index 5e29d3c868983..850c92013694f 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -48,15 +48,25 @@ def test_neg_object(self, df, expected): pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])}), ], ) - def test_neg_raises(self, df): + def test_neg_raises(self, df, using_infer_string): msg = ( "bad operand type for unary -: 'str'|" r"bad operand type for unary -: 'DatetimeArray'" ) - with pytest.raises(TypeError, match=msg): - (-df) - with pytest.raises(TypeError, match=msg): - (-df["a"]) + if using_infer_string and df.dtypes.iloc[0] == "string": + import pyarrow as pa + + msg = "has no kernel" + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + (-df) + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + (-df["a"]) + + else: + with pytest.raises(TypeError, match=msg): + (-df) + with pytest.raises(TypeError, match=msg): + (-df["a"]) def test_invert(self, float_frame): df = float_frame diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 60b386adb664a..34b6e7c4cde5f 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -37,7 +37,7 @@ def store(group): tm.assert_frame_equal(groups[0], expected_value) -def test_apply_index_date(): +def test_apply_index_date(using_infer_string): # GH 5788 ts = [ "2011-05-16 00:00", @@ -77,7 +77,7 @@ def test_apply_index_date(): tm.assert_frame_equal(result, expected) -def test_apply_index_date_object(): +def test_apply_index_date_object(using_infer_string): # GH 5789 # don't 
auto coerce dates ts = [ @@ -109,8 +109,9 @@ def test_apply_index_date_object(): 1.40750, 1.40649, ] + dtype = "string[pyarrow_numpy]" if using_infer_string else object exp_idx = Index( - ["2011-05-16", "2011-05-17", "2011-05-18"], dtype=object, name="date" + ["2011-05-16", "2011-05-17", "2011-05-18"], dtype=dtype, name="date" ) expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" @@ -121,14 +122,15 @@ def test_apply_index_date_object(): tm.assert_series_equal(result, expected) -def test_apply_trivial(): +def test_apply_trivial(using_infer_string): # GH 20066 # trivial apply: ignore input and return a constant dataframe. df = DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", "object"]) + dtype = "string" if using_infer_string else "object" + expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -138,13 +140,14 @@ def test_apply_trivial(): tm.assert_frame_equal(result, expected) -def test_apply_trivial_fail(): +def test_apply_trivial_fail(using_infer_string): # GH 20066 df = DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - expected = pd.concat([df, df], axis=1, keys=["float64", "object"]) + dtype = "string" if using_infer_string else "object" + expected = pd.concat([df, df], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): gb = df.groupby([str(x) for x in df.dtypes], axis=1, group_keys=True) @@ -941,7 +944,7 @@ def test_func_returns_object(): "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], ) -def test_apply_datetime_issue(group_column_dtlike): +def test_apply_datetime_issue(group_column_dtlike, using_infer_string): # GH-28247 # groupby-apply throws an error if one of the columns in the DataFrame # is a datetime object and the column labels are different from @@ -952,9 +955,8 @@ def test_apply_datetime_issue(group_column_dtlike): with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) - expected = DataFrame( - ["spam"], Index(["foo"], dtype="object", name="a"), columns=[42] - ) + dtype = "string" if using_infer_string else "object" + expected = DataFrame(["spam"], Index(["foo"], dtype=dtype, name="a"), columns=[42]) tm.assert_frame_equal(result, expected) @@ -1021,7 +1023,7 @@ def test_apply_multi_level_name(category): assert df.index.names == ["A", "B"] -def test_groupby_apply_datetime_result_dtypes(): +def test_groupby_apply_datetime_result_dtypes(using_infer_string): # GH 14849 data = DataFrame.from_records( [ @@ -1035,8 +1037,9 @@ def test_groupby_apply_datetime_result_dtypes(): msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(FutureWarning, match=msg): result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes + dtype = "string" if using_infer_string else object expected = Series( - [np.dtype("datetime64[ns]"), object, object, np.int64, object], + [np.dtype("datetime64[ns]"), dtype, dtype, np.int64, dtype], index=["observation", "color", "mood", "intensity", "score"], ) tm.assert_series_equal(result, expected) diff --git 
a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index d4ccbe4c1c376..7a91601bf688f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -82,7 +82,7 @@ def get_stats(group): assert result.index.names[0] == "C" -def test_basic(): # TODO: split this test +def test_basic(using_infer_string): # TODO: split this test cats = Categorical( ["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], @@ -129,7 +129,8 @@ def f(x): result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") - expected["person_name"] = expected["person_name"].astype("object") + dtype = "string[pyarrow_numpy]" if using_infer_string else object + expected["person_name"] = expected["person_name"].astype(dtype) tm.assert_frame_equal(result, expected) # GH 9921 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 62347ec1d3d6a..802cae9ff65f0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -11,6 +11,8 @@ ) import pandas.util._test_decorators as td +from pandas.core.dtypes.common import is_string_dtype + import pandas as pd from pandas import ( Categorical, @@ -687,7 +689,7 @@ def test_frame_multi_key_function_list_partial_failure(): grouped = data.groupby(["A", "B"]) funcs = ["mean", "std"] - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -980,7 +982,7 @@ def test_groupby_multi_corner(df): def test_raises_on_nuisance(df): grouped = df.groupby("A") - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -1036,7 +1038,7 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): msg = "could not convert string to float: 'one'" else: klass = TypeError - msg = re.escape(f"agg function failed [how->{agg_function},dtype->object]") + msg = re.escape(f"agg function failed [how->{agg_function},dtype->") with pytest.raises(klass, match=msg): getattr(grouped, agg_function)(numeric_only=numeric_only) else: @@ -1061,7 +1063,7 @@ def test_raise_on_nuisance_python_single(df): def test_raise_on_nuisance_python_multiple(three_group): grouped = three_group.groupby(["A", "B"]) - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -1104,7 +1106,7 @@ def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data): df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): df.groupby(keys).agg("mean") agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean") @@ -1193,7 +1195,7 @@ def test_groupby_complex(): tm.assert_series_equal(result, expected) -def test_groupby_complex_numbers(): +def test_groupby_complex_numbers(using_infer_string): # GH 17927 df = DataFrame( [ @@ -1202,10 +1204,11 @@ def test_groupby_complex_numbers(): {"a": 4, "b": 1}, ] ) + dtype = "string[pyarrow_numpy]" if using_infer_string else 
object expected = DataFrame( np.array([1, 1, 1], dtype=np.int64), index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], name="b"), - columns=Index(["a"], dtype="object"), + columns=Index(["a"], dtype=dtype), ) result = df.groupby("b", sort=False).count() tm.assert_frame_equal(result, expected) @@ -1720,14 +1723,18 @@ def g(group): @pytest.mark.parametrize("grouper", ["A", ["A", "B"]]) -def test_set_group_name(df, grouper): +def test_set_group_name(df, grouper, using_infer_string): def f(group): assert group.name is not None return group def freduce(group): assert group.name is not None - return group.sum() + if using_infer_string and grouper == "A" and is_string_dtype(group.dtype): + with pytest.raises(TypeError, match="does not support"): + group.sum() + else: + return group.sum() def freducex(x): return freduce(x) @@ -2024,7 +2031,9 @@ def test_pivot_table_values_key_error(): @pytest.mark.parametrize( "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"] ) -def test_empty_groupby(columns, keys, values, method, op, using_array_manager, dropna): +def test_empty_groupby( + columns, keys, values, method, op, using_array_manager, dropna, using_infer_string +): # GH8093 & GH26411 override_dtype = None @@ -2065,7 +2074,11 @@ def get_categorical_invalid_expected(): # Categorical is special without 'observed=True' idx = Index(lev, name=keys[0]) - expected = DataFrame([], columns=[], index=idx) + if using_infer_string: + columns = Index([], dtype="string[pyarrow_numpy]") + else: + columns = [] + expected = DataFrame([], columns=columns, index=idx) return expected is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index b40c8f45b6d19..4f54621b19b64 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -546,9 +546,9 @@ def test_categorical_reducers(reduction_func, observed, sort, as_index, index_ki gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() - expected["x"] = expected["x"].replace(4, None) + expected["x"] = expected["x"].cat.remove_categories([4]) if index_kind == "multi": - expected["x2"] = expected["x2"].replace(4, None) + expected["x2"] = expected["x2"].cat.remove_categories([4]) if as_index: if index_kind == "multi": expected = expected.set_index(["x", "x2"]) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 0141adf44c86b..ff4685b1e412d 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -180,6 +180,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): [ "category type does not support sum operations", re.escape(f"agg function failed [how->{method},dtype->object]"), + re.escape(f"agg function failed [how->{method},dtype->string]"), ] ) with pytest.raises(exception, match=msg): @@ -196,6 +197,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "function is not implemented for this dtype", f"Cannot perform {method} with non-ordered Categorical", re.escape(f"agg function failed [how->{method},dtype->object]"), + re.escape(f"agg function failed [how->{method},dtype->string]"), ] ) with pytest.raises(exception, match=msg): @@ -206,7 +208,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): @pytest.mark.parametrize("numeric_only", [True, False, 
None]) -def test_axis1_numeric_only(request, groupby_func, numeric_only): +def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_string): if groupby_func in ("idxmax", "idxmin"): pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1") if groupby_func in ("corrwith", "skew"): @@ -268,8 +270,15 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only): "can't multiply sequence by non-int of type 'float'", # cumsum, diff, pct_change "unsupported operand type", + "has no kernel", ) - with pytest.raises(TypeError, match=f"({'|'.join(msgs)})"): + if using_infer_string: + import pyarrow as pa + + errs = (TypeError, pa.lib.ArrowNotImplementedError) + else: + errs = TypeError + with pytest.raises(errs, match=f"({'|'.join(msgs)})"): with tm.assert_produces_warning(FutureWarning, match=warn_msg): method(*args, **kwargs) else: diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 2800f08b5fd90..0b451ce73db89 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -189,7 +189,7 @@ def test_groupby_raises_string( "sum": (None, ""), "var": ( TypeError, - re.escape("agg function failed [how->var,dtype->object]"), + re.escape("agg function failed [how->var,dtype->"), ), }[groupby_func] diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 3e78e728f5ea9..425079f943aba 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -1057,3 +1057,27 @@ def test_regression_allowlist_methods(op, axis, skipna, sort): if sort: expected = expected.sort_index(axis=axis) tm.assert_frame_equal(result, expected) + + +def test_groupby_prod_with_int64_dtype(): + # GH#46573 + data = [ + [1, 11], + [1, 41], + [1, 17], + [1, 37], + [1, 7], + [1, 29], + [1, 31], + [1, 2], + [1, 3], + [1, 43], + [1, 5], + [1, 47], + [1, 19], + [1, 88], + ] + df = DataFrame(data, columns=["A", "B"], dtype="int64") + result = df.groupby(["A"]).prod().reset_index() + expected = DataFrame({"A": [1], "B": [180970905912331920]}, dtype="int64") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index 379aea8826414..f30b578cfcf56 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype import pandas._config.config as cf from pandas import Index @@ -15,6 +16,7 @@ def test_repr_is_valid_construction_code(self): res = eval(repr(idx)) tm.assert_index_equal(res, idx) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ @@ -79,6 +81,7 @@ def test_string_index_repr(self, index, expected): result = repr(index) assert result == expected + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 6586f5f9de480..814a6a516904b 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -33,13 +33,15 @@ def test_insert(self): # test empty null_index = Index([]) - tm.assert_index_equal(Index(["a"]), null_index.insert(0, "a")) + tm.assert_index_equal(Index(["a"], dtype=object), null_index.insert(0, 
"a")) - def test_insert_missing(self, nulls_fixture): + def test_insert_missing(self, nulls_fixture, using_infer_string): # GH#22295 # test there is no mangling of NA values - expected = Index(["a", nulls_fixture, "b", "c"]) - result = Index(list("abc")).insert(1, nulls_fixture) + expected = Index(["a", nulls_fixture, "b", "c"], dtype=object) + result = Index(list("abc"), dtype=object).insert( + 1, Index([nulls_fixture], dtype=object) + ) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index e538ad512d691..3ef3f3ad4d3a2 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -154,7 +154,7 @@ def test_intersection_str_dates(self, sort): def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, sort): # non-monotonic non-unique index1 = Index(["A", "B", "A", "C"]) - expected = Index(expected_arr, dtype="object") + expected = Index(expected_arr) result = index1.intersection(index2, sort=sort) if sort is None: expected = expected.sort_values() diff --git a/pandas/tests/indexes/categorical/test_astype.py b/pandas/tests/indexes/categorical/test_astype.py index da1d692f9eb2d..a17627b7515b2 100644 --- a/pandas/tests/indexes/categorical/test_astype.py +++ b/pandas/tests/indexes/categorical/test_astype.py @@ -18,7 +18,7 @@ def test_astype(self): ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) result = ci.astype(object) - tm.assert_index_equal(result, Index(np.array(ci))) + tm.assert_index_equal(result, Index(np.array(ci), dtype=object)) # this IS equal, but not the same class assert result.equals(ci) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 142a00d32815a..03a298a13dc2b 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import index as libindex from pandas._libs.arrays import NDArrayBacked @@ -47,7 +49,7 @@ def test_insert(self, simple_index): # invalid -> cast to object expected = ci.astype(object).insert(0, "d") - result = ci.insert(0, "d") + result = ci.insert(0, "d").astype(object) tm.assert_index_equal(result, expected, exact=True) # GH 18295 (test missing) @@ -194,6 +196,7 @@ def test_unique(self, data, categories, expected_data, ordered): expected = CategoricalIndex(expected_data, dtype=dtype) tm.assert_index_equal(idx.unique(), expected) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr doesn't roundtrip") def test_repr_roundtrip(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) str(ci) diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py index ea3e4ce213e67..522ca1bc2afde 100644 --- a/pandas/tests/indexes/categorical/test_formats.py +++ b/pandas/tests/indexes/categorical/test_formats.py @@ -1,6 +1,9 @@ """ Tests for CategoricalIndex.__repr__ and related methods. 
""" +import pytest + +from pandas._config import using_pyarrow_string_dtype import pandas._config.config as cf from pandas import CategoricalIndex @@ -16,6 +19,7 @@ def test_format_different_scalar_lengths(self): with tm.assert_produces_warning(FutureWarning, match=msg): assert idx.format() == expected + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") def test_string_categorical_index_repr(self): # short idx = CategoricalIndex(["a", "bb", "ccc"]) diff --git a/pandas/tests/indexes/categorical/test_reindex.py b/pandas/tests/indexes/categorical/test_reindex.py index 8ca5c6099b4e7..5b1f2b9fb159a 100644 --- a/pandas/tests/indexes/categorical/test_reindex.py +++ b/pandas/tests/indexes/categorical/test_reindex.py @@ -40,7 +40,7 @@ def test_reindex_duplicate_target(self): # See GH25459 cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) res, indexer = cat.reindex(["a", "c", "c"]) - exp = Index(["a", "c", "c"], dtype="object") + exp = Index(["a", "c", "c"]) tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) diff --git a/pandas/tests/indexes/datetimes/methods/test_map.py b/pandas/tests/indexes/datetimes/methods/test_map.py index c31e2407190ea..f35f07bd32068 100644 --- a/pandas/tests/indexes/datetimes/methods/test_map.py +++ b/pandas/tests/indexes/datetimes/methods/test_map.py @@ -16,7 +16,7 @@ def test_map(self): f = lambda x: x.strftime("%Y%m%d") result = rng.map(f) - exp = Index([f(x) for x in rng], dtype=" t.values - result = s.str.cat(t.values, sep=sep) - tm.assert_equal(result, expected) - - # Series/Index with Series having matching Index - t = Series(t.values, index=s) - result = s.str.cat(t, sep=sep) - tm.assert_equal(result, expected) - - # Series/Index with Series.values - result = s.str.cat(t.values, sep=sep) - tm.assert_equal(result, expected) + expected = Index(["ab", "aa", "bb", "ac"]) + expected = ( + expected + if box == Index + else Series(expected, index=Index(s, dtype=dtype_caller)) + ) - # Series/Index with Series having different Index - t = Series(t.values, index=t.values) - expected = Index(["aa", "aa", "bb", "bb", "aa"]) - expected = expected if box == Index else Series(expected, index=expected.str[:1]) + # Series/Index with unaligned Index -> t.values + result = s.str.cat(t.values, sep=sep) + tm.assert_equal(result, expected) + + # Series/Index with Series having matching Index + t = Series(t.values, index=Index(s, dtype=dtype_caller)) + result = s.str.cat(t, sep=sep) + tm.assert_equal(result, expected) + + # Series/Index with Series.values + result = s.str.cat(t.values, sep=sep) + tm.assert_equal(result, expected) + + # Series/Index with Series having different Index + t = Series(t.values, index=t.values) + expected = Index(["aa", "aa", "bb", "bb", "aa"]) + dtype = object if dtype_caller == "object" else s.dtype.categories.dtype + expected = ( + expected + if box == Index + else Series(expected, index=Index(expected.str[:1], dtype=dtype)) + ) - result = s.str.cat(t, sep=sep) - tm.assert_equal(result, expected) + result = s.str.cat(t, sep=sep) + tm.assert_equal(result, expected) @pytest.mark.parametrize( @@ -321,8 +343,9 @@ def test_str_cat_all_na(index_or_series, index_or_series2): # all-NA target if box == Series: - expected = Series([np.nan] * 4, index=s.index, dtype=object) + expected = Series([np.nan] * 4, index=s.index, dtype=s.dtype) else: # box == Index + # TODO: Strimg option, this should return string dtype expected = Index([np.nan] * 4, dtype=object) 
     result = s.str.cat(t, join="left")
     tm.assert_equal(result, expected)
diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py
index 9ad9b1eca41d9..77d008c650264 100644
--- a/pandas/tests/strings/test_extract.py
+++ b/pandas/tests/strings/test_extract.py
@@ -47,13 +47,16 @@ def test_extract_expand_False_mixed_object():
     # two groups
     result = ser.str.extract(".*(BAD[_]+).*(BAD)", expand=False)
     er = [np.nan, np.nan]  # empty row
-    expected = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er])
+    expected = DataFrame(
+        [["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er], dtype=object
+    )
     tm.assert_frame_equal(result, expected)

     # single group
     result = ser.str.extract(".*(BAD[_]+).*BAD", expand=False)
     expected = Series(
-        ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, None, np.nan, np.nan]
+        ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, None, np.nan, np.nan],
+        dtype=object,
     )
     tm.assert_series_equal(result, expected)

@@ -238,7 +241,9 @@ def test_extract_expand_True_mixed_object():
     )

     result = mixed.str.extract(".*(BAD[_]+).*(BAD)", expand=True)
-    expected = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er])
+    expected = DataFrame(
+        [["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er], dtype=object
+    )
     tm.assert_frame_equal(result, expected)

@@ -603,8 +608,8 @@ def test_extractall_stringindex(any_string_dtype):
     # index.name doesn't affect to the result
     if any_string_dtype == "object":
         for idx in [
-            Index(["a1a2", "b1", "c1"]),
-            Index(["a1a2", "b1", "c1"], name="xxx"),
+            Index(["a1a2", "b1", "c1"], dtype=object),
+            Index(["a1a2", "b1", "c1"], name="xxx", dtype=object),
         ]:
             result = idx.str.extractall(r"[ab](?P<digit>\d)")
             tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
index bd64a5dce3b9a..3f58c6d703f8f 100644
--- a/pandas/tests/strings/test_find_replace.py
+++ b/pandas/tests/strings/test_find_replace.py
@@ -242,7 +242,7 @@ def test_contains_nan(any_string_dtype):


 @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")])
-@pytest.mark.parametrize("dtype", [None, "category"])
+@pytest.mark.parametrize("dtype", ["object", "category"])
 @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
 @pytest.mark.parametrize("na", [True, False])
 def test_startswith(pat, dtype, null_value, na):
@@ -254,10 +254,10 @@ def test_startswith(pat, dtype, null_value, na):
     result = values.str.startswith(pat)
     exp = Series([False, np.nan, True, False, False, np.nan, True])
-    if dtype is None and null_value is pd.NA:
+    if dtype == "object" and null_value is pd.NA:
         # GH#18463
         exp = exp.fillna(null_value)
-    elif dtype is None and null_value is None:
+    elif dtype == "object" and null_value is None:
         exp[exp.isna()] = None
     tm.assert_series_equal(result, exp)

@@ -300,7 +300,7 @@ def test_startswith_nullable_string_dtype(nullable_string_dtype, na):


 @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")])
-@pytest.mark.parametrize("dtype", [None, "category"])
+@pytest.mark.parametrize("dtype", ["object", "category"])
 @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
 @pytest.mark.parametrize("na", [True, False])
 def test_endswith(pat, dtype, null_value, na):
@@ -312,10 +312,10 @@ def test_endswith(pat, dtype, null_value, na):
     result = values.str.endswith(pat)
     exp = Series([False, np.nan, False, False, True, np.nan, True])
-    if dtype is None and null_value is pd.NA:
+    if dtype == "object" and null_value is pd.NA:
         # GH#18463
-        exp =
exp.fillna(pd.NA) - elif dtype is None and null_value is None: + exp = exp.fillna(null_value) + elif dtype == "object" and null_value is None: exp[exp.isna()] = None tm.assert_series_equal(result, exp) @@ -382,7 +382,9 @@ def test_replace_mixed_object(): ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) result = Series(ser).str.replace("BAD[_]*", "", regex=True) - expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan]) + expected = Series( + ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) @@ -469,7 +471,9 @@ def test_replace_compiled_regex_mixed_object(): ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) result = Series(ser).str.replace(pat, "", regex=True) - expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan]) + expected = Series( + ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) @@ -913,7 +917,7 @@ def test_translate_mixed_object(): # Series with non-string values s = Series(["a", "b", "c", 1.2]) table = str.maketrans("abc", "cde") - expected = Series(["c", "d", "e", np.nan]) + expected = Series(["c", "d", "e", np.nan], dtype=object) result = s.str.translate(table) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 0a7d409773dd6..9ff1fc0e13ae9 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -681,14 +681,16 @@ def test_partition_sep_kwarg(any_string_dtype, method): def test_get(): ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) result = ser.str.split("_").str.get(1) - expected = Series(["b", "d", np.nan, "g"]) + expected = Series(["b", "d", np.nan, "g"], dtype=object) tm.assert_series_equal(result, expected) def test_get_mixed_object(): ser = Series(["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0]) result = ser.str.split("_").str.get(1) - expected = Series(["b", np.nan, "d", np.nan, np.nan, None, np.nan, np.nan]) + expected = Series( + ["b", np.nan, "d", np.nan, np.nan, None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) @@ -696,7 +698,7 @@ def test_get_mixed_object(): def test_get_bounds(idx): ser = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"]) result = ser.str.split("_").str.get(idx) - expected = Series(["3", "8", np.nan]) + expected = Series(["3", "8", np.nan], dtype=object) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index a88dcc8956931..0b3f368afea5e 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -8,6 +8,7 @@ DataFrame, Series, _testing as tm, + option_context, ) @@ -56,7 +57,8 @@ def test_string_array(nullable_string_dtype, any_string_method): columns = expected.select_dtypes(include="object").columns assert all(result[columns].dtypes == nullable_string_dtype) result[columns] = result[columns].astype(object) - expected[columns] = expected[columns].fillna(NA) # GH#18463 + with option_context("future.no_silent_downcasting", True): + expected[columns] = expected[columns].fillna(NA) # GH#18463 tm.assert_equal(result, expected) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 4315835b70a40..f662dfd7e2b14 100644 --- 
a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -76,7 +76,8 @@ def test_repeat_mixed_object(): ser = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) result = ser.str.repeat(3) expected = Series( - ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", None, np.nan, np.nan] + ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -270,7 +271,8 @@ def test_spilt_join_roundtrip_mixed_object(): ) result = ser.str.split("_").str.join("_") expected = Series( - ["a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", None, np.nan, np.nan] + ["a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -398,7 +400,7 @@ def test_slice(start, stop, step, expected, any_string_dtype): def test_slice_mixed_object(start, stop, step, expected): ser = Series(["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0]) result = ser.str.slice(start, stop, step) - expected = Series(expected) + expected = Series(expected, dtype=object) tm.assert_series_equal(result, expected) @@ -453,7 +455,7 @@ def test_strip_lstrip_rstrip_mixed_object(method, exp): ser = Series([" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0]) result = getattr(ser.str, method)() - expected = Series(exp + [np.nan, np.nan, None, np.nan, np.nan]) + expected = Series(exp + [np.nan, np.nan, None, np.nan, np.nan], dtype=object) tm.assert_series_equal(result, expected) @@ -529,7 +531,7 @@ def test_string_slice_out_of_bounds(any_string_dtype): def test_encode_decode(any_string_dtype): ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8") result = ser.str.decode("utf-8") - expected = ser.map(lambda x: x.decode("utf-8")) + expected = ser.map(lambda x: x.decode("utf-8")).astype(object) tm.assert_series_equal(result, expected) @@ -559,7 +561,7 @@ def test_decode_errors_kwarg(): ser.str.decode("cp1252") result = ser.str.decode("cp1252", "ignore") - expected = ser.map(lambda x: x.decode("cp1252", "ignore")) + expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype(object) tm.assert_series_equal(result, expected) @@ -672,7 +674,7 @@ def test_str_accessor_in_apply_func(): def test_zfill(): # https://github.com/pandas-dev/pandas/issues/20868 value = Series(["-1", "1", "1000", 10, np.nan]) - expected = Series(["-01", "001", "1000", np.nan, np.nan]) + expected = Series(["-01", "001", "1000", np.nan, np.nan], dtype=object) tm.assert_series_equal(value.str.zfill(3), expected) value = Series(["-2", "+5"]) @@ -704,10 +706,10 @@ def test_get_with_dict_label(): ] ) result = s.str.get("name") - expected = Series(["Hello", "Goodbye", None]) + expected = Series(["Hello", "Goodbye", None], dtype=object) tm.assert_series_equal(result, expected) result = s.str.get("value") - expected = Series(["World", "Planet", "Sea"]) + expected = Series(["World", "Planet", "Sea"], dtype=object) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 5356704cc64a2..718d1b3ee2e83 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -992,6 +992,45 @@ def test_large(self): expected[1] = True tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]", "M8[ns, UTC]", "period[D]"]) + def test_isin_datetimelike_all_nat(self, dtype): + # GH#56427 + dta = date_range("2013-01-01", 
periods=3)._values + arr = Series(dta.view("i8")).array.view(dtype) + + arr[0] = NaT + result = algos.isin(arr, [NaT]) + expected = np.array([True, False, False], dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]", "M8[ns, UTC]"]) + def test_isin_datetimelike_strings_deprecated(self, dtype): + # GH#53111 + dta = date_range("2013-01-01", periods=3)._values + arr = Series(dta.view("i8")).array.view(dtype) + + vals = [str(x) for x in arr] + msg = "The behavior of 'isin' with dtype=.* is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = algos.isin(arr, vals) + assert res.all() + + vals2 = np.array(vals, dtype=str) + with tm.assert_produces_warning(FutureWarning, match=msg): + res2 = algos.isin(arr, vals2) + assert res2.all() + + def test_isin_dt64tz_with_nat(self): + # the all-NaT values used to get inferred to tznaive, which was evaluated + # as non-matching GH#56427 + dti = date_range("2016-01-01", periods=3, tz="UTC") + ser = Series(dti) + ser[0] = NaT + + res = algos.isin(ser._values, [NaT]) + exp = np.array([True, False, False], dtype=bool) + tm.assert_numpy_array_equal(res, exp) + def test_categorical_from_codes(self): # GH 16639 vals = np.array([0, 1, 2, 0]) @@ -1798,57 +1837,6 @@ def test_pct_max_many_rows(self): assert result == 1 -def test_int64_add_overflow(): - # see gh-14068 - msg = "Overflow in int64 addition" - m = np.iinfo(np.int64).max - n = np.iinfo(np.int64).min - - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, m]), m) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, m]), np.array([m, m])) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([n, n]), n) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([n, n]), np.array([n, n])) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, n]), np.array([n, n])) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr( - np.array([m, m]), np.array([m, m]), arr_mask=np.array([False, True]) - ) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr( - np.array([m, m]), np.array([m, m]), b_mask=np.array([False, True]) - ) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr( - np.array([m, m]), - np.array([m, m]), - arr_mask=np.array([False, True]), - b_mask=np.array([False, True]), - ) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, m]), np.array([np.nan, m])) - - # Check that the nan boolean arrays override whether or not - # the addition overflows. We don't check the result but just - # the fact that an OverflowError is not raised. 
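# Illustrative sketch, not from the patch: the overflow guard that the
# test_int64_add_overflow block being deleted here exercised, expressed with
# plain numpy. Two int64 operands overflow exactly when they share a sign and
# the wrapped sum's sign flips. The helper name below is hypothetical.
import numpy as np

def add_checked(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    with np.errstate(over="ignore"):  # int64 addition wraps around silently
        result = a + b
    overflow = ((a > 0) & (b > 0) & (result < 0)) | (
        (a < 0) & (b < 0) & (result >= 0)
    )
    if overflow.any():
        raise OverflowError("Overflow in int64 addition")
    return result

m = np.iinfo(np.int64).max
add_checked(np.array([1, 2]), np.array([3, 4]))    # returns array([4, 6])
# add_checked(np.array([m, m]), np.array([m, m]))  # would raise OverflowError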
- algos.checked_add_with_arr( - np.array([m, m]), np.array([m, m]), arr_mask=np.array([True, True]) - ) - algos.checked_add_with_arr( - np.array([m, m]), np.array([m, m]), b_mask=np.array([True, True]) - ) - algos.checked_add_with_arr( - np.array([m, m]), - np.array([m, m]), - arr_mask=np.array([True, False]), - b_mask=np.array([False, True]), - ) - - class TestMode: def test_no_mode(self): exp = Series([], dtype=np.float64, index=Index([], dtype=int)) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index f74fe459eb4d6..de5d67e6bd25f 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1040,7 +1040,7 @@ def test_to_datetime_now(self): # See GH#18666 with tm.set_timezone("US/Eastern"): # GH#18705 - now = Timestamp("now") + now = Timestamp("now").as_unit("ns") pdnow = to_datetime("now") pdnow2 = to_datetime(["now"])[0] @@ -1066,7 +1066,7 @@ def test_to_datetime_today(self, tz): pdtoday = to_datetime("today") pdtoday2 = to_datetime(["today"])[0] - tstoday = Timestamp("today") + tstoday = Timestamp("today").as_unit("ns") tstoday2 = Timestamp.today().as_unit("ns") # These should all be equal with infinite perf; this gives diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index b52bc78d58296..42d055326c2a5 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -54,10 +54,10 @@ def test_namespace(): "get_unit_from_dtype", "periods_per_day", "periods_per_second", - "is_supported_unit", - "get_supported_reso", - "npy_unit_to_abbrev", "guess_datetime_format", + "add_overflowsafe", + "get_supported_dtype", + "is_supported_dtype", ] expected = set(submodules + api) diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 425decc14251a..d8f23156bd4d4 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -6,6 +6,7 @@ from dateutil.parser import parse as du_parse from dateutil.tz import tzlocal +from hypothesis import given import numpy as np import pytest @@ -21,6 +22,7 @@ import pandas.util._test_decorators as td import pandas._testing as tm +from pandas._testing._hypothesis import DATETIME_NO_TZ @pytest.mark.skipif( @@ -367,3 +369,46 @@ def test_guess_datetime_format_f(input): result = parsing.guess_datetime_format(input) expected = "%Y-%m-%dT%H:%M:%S.%f" assert result == expected + + +def _helper_hypothesis_delimited_date(call, date_string, **kwargs): + msg, result = None, None + try: + result = call(date_string, **kwargs) + except ValueError as err: + msg = str(err) + return msg, result + + +@given(DATETIME_NO_TZ) +@pytest.mark.parametrize("delimiter", list(" -./")) +@pytest.mark.parametrize("dayfirst", [True, False]) +@pytest.mark.parametrize( + "date_format", + ["%d %m %Y", "%m %d %Y", "%m %Y", "%Y %m %d", "%y %m %d", "%Y%m%d", "%y%m%d"], +) +def test_hypothesis_delimited_date( + request, date_format, dayfirst, delimiter, test_datetime +): + if date_format == "%m %Y" and delimiter == ".": + request.applymarker( + pytest.mark.xfail( + reason="parse_datetime_string cannot reliably tell whether " + "e.g. 
%m.%Y is a float or a date" + ) + ) + date_string = test_datetime.strftime(date_format.replace(" ", delimiter)) + + except_out_dateutil, result = _helper_hypothesis_delimited_date( + parsing.py_parse_datetime_string, date_string, dayfirst=dayfirst + ) + except_in_dateutil, expected = _helper_hypothesis_delimited_date( + du_parse, + date_string, + default=datetime(1, 1, 1), + dayfirst=dayfirst, + yearfirst=False, + ) + + assert except_out_dateutil == except_in_dateutil + assert result == expected diff --git a/pandas/tests/tslibs/test_period_asfreq.py b/pandas/tests/tslibs/test_period.py similarity index 92% rename from pandas/tests/tslibs/test_period_asfreq.py rename to pandas/tests/tslibs/test_period.py index 149817357fbd6..715e2d3da88db 100644 --- a/pandas/tests/tslibs/test_period_asfreq.py +++ b/pandas/tests/tslibs/test_period.py @@ -7,6 +7,7 @@ ) from pandas._libs.tslibs.period import ( extract_ordinals, + get_period_field_arr, period_asfreq, period_ordinal, ) @@ -114,3 +115,9 @@ def test_extract_ordinals_2d(self): res = extract_ordinals(arr, freq) res2 = extract_ordinals(arr.reshape(5, 2), freq) tm.assert_numpy_array_equal(res, res2.reshape(-1)) + + +def test_get_period_field_array_raises_on_out_of_range(): + msg = "Buffer dtype mismatch, expected 'const int64_t' but got 'double'" + with pytest.raises(ValueError, match=msg): + get_period_field_arr(-1, np.empty(1), 0) diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 0a6a852bb0f85..98b55f8d690cf 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -228,11 +228,12 @@ def validate_pep8(self): file.name, ] response = subprocess.run(cmd, capture_output=True, check=False, text=True) - stdout = response.stdout - stdout = stdout.replace(file.name, "") - messages = stdout.strip("\n").splitlines() - if messages: - error_messages.extend(messages) + for output in ("stdout", "stderr"): + out = getattr(response, output) + out = out.replace(file.name, "") + messages = out.strip("\n").splitlines() + if messages: + error_messages.extend(messages) finally: file.close() os.unlink(file.name) diff --git a/web/pandas_web.py b/web/pandas_web.py index 1cd3be456bfe0..407debf5828be 100755 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -175,7 +175,9 @@ def maintainers_add_info(context): context["maintainers"]["active"] + context["maintainers"]["inactive"] ): resp = requests.get( - f"https://api.github.com/users/{user}", headers=GITHUB_API_HEADERS + f"https://api.github.com/users/{user}", + headers=GITHUB_API_HEADERS, + timeout=5, ) if resp.status_code == 403: sys.stderr.write( @@ -184,7 +186,7 @@ def maintainers_add_info(context): # if we exceed github api quota, we use the github info # of maintainers saved with the website resp_bkp = requests.get( - context["main"]["production_url"] + "maintainers.json" + context["main"]["production_url"] + "maintainers.json", timeout=5 ) resp_bkp.raise_for_status() maintainers_info = resp_bkp.json() @@ -214,10 +216,13 @@ def home_add_releases(context): resp = requests.get( f"https://api.github.com/repos/{github_repo_url}/releases", headers=GITHUB_API_HEADERS, + timeout=5, ) if resp.status_code == 403: sys.stderr.write("WARN: GitHub API quota exceeded when fetching releases\n") - resp_bkp = requests.get(context["main"]["production_url"] + "releases.json") + resp_bkp = requests.get( + context["main"]["production_url"] + "releases.json", timeout=5 + ) resp_bkp.raise_for_status() releases = resp_bkp.json() else: @@ -302,10 +307,13 @@ def roadmap_pdeps(context): 
"https://api.github.com/search/issues?" f"q=is:pr is:open label:PDEP repo:{github_repo_url}", headers=GITHUB_API_HEADERS, + timeout=5, ) if resp.status_code == 403: sys.stderr.write("WARN: GitHub API quota exceeded when fetching pdeps\n") - resp_bkp = requests.get(context["main"]["production_url"] + "pdeps.json") + resp_bkp = requests.get( + context["main"]["production_url"] + "pdeps.json", timeout=5 + ) resp_bkp.raise_for_status() pdeps = resp_bkp.json() else: