From e22bf5b4d1e87f95d7e910ea236495701a89598d Mon Sep 17 00:00:00 2001 From: Quang Nguyen Date: Wed, 25 Mar 2020 00:44:25 +0700 Subject: [PATCH 1/9] enable utc and %z and add test --- pandas/core/tools/datetimes.py | 9 +++++---- pandas/tests/tools/test_to_datetime.py | 5 +++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 7414165ab5711..a5bdf8e815db7 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -229,13 +229,14 @@ def _return_parsed_timezone_results(result, timezones, tz, name): ------- tz_result : Index-like of parsed dates with timezone """ - if tz is not None: - raise ValueError( - "Cannot pass a tz argument when parsing strings with timezone information." - ) tz_results = np.array( [Timestamp(res).tz_localize(zone) for res, zone in zip(result, timezones)] ) + if tz is not None: + # GH 32792 + tz_results = np.array( + [tz_result.astimezone(tz) for tz_result in tz_results] + ) from pandas import Index return Index(tz_results, name=name) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 6689021392a92..88fe3a341c6a2 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -323,8 +323,9 @@ def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, expected_dates): expected = pd.Index(expected_dates) tm.assert_equal(result, expected) - with pytest.raises(ValueError): - pd.to_datetime(dates, format=fmt, utc=True) + # GH 32792 + result = pd.to_datetime(dates, format=fmt, utc=True) + assert result.dtype._tz is pytz.UTC @pytest.mark.parametrize( "offset", ["+0", "-1foo", "UTCbar", ":10", "+01:000:01", ""] From bd0e53551a978fc3db80a82b4eb69d784b07fb7d Mon Sep 17 00:00:00 2001 From: Quang Nguyen Date: Wed, 25 Mar 2020 00:46:18 +0700 Subject: [PATCH 2/9] reformat --- pandas/core/tools/datetimes.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index a5bdf8e815db7..3490abfb68b81 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -234,9 +234,7 @@ def _return_parsed_timezone_results(result, timezones, tz, name): ) if tz is not None: # GH 32792 - tz_results = np.array( - [tz_result.astimezone(tz) for tz_result in tz_results] - ) + tz_results = np.array([tz_result.astimezone(tz) for tz_result in tz_results]) from pandas import Index return Index(tz_results, name=name) From 809adf3ec07344835f1790ce21ed292b9a99209a Mon Sep 17 00:00:00 2001 From: Quang Nguyen Date: Wed, 25 Mar 2020 12:13:53 +0700 Subject: [PATCH 3/9] add tests and asv --- asv_bench/benchmarks/timeseries.py | 18 +++++++++++++++++- pandas/tests/tools/test_to_datetime.py | 2 ++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 6c9f8ee77e5ad..da27680afcb48 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -336,14 +336,30 @@ def time_infer_quarter(self): class ToDatetimeFormat: def setup(self): - self.s = Series(["19MAY11", "19MAY11:00:00:00"] * 100000) + N = 100000 + self.s = Series(["19MAY11", "19MAY11:00:00:00"] * N) self.s2 = self.s.str.replace(":\\S+$", "") + self.same_offset = ['10/11/2018 00:00:00.045-07:00'] * N + self.diff_offset = [f'10/11/2018 00:00:00.045-0{offset}:00' for offset in range(10)] * int(N / 10) + def time_exact(self): to_datetime(self.s2, format="%d%b%y") def time_no_exact(self): to_datetime(self.s, format="%d%b%y", exact=False) + + def time_same_offset(self): + to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z") + + def time_different_offset(self): + to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z") + + def time_same_offset_to_utc(self): + to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True) + + def time_different_offset_to_utc(self): + to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True) class ToDatetimeCache: diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 88fe3a341c6a2..494ebb5b5d688 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -325,6 +325,8 @@ def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, expected_dates): # GH 32792 result = pd.to_datetime(dates, format=fmt, utc=True) + expected = pd.Index([date.astimezone('UTC') for date in expected_dates]) + tm.assert_equal(result, expected) assert result.dtype._tz is pytz.UTC @pytest.mark.parametrize( From 9dbef1f272a5d37bd8eb2d277e1afe986171f4bc Mon Sep 17 00:00:00 2001 From: Quang Nguyen Date: Wed, 25 Mar 2020 12:23:17 +0700 Subject: [PATCH 4/9] use tz_convert --- pandas/core/tools/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 3490abfb68b81..d6c0e661219fd 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -234,7 +234,7 @@ def _return_parsed_timezone_results(result, timezones, tz, name): ) if tz is not None: # GH 32792 - tz_results = np.array([tz_result.astimezone(tz) for tz_result in tz_results]) + tz_results = np.array([tz_result.tz_convert(tz) for tz_result in tz_results]) from pandas import Index return Index(tz_results, name=name) From 10ea27109b1d920e4135c26f70e809f53b49138b Mon Sep 17 00:00:00 2001 From: Quang Nguyen Date: Wed, 25 Mar 2020 12:32:37 +0700 Subject: [PATCH 5/9] whatsnew --- asv_bench/benchmarks/timeseries.py | 6 ++++-- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/tests/tools/test_to_datetime.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index da27680afcb48..ca1ebb13f64da 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -341,14 +341,16 @@ def setup(self): self.s2 = self.s.str.replace(":\\S+$", "") self.same_offset = ['10/11/2018 00:00:00.045-07:00'] * N - self.diff_offset = [f'10/11/2018 00:00:00.045-0{offset}:00' for offset in range(10)] * int(N / 10) + self.diff_offset = [ + f'10/11/2018 00:00:00.045-0{offset}:00' for offset in range(10) + ] * int(N / 10) def time_exact(self): to_datetime(self.s2, format="%d%b%y") def time_no_exact(self): to_datetime(self.s, format="%d%b%y", exact=False) - + def time_same_offset(self): to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z") diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 692df075f25cb..4b9051fbe98eb 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -70,6 +70,7 @@ Other enhancements - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`) - Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`) - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) +- :func:`to_datetime` will now allow parsing formats with ``%z`` while ``utc=True`` (:issue:`32792`) - .. --------------------------------------------------------------------------- diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 494ebb5b5d688..eafbe80428a8b 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -325,7 +325,7 @@ def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, expected_dates): # GH 32792 result = pd.to_datetime(dates, format=fmt, utc=True) - expected = pd.Index([date.astimezone('UTC') for date in expected_dates]) + expected = pd.DatetimeIndex([date.astimezone("UTC") for date in expected_dates]) tm.assert_equal(result, expected) assert result.dtype._tz is pytz.UTC From e141f4f5003e68a0bc15f97581d85a6da7bc376b Mon Sep 17 00:00:00 2001 From: Quang Nguyen Date: Wed, 25 Mar 2020 13:27:14 +0700 Subject: [PATCH 6/9] run black on asvbench --- asv_bench/benchmarks/timeseries.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index ca1ebb13f64da..e912fe1104f9f 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -340,9 +340,9 @@ def setup(self): self.s = Series(["19MAY11", "19MAY11:00:00:00"] * N) self.s2 = self.s.str.replace(":\\S+$", "") - self.same_offset = ['10/11/2018 00:00:00.045-07:00'] * N + self.same_offset = ["10/11/2018 00:00:00.045-07:00"] * N self.diff_offset = [ - f'10/11/2018 00:00:00.045-0{offset}:00' for offset in range(10) + f"10/11/2018 00:00:00.045-0{offset}:00" for offset in range(10) ] * int(N / 10) def time_exact(self): From 2e68d1f1152fccf978b259691d4ec82487694e9c Mon Sep 17 00:00:00 2001 From: Quang Nguyen Date: Wed, 25 Mar 2020 20:46:48 +0700 Subject: [PATCH 7/9] fix to suggestions --- asv_bench/benchmarks/timeseries.py | 2 +- doc/source/whatsnew/v1.1.0.rst | 16 ++++++++++++++++ pandas/core/tools/datetimes.py | 2 +- pandas/tests/tools/test_to_datetime.py | 17 ++++++++++++++++- 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index e912fe1104f9f..b494dbd8a38fa 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -355,7 +355,7 @@ def time_same_offset(self): to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z") def time_different_offset(self): - to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z") + to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z") def time_same_offset_to_utc(self): to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 4b9051fbe98eb..14899a7bdfae2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -58,6 +58,22 @@ For example: For more on working with fold, see :ref:`Fold subsection ` in the user guide. +.. _whatsnew_110.to_datetime_multiple_tzname_tzoffset_support: + +Parsing timezone-aware format with different timezones in to_datetime +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`to_datetime` now supports parsing formats containing timezone names (``%Z``) and UTC offsets (``%z``) from different timezones then converting them to UTC by setting ``utc=True``. This would returns a ``DatetimeIndex`` with timezone at UTC as opposed to an ``Index`` with ``object`` dtype if ``utc=True`` is not set (:issue:`32792`). + +For example: + +.. ipython:: python + + tz_strs = ["2010-01-01 12:00:00 +0100", "2010-01-01 12:00:00 -0100", + "2010-01-01 12:00:00 +0300", "2010-01-01 12:00:00 +0400"] + pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z', utc=True) + pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z') + .. _whatsnew_110.enhancements.other: Other enhancements diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index d6c0e661219fd..b922e1b318622 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -233,7 +233,7 @@ def _return_parsed_timezone_results(result, timezones, tz, name): [Timestamp(res).tz_localize(zone) for res, zone in zip(result, timezones)] ) if tz is not None: - # GH 32792 + # Convert to the same tz tz_results = np.array([tz_result.tz_convert(tz) for tz_result in tz_results]) from pandas import Index diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index eafbe80428a8b..d09951705240d 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -323,9 +323,24 @@ def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, expected_dates): expected = pd.Index(expected_dates) tm.assert_equal(result, expected) + def test_to_datetime_parse_tzname_or_tzoffset_different_tz_to_utc(self): # GH 32792 + dates = [ + "2010-01-01 12:00:00 +0100", + "2010-01-01 12:00:00 -0100", + "2010-01-01 12:00:00 +0300", + "2010-01-01 12:00:00 +0400", + ] + expected_dates = [ + "2010-01-01 11:00:00+00:00", + "2010-01-01 13:00:00+00:00", + "2010-01-01 09:00:00+00:00", + "2010-01-01 08:00:00+00:00", + ] + fmt = "%Y-%m-%d %H:%M:%S %z" + result = pd.to_datetime(dates, format=fmt, utc=True) - expected = pd.DatetimeIndex([date.astimezone("UTC") for date in expected_dates]) + expected = pd.DatetimeIndex(expected_dates) tm.assert_equal(result, expected) assert result.dtype._tz is pytz.UTC From 5c0cd808100d4b87816b9bc35008f74c7ef69722 Mon Sep 17 00:00:00 2001 From: Quang Nguyen Date: Wed, 25 Mar 2020 20:49:36 +0700 Subject: [PATCH 8/9] --- doc/source/whatsnew/v1.1.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 14899a7bdfae2..57b17268e3413 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -86,7 +86,6 @@ Other enhancements - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`) - Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`) - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) -- :func:`to_datetime` will now allow parsing formats with ``%z`` while ``utc=True`` (:issue:`32792`) - .. --------------------------------------------------------------------------- From a12164e4a2a159f985b81cceda1b1ad97ea0b8dc Mon Sep 17 00:00:00 2001 From: Quang Nguyen Date: Sun, 29 Mar 2020 10:08:53 +0700 Subject: [PATCH 9/9] fix doc and test --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/tests/tools/test_to_datetime.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 57b17268e3413..83df71520ba2f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -63,7 +63,7 @@ For more on working with fold, see :ref:`Fold subsection ` in t Parsing timezone-aware format with different timezones in to_datetime ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:func:`to_datetime` now supports parsing formats containing timezone names (``%Z``) and UTC offsets (``%z``) from different timezones then converting them to UTC by setting ``utc=True``. This would returns a ``DatetimeIndex`` with timezone at UTC as opposed to an ``Index`` with ``object`` dtype if ``utc=True`` is not set (:issue:`32792`). +:func:`to_datetime` now supports parsing formats containing timezone names (``%Z``) and UTC offsets (``%z``) from different timezones then converting them to UTC by setting ``utc=True``. This would return a :class:`DatetimeIndex` with timezone at UTC as opposed to an :class:`Index` with ``object`` dtype if ``utc=True`` is not set (:issue:`32792`). For example: diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index d09951705240d..a751182dbf7af 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -341,8 +341,7 @@ def test_to_datetime_parse_tzname_or_tzoffset_different_tz_to_utc(self): result = pd.to_datetime(dates, format=fmt, utc=True) expected = pd.DatetimeIndex(expected_dates) - tm.assert_equal(result, expected) - assert result.dtype._tz is pytz.UTC + tm.assert_index_equal(result, expected) @pytest.mark.parametrize( "offset", ["+0", "-1foo", "UTCbar", ":10", "+01:000:01", ""]