From 2c4356eee5587fddaf3e1c1fa23e9fafe25ad79a Mon Sep 17 00:00:00 2001
From: jreback
Date: Thu, 12 Sep 2013 12:55:43 -0400
Subject: [PATCH] PERF: much faster to_datetime performance with a format of '%Y%m%d'

---
 doc/source/release.rst                  |  1 +
 pandas/tseries/tests/test_timeseries.py | 15 +++++++++++++++
 pandas/tseries/tools.py                 | 14 +++++++++++++-
 vb_suite/timeseries.py                  | 18 ++++++++++++++++++
 4 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 75194f6877a6e..5376e0396799e 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -105,6 +105,7 @@ Improvements to existing features
     test to vbench (:issue:`4705` and :issue:`4722`)
   - Add ``axis`` and ``level`` keywords to ``where``, so that the ``other`` argument
     can now be an alignable pandas object.
+  - ``to_datetime`` with a format of 'YYYYMMDD' now parses much faster
 
 API Changes
 ~~~~~~~~~~~
diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
index b5697a98de412..c9e643e25b761 100644
--- a/pandas/tseries/tests/test_timeseries.py
+++ b/pandas/tseries/tests/test_timeseries.py
@@ -834,6 +834,21 @@ def test_to_datetime_format(self):
             else:
                 self.assert_(result.equals(expected))
 
+    def test_to_datetime_format_YYYYMMDD(self):
+        s = Series([19801222,19801222] + [19810105]*5)
+        expected = Series([ Timestamp(x) for x in s.apply(str) ])
+
+        result = to_datetime(s,format='%Y%m%d')
+        assert_series_equal(result, expected)
+
+        result = to_datetime(s.apply(str),format='%Y%m%d')
+        assert_series_equal(result, expected)
+
+        # with NaT
+        s[2] = np.nan
+        self.assertRaises(ValueError, to_datetime, s,format='%Y%m%d')
+        self.assertRaises(ValueError, to_datetime, s.apply(str),format='%Y%m%d')
+
     def test_to_datetime_format_microsecond(self):
         val = '01-Apr-2011 00:00:01.978'
         format = '%d-%b-%Y %H:%M:%S.%f'
diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py
index 3087d54396691..cca4850c2c1bf 100644
--- a/pandas/tseries/tools.py
+++ b/pandas/tseries/tools.py
@@ -101,7 +101,19 @@ def _convert_listlike(arg, box):
         arg = com._ensure_object(arg)
         try:
             if format is not None:
-                result = tslib.array_strptime(arg, format)
+                result = None
+
+                # shortcut formatting here
+                if format == '%Y%m%d':
+                    try:
+                        carg = arg.astype(np.int64).astype(object)
+                        result = lib.try_parse_year_month_day(carg/10000,carg/100 % 100, carg % 100)
+                    except:
+                        raise ValueError("cannot convert the input to '%Y%m%d' date format")
+
+                # fallback
+                if result is None:
+                    result = tslib.array_strptime(arg, format)
             else:
                 result = tslib.array_to_datetime(arg, raise_=errors == 'raise',
                                                  utc=utc, dayfirst=dayfirst,
diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py
index 4dd1dd2e96bdd..999c3869daf62 100644
--- a/vb_suite/timeseries.py
+++ b/vb_suite/timeseries.py
@@ -147,6 +147,24 @@ def date_range(start=None, end=None, periods=None, freq=None):
     Benchmark('to_datetime(strings)', setup,
               start_date=datetime(2012, 7, 11))
 
+setup = common_setup + """
+rng = date_range('1/1/2000', periods=10000, freq='D')
+strings = Series(rng.year*10000+rng.month*100+rng.day,dtype=np.int64).apply(str)
+"""
+
+timeseries_to_datetime_YYYYMMDD = \
+    Benchmark('to_datetime(strings,format="%Y%m%d")', setup,
+              start_date=datetime(2013, 9, 1))
+
+setup = common_setup + """
+rng = date_range('1/1/2000', periods=10000, freq='D')
+strings = Series(rng.year*10000+rng.month*100+rng.day,dtype=np.int64).apply(str)
+"""
+
+timeseries_to_datetime_YYYYMMDD_old = \
+    Benchmark('pandas.tslib.array_strptime(strings.values,"%Y%m%d")', setup,
+              start_date=datetime(2013, 9, 1))
+
 # ---- infer_freq
 # infer_freq