Skip to content

Commit 6d3fb3f

Browse files
committed
Merge pull request #4828 from jreback/dunit2
BUG: enhanced to_datetime with format '%Y%m%d' to handle NaT/nan better
2 parents 41dbca6 + 13a7ebc commit 6d3fb3f

File tree

4 files changed

+52
-15
lines changed

4 files changed

+52
-15
lines changed

doc/source/release.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ Improvements to existing features
105105
test to vbench (:issue:`4705` and :issue:`4722`)
106106
- Add ``axis`` and ``level`` keywords to ``where``, so that the ``other`` argument
107107
can now be an alignable pandas object.
108-
- ``to_datetime`` with a format of 'YYYYMMDD' now parses much faster
108+
- ``to_datetime`` with a format of '%Y%m%d' now parses much faster
109109

110110
API Changes
111111
~~~~~~~~~~~

pandas/tseries/tests/test_timeseries.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -845,9 +845,19 @@ def test_to_datetime_format_YYYYMMDD(self):
845845
assert_series_equal(result, expected)
846846

847847
# with NaT
848+
expected = Series([Timestamp("19801222"),Timestamp("19801222")] + [Timestamp("19810105")]*5)
849+
expected[2] = np.nan
848850
s[2] = np.nan
849-
self.assertRaises(ValueError, to_datetime, s,format='%Y%m%d')
850-
self.assertRaises(ValueError, to_datetime, s.apply(str),format='%Y%m%d')
851+
852+
result = to_datetime(s,format='%Y%m%d')
853+
assert_series_equal(result, expected)
854+
855+
# string with NaT
856+
s = s.apply(str)
857+
s[2] = 'nat'
858+
result = to_datetime(s,format='%Y%m%d')
859+
assert_series_equal(result, expected)
860+
851861

852862
def test_to_datetime_format_microsecond(self):
853863
val = '01-Apr-2011 00:00:01.978'

pandas/tseries/tools.py

+38-2
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,7 @@ def _convert_listlike(arg, box):
106106
# shortcut formatting here
107107
if format == '%Y%m%d':
108108
try:
109-
carg = arg.astype(np.int64).astype(object)
110-
result = lib.try_parse_year_month_day(carg/10000,carg/100 % 100, carg % 100)
109+
result = _attempt_YYYYMMDD(arg)
111110
except:
112111
raise ValueError("cannot convert the input to '%Y%m%d' date format")
113112

@@ -144,6 +143,43 @@ def _convert_listlike(arg, box):
144143
class DateParseError(ValueError):
145144
pass
146145

146+
def _attempt_YYYYMMDD(arg):
    """Try to parse values laid out as YYYYMMDD (the '%Y%m%d' format).

    Three strategies are attempted in order, each one only if the previous
    one failed:

    1. treat every value as int-like (ints, or strings holding ints);
    2. treat the values as floats, mapping actual NaN entries to NaT;
    3. treat the values as strings, mapping NaT-like strings (members of
       ``tslib._nat_strings``) to NaT.

    Parameters
    ----------
    arg : ndarray
        Expected to arrive as an object-dtype array whose elements may
        really be ints, strings (possibly NaT-like) or floats (possibly
        NaN) -- NOTE(review): dtype is per the original author's note,
        confirm against the caller.

    Returns
    -------
    The parsed values: the direct output of
    ``lib.try_parse_year_month_day`` for strategy 1, a ``datetime64[ns]``
    array with ``tslib.iNaT`` in the missing slots for strategies 2 and 3,
    or ``None`` when none of the strategies could convert the input.
    """
    # Failures that simply mean "this strategy does not fit the data".
    # Deliberately narrower than a bare ``except`` so KeyboardInterrupt /
    # SystemExit are never swallowed.
    conversion_errors = (ValueError, TypeError, OverflowError)

    def calc(carg):
        # Split each packed YYYYMMDD integer into year / month / day.
        # Floor division keeps the components integral on Python 2 and 3.
        carg = carg.astype(object)
        return lib.try_parse_year_month_day(carg // 10000,
                                            carg // 100 % 100,
                                            carg % 100)

    def calc_with_mask(carg, mask):
        # ``mask`` is True where a real value is present; every other slot
        # is filled with iNaT through the int64 view of the result.
        result = np.empty(carg.shape, dtype='M8[ns]')
        iresult = result.view('i8')
        # ``~`` (not unary minus) is the supported way to invert a boolean
        # mask; ``-mask`` raises TypeError on current NumPy.
        iresult[~mask] = tslib.iNaT
        valid = carg[mask].astype(np.float64).astype(np.int64)
        result[mask] = calc(valid).astype('M8[ns]')
        return result

    # try intlike / strings that are ints
    try:
        return calc(arg.astype(np.int64))
    except conversion_errors:
        pass

    # a float with actual np.nan
    try:
        carg = arg.astype(np.float64)
        return calc_with_mask(carg, com.notnull(carg))
    except conversion_errors:
        pass

    # string with NaN-like
    try:
        mask = ~lib.ismember(arg, tslib._nat_strings)
        return calc_with_mask(arg, mask)
    except conversion_errors:
        pass

    return None
147183

148184
# patterns for quarters like '4Q2005', '05Q1'
149185
qpat1full = re.compile(r'(\d)Q(\d\d\d\d)')

vb_suite/timeseries.py

+1-10
Original file line numberDiff line numberDiff line change
@@ -154,16 +154,7 @@ def date_range(start=None, end=None, periods=None, freq=None):
154154

155155
timeseries_to_datetime_YYYYMMDD = \
156156
Benchmark('to_datetime(strings,format="%Y%m%d")', setup,
157-
start_date=datetime(2013, 9, 1))
158-
159-
setup = common_setup + """
160-
rng = date_range('1/1/2000', periods=10000, freq='D')
161-
strings = Series(rng.year*10000+rng.month*100+rng.day,dtype=np.int64).apply(str)
162-
"""
163-
164-
timeseries_to_datetime_YYYYMMDD_old = \
165-
Benchmark('pandas.tslib.array_strptime(strings.values,"%Y%m%d")', setup,
166-
start_date=datetime(2013, 9, 1))
157+
start_date=datetime(2012, 7, 1))
167158

168159
# ---- infer_freq
169160
# infer_freq

0 commit comments

Comments
 (0)