diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py index ab3832d0292ba..b4c3ff8008015 100644 --- a/pandas/_libs/__init__.py +++ b/pandas/_libs/__init__.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # flake8: noqa from .tslib import iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime diff --git a/pandas/_libs/src/datetime.pxd b/pandas/_libs/src/datetime.pxd index 23620e790c132..86c8f3bfc74f3 100644 --- a/pandas/_libs/src/datetime.pxd +++ b/pandas/_libs/src/datetime.pxd @@ -94,6 +94,7 @@ cdef extern from "datetime/np_datetime.h": PANDAS_DATETIMEUNIT fr, pandas_datetimestruct *result) nogil int days_per_month_table[2][12] + pandas_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS int dayofweek(int y, int m, int d) nogil int is_leapyear(int64_t year) nogil @@ -161,3 +162,17 @@ cdef inline int64_t _date_to_datetime64(object val, dts.hour = dts.min = dts.sec = dts.us = 0 dts.ps = dts.as = 0 return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, dts) + + +cdef inline bint check_dts_bounds(pandas_datetimestruct *dts): + """Returns True if an error needs to be raised""" + cdef: + bint error = False + + if (dts.year <= 1677 and + cmp_pandas_datetimestruct(dts, &_NS_MIN_DTS) == -1): + error = True + elif (dts.year >= 2262 and + cmp_pandas_datetimestruct(dts, &_NS_MAX_DTS) == 1): + error = True + return error diff --git a/pandas/_libs/src/datetime/np_datetime.c b/pandas/_libs/src/datetime/np_datetime.c index 8458418988863..ffb901981f939 100644 --- a/pandas/_libs/src/datetime/np_datetime.c +++ b/pandas/_libs/src/datetime/np_datetime.c @@ -40,6 +40,12 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask #endif +const pandas_datetimestruct _NS_MIN_DTS = { + 1677, 9, 21, 0, 12, 43, 145225, 0, 0}; +const pandas_datetimestruct _NS_MAX_DTS = { + 2262, 4, 11, 23, 47, 16, 854775, 807000, 0}; + + const int days_per_month_table[2][12] = { {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; diff --git a/pandas/_libs/src/datetime/np_datetime.h b/pandas/_libs/src/datetime/np_datetime.h index 97ec5782b625b..a20bff60126aa 100644 --- a/pandas/_libs/src/datetime/np_datetime.h +++ b/pandas/_libs/src/datetime/np_datetime.h @@ -54,6 +54,9 @@ typedef struct { int num; } pandas_datetime_metadata; +extern const pandas_datetimestruct _NS_MIN_DTS; +extern const pandas_datetimestruct _NS_MAX_DTS; + // stuff pandas needs // ---------------------------------------------------------------------------- diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index c629ccbd8e1fd..d4ca5af09367e 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -50,6 +50,7 @@ from datetime cimport ( npy_datetime, is_leapyear, dayofweek, + check_dts_bounds, PANDAS_FR_ns, PyDateTime_Check, PyDate_Check, PyDateTime_IMPORT, @@ -69,6 +70,7 @@ from khash cimport ( cimport cython import re +import time # dateutil compat from dateutil.tz import (tzoffset, tzlocal as _dateutil_tzlocal, @@ -1691,21 +1693,10 @@ class OutOfBoundsDatetime(ValueError): pass cdef inline _check_dts_bounds(pandas_datetimestruct *dts): - cdef: - bint error = False - - if dts.year <= 1677 and cmp_pandas_datetimestruct(dts, &_NS_MIN_DTS) == -1: - error = True - elif ( - dts.year >= 2262 and - cmp_pandas_datetimestruct(dts, &_NS_MAX_DTS) == 1): - error = True - - if error: + if check_dts_bounds(dts): fmt = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec) - raise OutOfBoundsDatetime( 'Out of bounds nanosecond timestamp: %s' % fmt) @@ -3515,284 +3506,6 @@ cpdef convert_to_timedelta64(object ts, object unit): return ts.astype('timedelta64[ns]') -def array_strptime(ndarray[object] values, object fmt, - bint exact=True, errors='raise'): - """ - Parameters - ---------- - values : ndarray of string-like objects - fmt : string-like regex - exact : matches must be exact if True, search if False - coerce : if invalid values found, coerce to NaT - """ - - cdef: - Py_ssize_t i, n = len(values) - pandas_datetimestruct dts - ndarray[int64_t] iresult - int year, month, day, minute, hour, second, weekday, julian, tz - int week_of_year, week_of_year_start - int64_t us, ns - object val, group_key, ampm, found - dict found_key - bint is_raise = errors=='raise' - bint is_ignore = errors=='ignore' - bint is_coerce = errors=='coerce' - - assert is_raise or is_ignore or is_coerce - - global _TimeRE_cache, _regex_cache - with _cache_lock: - if _getlang() != _TimeRE_cache.locale_time.lang: - _TimeRE_cache = TimeRE() - _regex_cache.clear() - if len(_regex_cache) > _CACHE_MAX_SIZE: - _regex_cache.clear() - locale_time = _TimeRE_cache.locale_time - format_regex = _regex_cache.get(fmt) - if not format_regex: - try: - format_regex = _TimeRE_cache.compile(fmt) - # KeyError raised when a bad format is found; can be specified as - # \\, in which case it was a stray % but with a space after it - except KeyError, err: - bad_directive = err.args[0] - if bad_directive == "\\": - bad_directive = "%" - del err - raise ValueError("'%s' is a bad directive in format '%s'" % - (bad_directive, fmt)) - # IndexError only occurs when the format string is "%" - except IndexError: - raise ValueError("stray %% in format '%s'" % fmt) - _regex_cache[fmt] = format_regex - - result = np.empty(n, dtype='M8[ns]') - iresult = result.view('i8') - - dts.us = dts.ps = dts.as = 0 - - cdef dict _parse_code_table = { - 'y': 0, - 'Y': 1, - 'm': 2, - 'B': 3, - 'b': 4, - 'd': 5, - 'H': 6, - 'I': 7, - 'M': 8, - 'S': 9, - 'f': 10, - 'A': 11, - 'a': 12, - 'w': 13, - 'j': 14, - 'U': 15, - 'W': 16, - 'Z': 17, - 'p': 18 # just an additional key, works only with I - } - cdef int parse_code - - for i in range(n): - val = values[i] - if util.is_string_object(val): - if val in _nat_strings: - iresult[i] = NPY_NAT - continue - else: - if _checknull_with_nat(val): - iresult[i] = NPY_NAT - continue - else: - val = str(val) - - # exact matching - if exact: - found = format_regex.match(val) - if not found: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise ValueError("time data %r does not match " - "format %r (match)" % (values[i], fmt)) - if len(val) != found.end(): - if is_coerce: - iresult[i] = NPY_NAT - continue - raise ValueError("unconverted data remains: %s" % - values[i][found.end():]) - - # search - else: - found = format_regex.search(val) - if not found: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise ValueError("time data %r does not match format " - "%r (search)" % (values[i], fmt)) - - year = 1900 - month = day = 1 - hour = minute = second = ns = us = 0 - tz = -1 - # Default to -1 to signify that values not known; not critical to have, - # though - week_of_year = -1 - week_of_year_start = -1 - # weekday and julian defaulted to -1 so as to signal need to calculate - # values - weekday = julian = -1 - found_dict = found.groupdict() - for group_key in found_dict.iterkeys(): - # Directives not explicitly handled below: - # c, x, X - # handled by making out of other directives - # U, W - # worthless without day of the week - parse_code = _parse_code_table[group_key] - - if parse_code == 0: - year = int(found_dict['y']) - # Open Group specification for strptime() states that a %y - #value in the range of [00, 68] is in the century 2000, while - #[69,99] is in the century 1900 - if year <= 68: - year += 2000 - else: - year += 1900 - elif parse_code == 1: - year = int(found_dict['Y']) - elif parse_code == 2: - month = int(found_dict['m']) - elif parse_code == 3: - # elif group_key == 'B': - month = locale_time.f_month.index(found_dict['B'].lower()) - elif parse_code == 4: - # elif group_key == 'b': - month = locale_time.a_month.index(found_dict['b'].lower()) - elif parse_code == 5: - # elif group_key == 'd': - day = int(found_dict['d']) - elif parse_code == 6: - # elif group_key == 'H': - hour = int(found_dict['H']) - elif parse_code == 7: - hour = int(found_dict['I']) - ampm = found_dict.get('p', '').lower() - # If there was no AM/PM indicator, we'll treat this like AM - if ampm in ('', locale_time.am_pm[0]): - # We're in AM so the hour is correct unless we're - # looking at 12 midnight. - # 12 midnight == 12 AM == hour 0 - if hour == 12: - hour = 0 - elif ampm == locale_time.am_pm[1]: - # We're in PM so we need to add 12 to the hour unless - # we're looking at 12 noon. - # 12 noon == 12 PM == hour 12 - if hour != 12: - hour += 12 - elif parse_code == 8: - minute = int(found_dict['M']) - elif parse_code == 9: - second = int(found_dict['S']) - elif parse_code == 10: - s = found_dict['f'] - # Pad to always return nanoseconds - s += "0" * (9 - len(s)) - us = long(s) - ns = us % 1000 - us = us / 1000 - elif parse_code == 11: - weekday = locale_time.f_weekday.index(found_dict['A'].lower()) - elif parse_code == 12: - weekday = locale_time.a_weekday.index(found_dict['a'].lower()) - elif parse_code == 13: - weekday = int(found_dict['w']) - if weekday == 0: - weekday = 6 - else: - weekday -= 1 - elif parse_code == 14: - julian = int(found_dict['j']) - elif parse_code == 15 or parse_code == 16: - week_of_year = int(found_dict[group_key]) - if group_key == 'U': - # U starts week on Sunday. - week_of_year_start = 6 - else: - # W starts week on Monday. - week_of_year_start = 0 - elif parse_code == 17: - # Since -1 is default value only need to worry about setting tz - # if it can be something other than -1. - found_zone = found_dict['Z'].lower() - for value, tz_values in enumerate(locale_time.timezone): - if found_zone in tz_values: - # Deal w/ bad locale setup where timezone names are the - # same and yet time.daylight is true; too ambiguous to - # be able to tell what timezone has daylight savings - if (time.tzname[0] == time.tzname[1] and - time.daylight and found_zone not in ( - "utc", "gmt")): - break - else: - tz = value - break - # If we know the wk of the year and what day of that wk, we can figure - # out the Julian day of the year. - if julian == -1 and week_of_year != -1 and weekday != -1: - week_starts_Mon = True if week_of_year_start == 0 else False - julian = _calc_julian_from_U_or_W(year, week_of_year, weekday, - week_starts_Mon) - # Cannot pre-calculate datetime_date() since can change in Julian - # calculation and thus could have different value for the day of the wk - # calculation. - try: - if julian == -1: - # Need to add 1 to result since first day of the year is 1, not - # 0. - julian = datetime_date(year, month, day).toordinal() - \ - datetime_date(year, 1, 1).toordinal() + 1 - else: # Assume that if they bothered to include Julian day it will - # be accurate. - datetime_result = datetime_date.fromordinal( - (julian - 1) + datetime_date(year, 1, 1).toordinal()) - year = datetime_result.year - month = datetime_result.month - day = datetime_result.day - except ValueError: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise - if weekday == -1: - weekday = datetime_date(year, month, day).weekday() - - dts.year = year - dts.month = month - dts.day = day - dts.hour = hour - dts.min = minute - dts.sec = second - dts.us = us - dts.ps = ns * 1000 - - iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) - try: - _check_dts_bounds(&dts) - except ValueError: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise - - return result - - #---------------------------------------------------------------------- # NaT methods/property setups @@ -5176,320 +4889,3 @@ def shift_months(int64_t[:] dtindex, int months, object day=None): raise ValueError("day must be None, 'start' or 'end'") return np.asarray(out) - -#---------------------------------------------------------------------- -# Don't even ask - -"""Strptime-related classes and functions. - -CLASSES: - LocaleTime -- Discovers and stores locale-specific time information - TimeRE -- Creates regexes for pattern matching a string of text containing - time information - -FUNCTIONS: - _getlang -- Figure out what language is being used for the locale - strptime -- Calculates the time struct represented by the passed-in string - -""" -import time -import locale -import calendar -from re import compile as re_compile -from re import IGNORECASE -from re import escape as re_escape -from datetime import date as datetime_date - -# Python 2 vs Python 3 -try: - from thread import allocate_lock as _thread_allocate_lock -except: - try: - from _thread import allocate_lock as _thread_allocate_lock - except: - try: - from dummy_thread import allocate_lock as _thread_allocate_lock - except: - from _dummy_thread import allocate_lock as _thread_allocate_lock - -__all__ = [] - - -def _getlang(): - # Figure out what the current language is set to. - return locale.getlocale(locale.LC_TIME) - - -class LocaleTime(object): - """Stores and handles locale-specific information related to time. - - ATTRIBUTES: - f_weekday -- full weekday names (7-item list) - a_weekday -- abbreviated weekday names (7-item list) - f_month -- full month names (13-item list; dummy value in [0], which - is added by code) - a_month -- abbreviated month names (13-item list, dummy value in - [0], which is added by code) - am_pm -- AM/PM representation (2-item list) - LC_date_time -- format string for date/time representation (string) - LC_date -- format string for date representation (string) - LC_time -- format string for time representation (string) - timezone -- daylight- and non-daylight-savings timezone representation - (2-item list of sets) - lang -- Language used by instance (2-item tuple) - """ - - def __init__(self): - """Set all attributes. - - Order of methods called matters for dependency reasons. - - The locale language is set at the offset and then checked again before - exiting. This is to make sure that the attributes were not set with a - mix of information from more than one locale. This would most likely - happen when using threads where one thread calls a locale-dependent - function while another thread changes the locale while the function in - the other thread is still running. Proper coding would call for - locks to prevent changing the locale while locale-dependent code is - running. The check here is done in case someone does not think about - doing this. - - Only other possible issue is if someone changed the timezone and did - not call tz.tzset . That is an issue for the programmer, though, - since changing the timezone is worthless without that call. - - """ - self.lang = _getlang() - self.__calc_weekday() - self.__calc_month() - self.__calc_am_pm() - self.__calc_timezone() - self.__calc_date_time() - if _getlang() != self.lang: - raise ValueError("locale changed during initialization") - - def __pad(self, seq, front): - # Add '' to seq to either the front (is True), else the back. - seq = list(seq) - if front: - seq.insert(0, '') - else: - seq.append('') - return seq - - def __calc_weekday(self): - # Set self.a_weekday and self.f_weekday using the calendar - # module. - a_weekday = [calendar.day_abbr[i].lower() for i in range(7)] - f_weekday = [calendar.day_name[i].lower() for i in range(7)] - self.a_weekday = a_weekday - self.f_weekday = f_weekday - - def __calc_month(self): - # Set self.f_month and self.a_month using the calendar module. - a_month = [calendar.month_abbr[i].lower() for i in range(13)] - f_month = [calendar.month_name[i].lower() for i in range(13)] - self.a_month = a_month - self.f_month = f_month - - def __calc_am_pm(self): - # Set self.am_pm by using time.strftime(). - - # The magic date (1999,3,17,hour,44,55,2,76,0) is not really that - # magical; just happened to have used it everywhere else where a - # static date was needed. - am_pm = [] - for hour in (01, 22): - time_tuple = time.struct_time( - (1999, 3, 17, hour, 44, 55, 2, 76, 0)) - am_pm.append(time.strftime("%p", time_tuple).lower()) - self.am_pm = am_pm - - def __calc_date_time(self): - # Set self.date_time, self.date, & self.time by using - # time.strftime(). - - # Use (1999,3,17,22,44,55,2,76,0) for magic date because the amount of - # overloaded numbers is minimized. The order in which searches for - # values within the format string is very important; it eliminates - # possible ambiguity for what something represents. - time_tuple = time.struct_time((1999, 3, 17, 22, 44, 55, 2, 76, 0)) - date_time = [None, None, None] - date_time[0] = time.strftime("%c", time_tuple).lower() - date_time[1] = time.strftime("%x", time_tuple).lower() - date_time[2] = time.strftime("%X", time_tuple).lower() - replacement_pairs = [('%', '%%'), (self.f_weekday[2], '%A'), - (self.f_month[3], - '%B'), (self.a_weekday[2], '%a'), - (self.a_month[3], '%b'), (self.am_pm[1], '%p'), - ('1999', '%Y'), ('99', '%y'), ('22', '%H'), - ('44', '%M'), ('55', '%S'), ('76', '%j'), - ('17', '%d'), ('03', '%m'), ('3', '%m'), - # '3' needed for when no leading zero. - ('2', '%w'), ('10', '%I')] - replacement_pairs.extend([(tz, "%Z") for tz_values in self.timezone - for tz in tz_values]) - for offset, directive in ((0, '%c'), (1, '%x'), (2, '%X')): - current_format = date_time[offset] - for old, new in replacement_pairs: - # Must deal with possible lack of locale info - # manifesting itself as the empty string (e.g., Swedish's - # lack of AM/PM info) or a platform returning a tuple of empty - # strings (e.g., MacOS 9 having timezone as ('','')). - if old: - current_format = current_format.replace(old, new) - # If %W is used, then Sunday, 2005-01-03 will fall on week 0 since - # 2005-01-03 occurs before the first Monday of the year. Otherwise - # %U is used. - time_tuple = time.struct_time((1999, 1, 3, 1, 1, 1, 6, 3, 0)) - if '00' in time.strftime(directive, time_tuple): - U_W = '%W' - else: - U_W = '%U' - date_time[offset] = current_format.replace('11', U_W) - self.LC_date_time = date_time[0] - self.LC_date = date_time[1] - self.LC_time = date_time[2] - - def __calc_timezone(self): - # Set self.timezone by using time.tzname. - # Do not worry about possibility of time.tzname[0] == timetzname[1] - # and time.daylight; handle that in strptime . - try: - time.tzset() - except AttributeError: - pass - no_saving = frozenset(["utc", "gmt", time.tzname[0].lower()]) - if time.daylight: - has_saving = frozenset([time.tzname[1].lower()]) - else: - has_saving = frozenset() - self.timezone = (no_saving, has_saving) - - -class TimeRE(dict): - """Handle conversion from format directives to regexes.""" - - def __init__(self, locale_time=None): - """Create keys/values. - - Order of execution is important for dependency reasons. - - """ - if locale_time: - self.locale_time = locale_time - else: - self.locale_time = LocaleTime() - base = super(TimeRE, self) - base.__init__({ - # The " \d" part of the regex is to make %c from ANSI C work - 'd': r"(?P3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])", - 'f': r"(?P[0-9]{1,9})", - 'H': r"(?P2[0-3]|[0-1]\d|\d)", - 'I': r"(?P1[0-2]|0[1-9]|[1-9])", - 'j': (r"(?P36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|" - r"[1-9]\d|0[1-9]|[1-9])"), - 'm': r"(?P1[0-2]|0[1-9]|[1-9])", - 'M': r"(?P[0-5]\d|\d)", - 'S': r"(?P6[0-1]|[0-5]\d|\d)", - 'U': r"(?P5[0-3]|[0-4]\d|\d)", - 'w': r"(?P[0-6])", - # W is set below by using 'U' - 'y': r"(?P\d\d)", - #XXX: Does 'Y' need to worry about having less or more than - # 4 digits? - 'Y': r"(?P\d\d\d\d)", - 'A': self.__seqToRE(self.locale_time.f_weekday, 'A'), - 'a': self.__seqToRE(self.locale_time.a_weekday, 'a'), - 'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'), - 'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'), - 'p': self.__seqToRE(self.locale_time.am_pm, 'p'), - 'Z': self.__seqToRE([tz for tz_names in self.locale_time.timezone - for tz in tz_names], - 'Z'), - '%': '%'}) - base.__setitem__('W', base.__getitem__('U').replace('U', 'W')) - base.__setitem__('c', self.pattern(self.locale_time.LC_date_time)) - base.__setitem__('x', self.pattern(self.locale_time.LC_date)) - base.__setitem__('X', self.pattern(self.locale_time.LC_time)) - - def __seqToRE(self, to_convert, directive): - """Convert a list to a regex string for matching a directive. - - Want possible matching values to be from longest to shortest. This - prevents the possibility of a match occuring for a value that also - a substring of a larger value that should have matched (e.g., 'abc' - matching when 'abcdef' should have been the match). - - """ - to_convert = sorted(to_convert, key=len, reverse=True) - for value in to_convert: - if value != '': - break - else: - return '' - regex = '|'.join([re_escape(stuff) for stuff in to_convert]) - regex = '(?P<%s>%s' % (directive, regex) - return '%s)' % regex - - def pattern(self, format): - """Return regex pattern for the format string. - - Need to make sure that any characters that might be interpreted as - regex syntax are escaped. - - """ - processed_format = '' - # The sub() call escapes all characters that might be misconstrued - # as regex syntax. Cannot use re.escape since we have to deal with - # format directives (%m, etc.). - regex_chars = re_compile(r"([\\.^$*+?\(\){}\[\]|])") - format = regex_chars.sub(r"\\\1", format) - whitespace_replacement = re_compile(r'\s+') - format = whitespace_replacement.sub(r'\\s+', format) - while '%' in format: - directive_index = format.index('%') +1 - processed_format = "%s%s%s" % (processed_format, - format[:directive_index -1], - self[format[directive_index]]) - format = format[directive_index +1:] - return "%s%s" % (processed_format, format) - - def compile(self, format): - """Return a compiled re object for the format string.""" - return re_compile(self.pattern(format), IGNORECASE) - -_cache_lock = _thread_allocate_lock() -# DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock -# first! -_TimeRE_cache = TimeRE() -_CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache -_regex_cache = {} - -cdef _calc_julian_from_U_or_W(int year, int week_of_year, - int day_of_week, int week_starts_Mon): - """Calculate the Julian day based on the year, week of the year, and day of - the week, with week_start_day representing whether the week of the year - assumes the week starts on Sunday or Monday (6 or 0).""" - - cdef: - int first_weekday, week_0_length, days_to_week - - first_weekday = datetime_date(year, 1, 1).weekday() - # If we are dealing with the %U directive (week starts on Sunday), it's - # easier to just shift the view to Sunday being the first day of the - # week. - if not week_starts_Mon: - first_weekday = (first_weekday + 1) % 7 - day_of_week = (day_of_week + 1) % 7 - # Need to watch out for a week 0 (when the first day of the year is not - # the same as that specified by %U or %W). - week_0_length = (7 - first_weekday) % 7 - if week_of_year == 0: - return 1 + day_of_week - first_weekday - else: - days_to_week = week_0_length + (7 * (week_of_year - 1)) - return 1 + days_to_week + day_of_week - -# def _strptime_time(data_string, format="%a %b %d %H:%M:%S %Y"): -# return _strptime(data_string, format)[0] diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx new file mode 100644 index 0000000000000..20b24d6be9a58 --- /dev/null +++ b/pandas/_libs/tslibs/strptime.pyx @@ -0,0 +1,640 @@ +# -*- coding: utf-8 -*- +# cython: profile=False +"""Strptime-related classes and functions. +""" +import time +import locale +import calendar +import re + + +# Python 2 vs Python 3 +try: + from thread import allocate_lock as _thread_allocate_lock +except: + try: + from _thread import allocate_lock as _thread_allocate_lock + except: + try: + from dummy_thread import allocate_lock as _thread_allocate_lock + except: + from _dummy_thread import allocate_lock as _thread_allocate_lock + + +from cython cimport Py_ssize_t +from cpython cimport PyFloat_Check + +cimport cython + +import numpy as np +cimport numpy as np +from numpy cimport ndarray, int64_t + +from datetime import date as datetime_date +from datetime cimport datetime + +# This is src/datetime.pxd +from datetime cimport ( + PANDAS_FR_ns, + check_dts_bounds, + pandas_datetimestruct, + pandas_datetimestruct_to_datetime) + +from util cimport is_string_object, get_nat + +cdef int64_t NPY_NAT = get_nat() + +cdef set _nat_strings = set(['NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN']) + + +# TODO: Consolidate with other implementations +cdef inline bint _checknull_with_nat(object val): + """ utility to check if a value is a nat or not """ + return (val is None or + (PyFloat_Check(val) and val != val) or + (isinstance(val, datetime) and not val == val)) + + +def array_strptime(ndarray[object] values, object fmt, + bint exact=True, errors='raise'): + """ + Calculates the datetime structs represented by the passed array of strings + + Parameters + ---------- + values : ndarray of string-like objects + fmt : string-like regex + exact : matches must be exact if True, search if False + coerce : if invalid values found, coerce to NaT + """ + + cdef: + Py_ssize_t i, n = len(values) + pandas_datetimestruct dts + ndarray[int64_t] iresult + int year, month, day, minute, hour, second, weekday, julian, tz + int week_of_year, week_of_year_start + int64_t us, ns + object val, group_key, ampm, found + dict found_key + bint is_raise = errors=='raise' + bint is_ignore = errors=='ignore' + bint is_coerce = errors=='coerce' + + assert is_raise or is_ignore or is_coerce + + global _TimeRE_cache, _regex_cache + with _cache_lock: + if _getlang() != _TimeRE_cache.locale_time.lang: + _TimeRE_cache = TimeRE() + _regex_cache.clear() + if len(_regex_cache) > _CACHE_MAX_SIZE: + _regex_cache.clear() + locale_time = _TimeRE_cache.locale_time + format_regex = _regex_cache.get(fmt) + if not format_regex: + try: + format_regex = _TimeRE_cache.compile(fmt) + # KeyError raised when a bad format is found; can be specified as + # \\, in which case it was a stray % but with a space after it + except KeyError, err: + bad_directive = err.args[0] + if bad_directive == "\\": + bad_directive = "%" + del err + raise ValueError("'%s' is a bad directive in format '%s'" % + (bad_directive, fmt)) + # IndexError only occurs when the format string is "%" + except IndexError: + raise ValueError("stray %% in format '%s'" % fmt) + _regex_cache[fmt] = format_regex + + result = np.empty(n, dtype='M8[ns]') + iresult = result.view('i8') + + dts.us = dts.ps = dts.as = 0 + + cdef dict _parse_code_table = { + 'y': 0, + 'Y': 1, + 'm': 2, + 'B': 3, + 'b': 4, + 'd': 5, + 'H': 6, + 'I': 7, + 'M': 8, + 'S': 9, + 'f': 10, + 'A': 11, + 'a': 12, + 'w': 13, + 'j': 14, + 'U': 15, + 'W': 16, + 'Z': 17, + 'p': 18 # just an additional key, works only with I + } + cdef int parse_code + + for i in range(n): + val = values[i] + if is_string_object(val): + if val in _nat_strings: + iresult[i] = NPY_NAT + continue + else: + if _checknull_with_nat(val): + iresult[i] = NPY_NAT + continue + else: + val = str(val) + + # exact matching + if exact: + found = format_regex.match(val) + if not found: + if is_coerce: + iresult[i] = NPY_NAT + continue + raise ValueError("time data %r does not match " + "format %r (match)" % (values[i], fmt)) + if len(val) != found.end(): + if is_coerce: + iresult[i] = NPY_NAT + continue + raise ValueError("unconverted data remains: %s" % + values[i][found.end():]) + + # search + else: + found = format_regex.search(val) + if not found: + if is_coerce: + iresult[i] = NPY_NAT + continue + raise ValueError("time data %r does not match format " + "%r (search)" % (values[i], fmt)) + + year = 1900 + month = day = 1 + hour = minute = second = ns = us = 0 + tz = -1 + # Default to -1 to signify that values not known; not critical to have, + # though + week_of_year = -1 + week_of_year_start = -1 + # weekday and julian defaulted to -1 so as to signal need to calculate + # values + weekday = julian = -1 + found_dict = found.groupdict() + for group_key in found_dict.iterkeys(): + # Directives not explicitly handled below: + # c, x, X + # handled by making out of other directives + # U, W + # worthless without day of the week + parse_code = _parse_code_table[group_key] + + if parse_code == 0: + year = int(found_dict['y']) + # Open Group specification for strptime() states that a %y + #value in the range of [00, 68] is in the century 2000, while + #[69,99] is in the century 1900 + if year <= 68: + year += 2000 + else: + year += 1900 + elif parse_code == 1: + year = int(found_dict['Y']) + elif parse_code == 2: + month = int(found_dict['m']) + elif parse_code == 3: + # elif group_key == 'B': + month = locale_time.f_month.index(found_dict['B'].lower()) + elif parse_code == 4: + # elif group_key == 'b': + month = locale_time.a_month.index(found_dict['b'].lower()) + elif parse_code == 5: + # elif group_key == 'd': + day = int(found_dict['d']) + elif parse_code == 6: + # elif group_key == 'H': + hour = int(found_dict['H']) + elif parse_code == 7: + hour = int(found_dict['I']) + ampm = found_dict.get('p', '').lower() + # If there was no AM/PM indicator, we'll treat this like AM + if ampm in ('', locale_time.am_pm[0]): + # We're in AM so the hour is correct unless we're + # looking at 12 midnight. + # 12 midnight == 12 AM == hour 0 + if hour == 12: + hour = 0 + elif ampm == locale_time.am_pm[1]: + # We're in PM so we need to add 12 to the hour unless + # we're looking at 12 noon. + # 12 noon == 12 PM == hour 12 + if hour != 12: + hour += 12 + elif parse_code == 8: + minute = int(found_dict['M']) + elif parse_code == 9: + second = int(found_dict['S']) + elif parse_code == 10: + s = found_dict['f'] + # Pad to always return nanoseconds + s += "0" * (9 - len(s)) + us = long(s) + ns = us % 1000 + us = us / 1000 + elif parse_code == 11: + weekday = locale_time.f_weekday.index(found_dict['A'].lower()) + elif parse_code == 12: + weekday = locale_time.a_weekday.index(found_dict['a'].lower()) + elif parse_code == 13: + weekday = int(found_dict['w']) + if weekday == 0: + weekday = 6 + else: + weekday -= 1 + elif parse_code == 14: + julian = int(found_dict['j']) + elif parse_code == 15 or parse_code == 16: + week_of_year = int(found_dict[group_key]) + if group_key == 'U': + # U starts week on Sunday. + week_of_year_start = 6 + else: + # W starts week on Monday. + week_of_year_start = 0 + elif parse_code == 17: + # Since -1 is default value only need to worry about setting tz + # if it can be something other than -1. + found_zone = found_dict['Z'].lower() + for value, tz_values in enumerate(locale_time.timezone): + if found_zone in tz_values: + # Deal w/ bad locale setup where timezone names are the + # same and yet time.daylight is true; too ambiguous to + # be able to tell what timezone has daylight savings + if (time.tzname[0] == time.tzname[1] and + time.daylight and found_zone not in ( + "utc", "gmt")): + break + else: + tz = value + break + # If we know the wk of the year and what day of that wk, we can figure + # out the Julian day of the year. + if julian == -1 and week_of_year != -1 and weekday != -1: + week_starts_Mon = True if week_of_year_start == 0 else False + julian = _calc_julian_from_U_or_W(year, week_of_year, weekday, + week_starts_Mon) + # Cannot pre-calculate datetime_date() since can change in Julian + # calculation and thus could have different value for the day of the wk + # calculation. + try: + if julian == -1: + # Need to add 1 to result since first day of the year is 1, not + # 0. + julian = datetime_date(year, month, day).toordinal() - \ + datetime_date(year, 1, 1).toordinal() + 1 + else: # Assume that if they bothered to include Julian day it will + # be accurate. + datetime_result = datetime_date.fromordinal( + (julian - 1) + datetime_date(year, 1, 1).toordinal()) + year = datetime_result.year + month = datetime_result.month + day = datetime_result.day + except ValueError: + if is_coerce: + iresult[i] = NPY_NAT + continue + raise + if weekday == -1: + weekday = datetime_date(year, month, day).weekday() + + dts.year = year + dts.month = month + dts.day = day + dts.hour = hour + dts.min = minute + dts.sec = second + dts.us = us + dts.ps = ns * 1000 + + iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) + if check_dts_bounds(&dts): + if is_coerce: + iresult[i] = NPY_NAT + continue + else: + from pandas._libs.tslib import OutOfBoundsDatetime + fmt = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, dts.month, + dts.day, dts.hour, + dts.min, dts.sec) + raise OutOfBoundsDatetime( + 'Out of bounds nanosecond timestamp: %s' % fmt) + + return result + + +"""_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored +from the standard library, see +https://github.com/python/cpython/blob/master/Lib/_strptime.py +The original module-level docstring follows. + +Strptime-related classes and functions. +CLASSES: + LocaleTime -- Discovers and stores locale-specific time information + TimeRE -- Creates regexes for pattern matching a string of text containing + time information +FUNCTIONS: + _getlang -- Figure out what language is being used for the locale + strptime -- Calculates the time struct represented by the passed-in string +""" + + +def _getlang(): + """Figure out what language is being used for the locale""" + return locale.getlocale(locale.LC_TIME) + + +class LocaleTime(object): + """Stores and handles locale-specific information related to time. + + ATTRIBUTES: + f_weekday -- full weekday names (7-item list) + a_weekday -- abbreviated weekday names (7-item list) + f_month -- full month names (13-item list; dummy value in [0], which + is added by code) + a_month -- abbreviated month names (13-item list, dummy value in + [0], which is added by code) + am_pm -- AM/PM representation (2-item list) + LC_date_time -- format string for date/time representation (string) + LC_date -- format string for date representation (string) + LC_time -- format string for time representation (string) + timezone -- daylight- and non-daylight-savings timezone representation + (2-item list of sets) + lang -- Language used by instance (2-item tuple) + """ + + def __init__(self): + """Set all attributes. + + Order of methods called matters for dependency reasons. + + The locale language is set at the offset and then checked again before + exiting. This is to make sure that the attributes were not set with a + mix of information from more than one locale. This would most likely + happen when using threads where one thread calls a locale-dependent + function while another thread changes the locale while the function in + the other thread is still running. Proper coding would call for + locks to prevent changing the locale while locale-dependent code is + running. The check here is done in case someone does not think about + doing this. + + Only other possible issue is if someone changed the timezone and did + not call tz.tzset . That is an issue for the programmer, though, + since changing the timezone is worthless without that call. + + """ + self.lang = _getlang() + self.__calc_weekday() + self.__calc_month() + self.__calc_am_pm() + self.__calc_timezone() + self.__calc_date_time() + if _getlang() != self.lang: + raise ValueError("locale changed during initialization") + + def __pad(self, seq, front): + # Add '' to seq to either the front (is True), else the back. + seq = list(seq) + if front: + seq.insert(0, '') + else: + seq.append('') + return seq + + def __calc_weekday(self): + # Set self.a_weekday and self.f_weekday using the calendar + # module. + a_weekday = [calendar.day_abbr[i].lower() for i in range(7)] + f_weekday = [calendar.day_name[i].lower() for i in range(7)] + self.a_weekday = a_weekday + self.f_weekday = f_weekday + + def __calc_month(self): + # Set self.f_month and self.a_month using the calendar module. + a_month = [calendar.month_abbr[i].lower() for i in range(13)] + f_month = [calendar.month_name[i].lower() for i in range(13)] + self.a_month = a_month + self.f_month = f_month + + def __calc_am_pm(self): + # Set self.am_pm by using time.strftime(). + + # The magic date (1999,3,17,hour,44,55,2,76,0) is not really that + # magical; just happened to have used it everywhere else where a + # static date was needed. + am_pm = [] + for hour in (01, 22): + time_tuple = time.struct_time( + (1999, 3, 17, hour, 44, 55, 2, 76, 0)) + am_pm.append(time.strftime("%p", time_tuple).lower()) + self.am_pm = am_pm + + def __calc_date_time(self): + # Set self.date_time, self.date, & self.time by using + # time.strftime(). + + # Use (1999,3,17,22,44,55,2,76,0) for magic date because the amount of + # overloaded numbers is minimized. The order in which searches for + # values within the format string is very important; it eliminates + # possible ambiguity for what something represents. + time_tuple = time.struct_time((1999, 3, 17, 22, 44, 55, 2, 76, 0)) + date_time = [None, None, None] + date_time[0] = time.strftime("%c", time_tuple).lower() + date_time[1] = time.strftime("%x", time_tuple).lower() + date_time[2] = time.strftime("%X", time_tuple).lower() + replacement_pairs = [('%', '%%'), (self.f_weekday[2], '%A'), + (self.f_month[3], + '%B'), (self.a_weekday[2], '%a'), + (self.a_month[3], '%b'), (self.am_pm[1], '%p'), + ('1999', '%Y'), ('99', '%y'), ('22', '%H'), + ('44', '%M'), ('55', '%S'), ('76', '%j'), + ('17', '%d'), ('03', '%m'), ('3', '%m'), + # '3' needed for when no leading zero. + ('2', '%w'), ('10', '%I')] + replacement_pairs.extend([(tz, "%Z") for tz_values in self.timezone + for tz in tz_values]) + for offset, directive in ((0, '%c'), (1, '%x'), (2, '%X')): + current_format = date_time[offset] + for old, new in replacement_pairs: + # Must deal with possible lack of locale info + # manifesting itself as the empty string (e.g., Swedish's + # lack of AM/PM info) or a platform returning a tuple of empty + # strings (e.g., MacOS 9 having timezone as ('','')). + if old: + current_format = current_format.replace(old, new) + # If %W is used, then Sunday, 2005-01-03 will fall on week 0 since + # 2005-01-03 occurs before the first Monday of the year. Otherwise + # %U is used. + time_tuple = time.struct_time((1999, 1, 3, 1, 1, 1, 6, 3, 0)) + if '00' in time.strftime(directive, time_tuple): + U_W = '%W' + else: + U_W = '%U' + date_time[offset] = current_format.replace('11', U_W) + self.LC_date_time = date_time[0] + self.LC_date = date_time[1] + self.LC_time = date_time[2] + + def __calc_timezone(self): + # Set self.timezone by using time.tzname. + # Do not worry about possibility of time.tzname[0] == timetzname[1] + # and time.daylight; handle that in strptime . + try: + time.tzset() + except AttributeError: + pass + no_saving = frozenset(["utc", "gmt", time.tzname[0].lower()]) + if time.daylight: + has_saving = frozenset([time.tzname[1].lower()]) + else: + has_saving = frozenset() + self.timezone = (no_saving, has_saving) + + +class TimeRE(dict): + """ + Handle conversion from format directives to regexes. + + Creates regexes for pattern matching a string of text containing + time information + """ + + def __init__(self, locale_time=None): + """Create keys/values. + + Order of execution is important for dependency reasons. + + """ + if locale_time: + self.locale_time = locale_time + else: + self.locale_time = LocaleTime() + base = super(TimeRE, self) + base.__init__({ + # The " \d" part of the regex is to make %c from ANSI C work + 'd': r"(?P3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])", + 'f': r"(?P[0-9]{1,9})", + 'H': r"(?P2[0-3]|[0-1]\d|\d)", + 'I': r"(?P1[0-2]|0[1-9]|[1-9])", + 'j': (r"(?P36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|" + r"[1-9]\d|0[1-9]|[1-9])"), + 'm': r"(?P1[0-2]|0[1-9]|[1-9])", + 'M': r"(?P[0-5]\d|\d)", + 'S': r"(?P6[0-1]|[0-5]\d|\d)", + 'U': r"(?P5[0-3]|[0-4]\d|\d)", + 'w': r"(?P[0-6])", + # W is set below by using 'U' + 'y': r"(?P\d\d)", + #XXX: Does 'Y' need to worry about having less or more than + # 4 digits? + 'Y': r"(?P\d\d\d\d)", + 'A': self.__seqToRE(self.locale_time.f_weekday, 'A'), + 'a': self.__seqToRE(self.locale_time.a_weekday, 'a'), + 'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'), + 'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'), + 'p': self.__seqToRE(self.locale_time.am_pm, 'p'), + 'Z': self.__seqToRE([tz for tz_names in self.locale_time.timezone + for tz in tz_names], + 'Z'), + '%': '%'}) + base.__setitem__('W', base.__getitem__('U').replace('U', 'W')) + base.__setitem__('c', self.pattern(self.locale_time.LC_date_time)) + base.__setitem__('x', self.pattern(self.locale_time.LC_date)) + base.__setitem__('X', self.pattern(self.locale_time.LC_time)) + + def __seqToRE(self, to_convert, directive): + """Convert a list to a regex string for matching a directive. + + Want possible matching values to be from longest to shortest. This + prevents the possibility of a match occuring for a value that also + a substring of a larger value that should have matched (e.g., 'abc' + matching when 'abcdef' should have been the match). + + """ + to_convert = sorted(to_convert, key=len, reverse=True) + for value in to_convert: + if value != '': + break + else: + return '' + regex = '|'.join([re.escape(stuff) for stuff in to_convert]) + regex = '(?P<%s>%s' % (directive, regex) + return '%s)' % regex + + def pattern(self, format): + """Return regex pattern for the format string. + + Need to make sure that any characters that might be interpreted as + regex syntax are escaped. + + """ + processed_format = '' + # The sub() call escapes all characters that might be misconstrued + # as regex syntax. Cannot use re.escape since we have to deal with + # format directives (%m, etc.). + regex_chars = re.compile(r"([\\.^$*+?\(\){}\[\]|])") + format = regex_chars.sub(r"\\\1", format) + whitespace_replacement = re.compile(r'\s+') + format = whitespace_replacement.sub(r'\\s+', format) + while '%' in format: + directive_index = format.index('%') +1 + processed_format = "%s%s%s" % (processed_format, + format[:directive_index -1], + self[format[directive_index]]) + format = format[directive_index +1:] + return "%s%s" % (processed_format, format) + + def compile(self, format): + """Return a compiled re object for the format string.""" + return re.compile(self.pattern(format), re.IGNORECASE) + + +_cache_lock = _thread_allocate_lock() +# DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock +# first! +_TimeRE_cache = TimeRE() +_CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache +_regex_cache = {} + + +cdef _calc_julian_from_U_or_W(int year, int week_of_year, + int day_of_week, int week_starts_Mon): + """Calculate the Julian day based on the year, week of the year, and day of + the week, with week_start_day representing whether the week of the year + assumes the week starts on Sunday or Monday (6 or 0).""" + + cdef: + int first_weekday, week_0_length, days_to_week + + first_weekday = datetime_date(year, 1, 1).weekday() + # If we are dealing with the %U directive (week starts on Sunday), it's + # easier to just shift the view to Sunday being the first day of the + # week. + if not week_starts_Mon: + first_weekday = (first_weekday + 1) % 7 + day_of_week = (day_of_week + 1) % 7 + + # Need to watch out for a week 0 (when the first day of the year is not + # the same as that specified by %U or %W). + week_0_length = (7 - first_weekday) % 7 + if week_of_year == 0: + return 1 + day_of_week - first_weekday + else: + days_to_week = week_0_length + (7 * (week_of_year - 1)) + return 1 + days_to_week + day_of_week diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 95fe3ab83c2ab..bf89509fd1746 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -3,6 +3,7 @@ from collections import MutableMapping from pandas._libs import lib, tslib +from pandas._libs.tslibs.strptime import array_strptime from pandas._libs.tslibs.timezones import get_timezone from pandas.core.dtypes.common import ( @@ -416,8 +417,8 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): # fallback if result is None: try: - result = tslib.array_strptime(arg, format, exact=exact, - errors=errors) + result = array_strptime(arg, format, exact=exact, + errors=errors) except tslib.OutOfBoundsDatetime: if errors == 'raise': raise diff --git a/setup.py b/setup.py index 555cf9dc4a9b3..25a4924dad0bc 100755 --- a/setup.py +++ b/setup.py @@ -471,7 +471,6 @@ def pxd(name): 'pandas/_libs/src/datetime/np_datetime_strings.h', 'pandas/_libs/src/datetime.pxd'] - # some linux distros require it libraries = ['m'] if not is_platform_windows() else [] @@ -483,6 +482,10 @@ def pxd(name): 'pxdfiles': ['_libs/hashtable'], 'depends': (['pandas/_libs/src/klib/khash_python.h'] + _pxi_dep['hashtable'])}, + '_libs.tslibs.strptime': {'pyxfile': '_libs/tslibs/strptime', + 'depends': tseries_depends, + 'sources': ['pandas/_libs/src/datetime/np_datetime.c', + 'pandas/_libs/src/datetime/np_datetime_strings.c']}, '_libs.tslib': {'pyxfile': '_libs/tslib', 'pxdfiles': ['_libs/src/util', '_libs/lib'], 'depends': tseries_depends,