From 62008ec521267d4f8e93c639ce60aefddc7042d6 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 24 Jul 2017 12:36:56 -0400 Subject: [PATCH 1/9] ENH: Add skipna parameter to infer_dtype --- doc/source/whatsnew/v0.21.0.txt | 2 + pandas/_libs/src/inference.pyx | 134 ++++++++++++++++++-------- pandas/tests/dtypes/test_inference.py | 33 ++++++- 3 files changed, 129 insertions(+), 40 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 5a5ea827e74ad..d277e8d4b9434 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -24,6 +24,8 @@ New features `_ on most readers and writers (:issue:`13823`) - Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`, and :class:`~pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`) +- Added ``skipna`` parameter :func:`~pandas.api.types.infer_dtype` to support + type inference in the presence of missing values (:issue:`17059`). .. _whatsnew_0210.enhancements.infer_objects: diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 38e95fe6ee652..1b9f268f9a843 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -222,7 +222,7 @@ cdef _try_infer_map(v): return None -def infer_dtype(object value): +def infer_dtype(object value, bint skipna=False): """ Effeciently infer the type of a passed val, or list-like array of values. Return a string describing the type. @@ -230,6 +230,8 @@ def infer_dtype(object value): Parameters ---------- value : scalar, list, ndarray, or pandas type + skipna : bool, default False + Ignore NaN values when inferring the type. 
Returns ------- @@ -272,6 +274,9 @@ def infer_dtype(object value): >>> infer_dtype(['foo', 'bar']) 'string' + >>> infer_dtype(['a', np.nan, 'b'], skipna=True) + 'string' + >>> infer_dtype([b'foo', b'bar']) 'bytes' @@ -310,7 +315,6 @@ def infer_dtype(object value): >>> infer_dtype(pd.Series(list('aabc')).astype('category')) 'categorical' - """ cdef: Py_ssize_t i, n @@ -356,7 +360,7 @@ def infer_dtype(object value): values = values.ravel() # try to use a valid value - for i from 0 <= i < n: + for i in range(n): val = util.get_value_1d(values, i) # do not use is_nul_datetimelike to keep @@ -403,11 +407,11 @@ def infer_dtype(object value): return 'datetime' elif is_date(val): - if is_date_array(values): + if is_date_array(values, skipna=skipna): return 'date' elif is_time(val): - if is_time_array(values): + if is_time_array(values, skipna=skipna): return 'time' elif is_decimal(val): @@ -420,19 +424,19 @@ def infer_dtype(object value): return 'mixed-integer-float' elif util.is_bool_object(val): - if is_bool_array(values): + if is_bool_array(values, skipna=skipna): return 'boolean' elif PyString_Check(val): - if is_string_array(values): + if is_string_array(values, skipna=skipna): return 'string' elif PyUnicode_Check(val): - if is_unicode_array(values): + if is_unicode_array(values, skipna=skipna): return 'unicode' elif PyBytes_Check(val): - if is_bytes_array(values): + if is_bytes_array(values, skipna=skipna): return 'bytes' elif is_period(val): @@ -593,10 +597,11 @@ cdef inline bint is_timedelta(object o): return PyDelta_Check(o) or util.is_timedelta64_object(o) -cpdef bint is_bool_array(ndarray values): +cpdef bint is_bool_array(ndarray values, bint skipna=False): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf + object val if issubclass(values.dtype.type, np.bool_): return True @@ -606,9 +611,16 @@ cpdef bint is_bool_array(ndarray values): if n == 0: return False - for i in range(n): - if not util.is_bool_object(objbuf[i]): - return False + if skipna: + 
for i in range(n): + val = objbuf[i] + if not util._checknull(val) and not util.is_bool_object(val): + return False + else: + for i in range(n): + val = objbuf[i] + if not util.is_bool_object(val): + return False return True else: return False @@ -639,6 +651,7 @@ cpdef bint is_integer_float_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf + object value if issubclass(values.dtype.type, np.integer): return True @@ -649,9 +662,8 @@ cpdef bint is_integer_float_array(ndarray values): return False for i in range(n): - if not (util.is_integer_object(objbuf[i]) or - util.is_float_object(objbuf[i])): - + val = objbuf[i] + if not (util.is_integer_object(val) or util.is_float_object(val)): return False return True else: @@ -679,10 +691,11 @@ cpdef bint is_float_array(ndarray values): return False -cpdef bint is_string_array(ndarray values): +cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf + object val if ((PY2 and issubclass(values.dtype.type, np.string_)) or not PY2 and issubclass(values.dtype.type, np.unicode_)): @@ -693,18 +706,26 @@ cpdef bint is_string_array(ndarray values): if n == 0: return False - for i in range(n): - if not PyString_Check(objbuf[i]): - return False + if skipna: + for i in range(n): + val = objbuf[i] + if not util._checknull(val) and not PyString_Check(val): + return False + else: + for i in range(n): + val = objbuf[i] + if not PyString_Check(val): + return False return True else: return False -cpdef bint is_unicode_array(ndarray values): +cpdef bint is_unicode_array(ndarray values, bint skipna=False): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf + object val if issubclass(values.dtype.type, np.unicode_): return True @@ -714,18 +735,26 @@ cpdef bint is_unicode_array(ndarray values): if n == 0: return False - for i in range(n): - if not PyUnicode_Check(objbuf[i]): - return False + if skipna: + for i in range(n): + val = objbuf[i] + 
if not util._checknull(val) and not PyUnicode_Check(val): + return False + else: + for i in range(n): + val = objbuf[i] + if not PyUnicode_Check(val): + return False return True else: return False -cpdef bint is_bytes_array(ndarray values): +cpdef bint is_bytes_array(ndarray values, bint skipna=False): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf + object val if issubclass(values.dtype.type, np.bytes_): return True @@ -735,9 +764,16 @@ cpdef bint is_bytes_array(ndarray values): if n == 0: return False - for i in range(n): - if not PyBytes_Check(objbuf[i]): - return False + if skipna: + for i in range(n): + val = objbuf[i] + if not util._checknull(val) and not PyBytes_Check(val): + return False + else: + for i in range(n): + val = objbuf[i] + if not PyBytes_Check(val): + return False return True else: return False @@ -856,23 +892,45 @@ cpdef bint is_timedelta_or_timedelta64_array(ndarray values): return null_count != n -cpdef bint is_date_array(ndarray[object] values): - cdef Py_ssize_t i, n = len(values) +cpdef bint is_date_array(ndarray[object] values, bint skipna=False): + cdef: + Py_ssize_t i, n = len(values) + object val + if n == 0: return False - for i in range(n): - if not is_date(values[i]): - return False + + if skipna: + for i in range(n): + val = values[i] + if not util._checknull(val) and not is_date(val): + return False + else: + for i in range(n): + val = values[i] + if not is_date(val): + return False return True -cpdef bint is_time_array(ndarray[object] values): - cdef Py_ssize_t i, n = len(values) +cpdef bint is_time_array(ndarray[object] values, bint skipna=False): + cdef: + Py_ssize_t i, n = len(values) + object val + if n == 0: return False - for i in range(n): - if not is_time(values[i]): - return False + + if skipna: + for i in range(n): + val = values[i] + if not util._checknull(val) and not is_time(val): + return False + else: + for i in range(n): + val = values[i] + if not is_time(val): + return False return True diff --git 
a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index d26ea047bb41f..dbde7ae5081d4 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -239,6 +239,9 @@ def test_infer_dtype_bytes(self): arr = arr.astype(object) assert lib.infer_dtype(arr) == compare + # object array of bytes with missing values + assert lib.infer_dtype([b'a', np.nan, b'c'], skipna=True) == compare + def test_isinf_scalar(self): # GH 11352 assert lib.isposinf_scalar(float('inf')) @@ -444,6 +447,10 @@ def test_bools(self): result = lib.infer_dtype(arr) assert result == 'boolean' + arr = np.array([True, np.nan, False], dtype='O') + result = lib.infer_dtype(arr, skipna=True) + assert result == 'boolean' + def test_floats(self): arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') result = lib.infer_dtype(arr) @@ -472,11 +479,26 @@ def test_decimals(self): result = lib.infer_dtype(arr) assert result == 'mixed' + arr = np.array([Decimal(1), Decimal('NaN'), Decimal(3)]) + result = lib.infer_dtype(arr) + assert result == 'decimal' + + arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype='O') + result = lib.infer_dtype(arr) + assert result == 'decimal' + def test_string(self): pass def test_unicode(self): - pass + arr = [u'a', np.nan, u'c'] + result = lib.infer_dtype(arr) + assert result == 'mixed' + + arr = [u'a', np.nan, u'c'] + result = lib.infer_dtype(arr, skipna=True) + expected = 'unicode' if PY2 else 'string' + assert result == expected def test_datetime(self): @@ -714,10 +736,17 @@ def test_is_datetimelike_array_all_nan_nat_like(self): def test_date(self): - dates = [date(2012, 1, x) for x in range(1, 20)] + dates = [date(2012, 1, day) for day in range(1, 20)] index = Index(dates) assert index.inferred_type == 'date' + dates = [date(2012, 1, day) for day in range(1, 20)] + [np.nan] + result = lib.infer_dtype(dates) + assert result == 'mixed' + + result = lib.infer_dtype(dates, skipna=True) + assert 
result == 'date' + def test_to_object_array_tuples(self): r = (5, 6) values = [r] From 942265129df96f46f8a1b536de17265d143f632f Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 24 Jul 2017 16:16:37 -0400 Subject: [PATCH 2/9] ENH: Refactor type inference Cython code --- pandas/_libs/src/inference.pyx | 532 +++++++++++++++++---------------- 1 file changed, 273 insertions(+), 259 deletions(-) diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 1b9f268f9a843..82c2b125badac 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -597,222 +597,262 @@ cdef inline bint is_timedelta(object o): return PyDelta_Check(o) or util.is_timedelta64_object(o) -cpdef bint is_bool_array(ndarray values, bint skipna=False): +cdef class Validator: + cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf - object val + Py_ssize_t n + object dtype + bint skipna - if issubclass(values.dtype.type, np.bool_): - return True - elif values.dtype == np.object_: - objbuf = values + def __cinit__(self, Py_ssize_t n, object dtype=None, bint skipna=False): + self.n = n + self.dtype = dtype if dtype is not None else np.dtype(np.object_) + self.skipna = skipna - if n == 0: + cdef bint validate(self, object[:] values): + if not self.n: return False - if skipna: - for i in range(n): - val = objbuf[i] - if not util._checknull(val) and not util.is_bool_object(val): - return False + if self.is_array_typed(): + return True + elif self.dtype.type == np.object_: + if self.skipna: + return self._validate_skipna(values) + else: + return self._validate(values) else: - for i in range(n): - val = objbuf[i] - if not util.is_bool_object(val): - return False - return True - else: + return False + + cdef bint _validate(self, object[:] values): + cdef: + Py_ssize_t i + Py_ssize_t n = self.n + + for i in range(n): + if not self.is_valid(values[i]): + return False + return self.finalize() + + cdef bint _validate_skipna(self, object[:] values): + 
cdef: + Py_ssize_t i + Py_ssize_t n = self.n + + for i in range(n): + if not self.is_valid_skipna(values[i]): + return False + return self.finalize() + + cdef bint is_valid(self, object value) except -1: + return self.is_value_typed(value) + + cdef bint is_valid_skipna(self, object value) except -1: + return self.is_valid(value) or self.is_valid_null(value) + + cdef bint is_value_typed(self, object value) except -1: + raise NotImplementedError( + 'is_value_typed must be implemented in subclasses' + ) + + cdef bint is_valid_null(self, object value) except -1: + return util._checknull(value) + + cdef bint is_array_typed(self) except -1: return False + cdef bint finalize(self): + return True + + +cdef class BoolValidator(Validator): + + cdef bint is_value_typed(self, object value) except -1: + return util.is_bool_object(value) + + cdef bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.bool_) + + +cpdef bint is_bool_array(ndarray values, bint skipna=False): + cdef: + BoolValidator validator = BoolValidator( + len(values), + values.dtype, + skipna=skipna + ) + return validator.validate(values) + + +cdef class IntegerValidator(Validator): + + cdef bint is_value_typed(self, object value) except -1: + return util.is_integer_object(value) + + cdef bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.integer) + cpdef bint is_integer_array(ndarray values): cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf + IntegerValidator validator = IntegerValidator( + len(values), + values.dtype, + ) + return validator.validate(values) - if issubclass(values.dtype.type, np.integer): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return False +cdef class IntegerFloatValidator(Validator): - for i in range(n): - if not util.is_integer_object(objbuf[i]): - return False - return True - else: - return False + cdef bint is_value_typed(self, object value) except -1: + return 
util.is_integer_object(value) or util.is_float_object(value) + + cdef bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.integer) cpdef bint is_integer_float_array(ndarray values): cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf - object value + IntegerFloatValidator validator = IntegerFloatValidator( + len(values), + values.dtype, + ) + return validator.validate(values) - if issubclass(values.dtype.type, np.integer): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return False +cdef class FloatValidator(Validator): - for i in range(n): - val = objbuf[i] - if not (util.is_integer_object(val) or util.is_float_object(val)): - return False - return True - else: - return False + cdef bint is_value_typed(self, object value) except -1: + return util.is_float_object(value) + + cdef bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.floating) cpdef bint is_float_array(ndarray values): - cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf + cdef FloatValidator validator = FloatValidator(len(values), values.dtype) + return validator.validate(values) - if issubclass(values.dtype.type, np.floating): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return False +cdef class StringValidator(Validator): - for i in range(n): - if not util.is_float_object(objbuf[i]): - return False - return True - else: - return False + cdef bint is_value_typed(self, object value) except -1: + return PyString_Check(value) + + cdef bint is_array_typed(self) except -1: + return ( + PY2 and issubclass(self.dtype.type, np.string_) + ) or not PY2 and issubclass(self.dtype.type, np.unicode_) cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf - object val + StringValidator validator = StringValidator( + len(values), + values.dtype, + skipna=skipna, + ) + return validator.validate(values) - 
if ((PY2 and issubclass(values.dtype.type, np.string_)) or - not PY2 and issubclass(values.dtype.type, np.unicode_)): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return False +cdef class UnicodeValidator(Validator): - if skipna: - for i in range(n): - val = objbuf[i] - if not util._checknull(val) and not PyString_Check(val): - return False - else: - for i in range(n): - val = objbuf[i] - if not PyString_Check(val): - return False - return True - else: - return False + cdef bint is_value_typed(self, object value) except -1: + return PyUnicode_Check(value) + + cdef bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.unicode_) cpdef bint is_unicode_array(ndarray values, bint skipna=False): cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf - object val + UnicodeValidator validator = UnicodeValidator( + len(values), + values.dtype, + skipna=skipna, + ) + return validator.validate(values) - if issubclass(values.dtype.type, np.unicode_): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return False +cdef class BytesValidator(Validator): - if skipna: - for i in range(n): - val = objbuf[i] - if not util._checknull(val) and not PyUnicode_Check(val): - return False - else: - for i in range(n): - val = objbuf[i] - if not PyUnicode_Check(val): - return False - return True - else: - return False + cdef bint is_value_typed(self, object value) except -1: + return PyBytes_Check(value) + + cdef bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.bytes_) cpdef bint is_bytes_array(ndarray values, bint skipna=False): cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf - object val + BytesValidator validator = BytesValidator( + len(values), + values.dtype, + skipna=skipna + ) + return validator.validate(values) - if issubclass(values.dtype.type, np.bytes_): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return 
False +cdef class TemporalValidator(Validator): - if skipna: - for i in range(n): - val = objbuf[i] - if not util._checknull(val) and not PyBytes_Check(val): - return False - else: - for i in range(n): - val = objbuf[i] - if not PyBytes_Check(val): - return False - return True - else: - return False + cdef Py_ssize_t generic_null_count + + def __cinit__(self, Py_ssize_t n, object dtype=None, bint skipna=False): + self.n = n + self.dtype = dtype if dtype is not None else np.dtype(np.object_) + self.skipna = skipna + self.generic_null_count = 0 + + cdef bint is_valid(self, object value) except -1: + return self.is_value_typed(value) or self.is_valid_null(value) + + cdef bint is_value_typed(self, object value) except -1: + raise NotImplementedError() + + cdef bint is_valid_null(self, object value) except -1: + raise NotImplementedError() + + cdef bint is_valid_skipna(self, object value) except -1: + cdef: + bint is_typed_null = self.is_valid_null(value) + bint is_generic_null = util._checknull(value) + self.generic_null_count += is_typed_null and is_generic_null + return self.is_value_typed(value) or is_typed_null or is_generic_null + + cdef bint finalize(self): + return self.generic_null_count != self.n + + +cdef class DatetimeValidator(TemporalValidator): + + cdef bint is_value_typed(self, object value) except -1: + return is_datetime(value) + + cdef bint is_valid_null(self, object value) except -1: + return is_null_datetime64(value) cpdef bint is_datetime_array(ndarray[object] values): - cdef Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False + cdef: + DatetimeValidator validator = DatetimeValidator( + len(values), + skipna=True, + ) + return validator.validate(values) - # return False for all nulls - for i in range(n): - v = values[i] - if is_null_datetime64(v): - # we are a regular null - if util._checknull(v): - null_count += 1 - elif not is_datetime(v): - return False - return null_count != n +cdef class 
Datetime64Validator(DatetimeValidator): -cpdef bint is_datetime64_array(ndarray values): - cdef Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False + cdef bint is_value_typed(self, object value) except -1: + return util.is_datetime64_object(value) - # return False for all nulls - for i in range(n): - v = values[i] - if is_null_datetime64(v): - # we are a regular null - if util._checknull(v): - null_count += 1 - elif not util.is_datetime64_object(v): - return False - return null_count != n + +cpdef bint is_datetime64_array(ndarray values): + cdef: + Datetime64Validator validator = Datetime64Validator( + len(values), + skipna=True, + ) + return validator.validate(values) cpdef bint is_datetime_with_singletz_array(ndarray[object] values): @@ -843,130 +883,104 @@ cpdef bint is_datetime_with_singletz_array(ndarray[object] values): return True +cdef class TimedeltaValidator(TemporalValidator): + + cdef bint is_value_typed(self, object value) except -1: + return PyDelta_Check(value) + + cdef bint is_valid_null(self, object value) except -1: + return is_null_timedelta64(value) + + cpdef bint is_timedelta_array(ndarray values): - cdef Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False - for i in range(n): - v = values[i] - if is_null_timedelta64(v): - # we are a regular null - if util._checknull(v): - null_count += 1 - elif not PyDelta_Check(v): - return False - return null_count != n + cdef: + TimedeltaValidator validator = TimedeltaValidator( + len(values), + skipna=True, + ) + return validator.validate(values) + + +cdef class Timedelta64Validator(TimedeltaValidator): + + cdef bint is_value_typed(self, object value) except -1: + return util.is_timedelta64_object(value) cpdef bint is_timedelta64_array(ndarray values): - cdef Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False - for i in range(n): - v = values[i] - if is_null_timedelta64(v): - # we are a 
regular null - if util._checknull(v): - null_count += 1 - elif not util.is_timedelta64_object(v): - return False - return null_count != n + cdef: + Timedelta64Validator validator = Timedelta64Validator( + len(values), + skipna=True, + ) + return validator.validate(values) + + +cdef class AnyTimedeltaValidator(TimedeltaValidator): + + cdef bint is_value_typed(self, object value) except -1: + return is_timedelta(value) cpdef bint is_timedelta_or_timedelta64_array(ndarray values): """ infer with timedeltas and/or nat/none """ - cdef Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False - for i in range(n): - v = values[i] - if is_null_timedelta64(v): - # we are a regular null - if util._checknull(v): - null_count += 1 - elif not is_timedelta(v): - return False - return null_count != n + cdef: + AnyTimedeltaValidator validator = AnyTimedeltaValidator( + len(values), + skipna=True, + ) + return validator.validate(values) + + +cdef class DateValidator(Validator): + + cdef bint is_value_typed(self, object value) except -1: + return is_date(value) cpdef bint is_date_array(ndarray[object] values, bint skipna=False): - cdef: - Py_ssize_t i, n = len(values) - object val + cdef DateValidator validator = DateValidator(len(values), skipna=skipna) + return validator.validate(values) - if n == 0: - return False - if skipna: - for i in range(n): - val = values[i] - if not util._checknull(val) and not is_date(val): - return False - else: - for i in range(n): - val = values[i] - if not is_date(val): - return False - return True +cdef class TimeValidator(Validator): + + cdef bint is_value_typed(self, object value) except -1: + return is_time(value) cpdef bint is_time_array(ndarray[object] values, bint skipna=False): - cdef: - Py_ssize_t i, n = len(values) - object val + cdef TimeValidator validator = TimeValidator(len(values), skipna=skipna) + return validator.validate(values) - if n == 0: - return False - if skipna: - for i in range(n): - val = 
values[i] - if not util._checknull(val) and not is_time(val): - return False - else: - for i in range(n): - val = values[i] - if not is_time(val): - return False - return True +cdef class PeriodValidator(TemporalValidator): + + cdef bint is_value_typed(self, object value) except -1: + return is_period(value) + + cdef bint is_valid_null(self, object value) except -1: + return is_null_period(value) cpdef bint is_period_array(ndarray[object] values): - cdef Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False + cdef PeriodValidator validator = PeriodValidator(len(values), skipna=True) + return validator.validate(values) - # return False for all nulls - for i in range(n): - v = values[i] - if is_null_period(v): - # we are a regular null - if util._checknull(v): - null_count += 1 - elif not is_period(v): - return False - return null_count != n + +cdef class IntervalValidator(Validator): + + cdef bint is_value_typed(self, object value) except -1: + return is_interval(value) cpdef bint is_interval_array(ndarray[object] values): cdef: - Py_ssize_t i, n = len(values), null_count = 0 - object v - - if n == 0: - return False - for i in range(n): - v = values[i] - if util._checknull(v): - null_count += 1 - continue - if not is_interval(v): - return False - return null_count != n + IntervalValidator validator = IntervalValidator( + len(values), + skipna=True, + ) + return validator.validate(values) cdef extern from "parse_helper.h": From 68b277bf312426227a51e13562f13b172f79a86c Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 24 Jul 2017 19:40:38 -0400 Subject: [PATCH 3/9] PERF: No negative index or out of bounds check --- pandas/_libs/src/inference.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 82c2b125badac..7ea506212868b 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -1,6 +1,7 @@ import sys from decimal import 
Decimal cimport util +cimport cython from tslib import NaT, get_timezone from datetime import datetime, timedelta iNaT = util.get_nat() @@ -623,6 +624,8 @@ cdef class Validator: else: return False + @cython.wraparound(False) + @cython.boundscheck(False) cdef bint _validate(self, object[:] values): cdef: Py_ssize_t i @@ -631,8 +634,11 @@ cdef class Validator: for i in range(n): if not self.is_valid(values[i]): return False + return self.finalize() + @cython.wraparound(False) + @cython.boundscheck(False) cdef bint _validate_skipna(self, object[:] values): cdef: Py_ssize_t i @@ -641,6 +647,7 @@ cdef class Validator: for i in range(n): if not self.is_valid_skipna(values[i]): return False + return self.finalize() cdef bint is_valid(self, object value) except -1: From b4552c35592b761ded460fd83fa2a8a7c7ce9c20 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 24 Jul 2017 20:04:27 -0400 Subject: [PATCH 4/9] DOC: versionadded tag --- pandas/_libs/src/inference.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 7ea506212868b..e01973a4962a9 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -234,6 +234,8 @@ def infer_dtype(object value, bint skipna=False): skipna : bool, default False Ignore NaN values when inferring the type. + .. versionadded:: 0.21.0 + Returns ------- string describing the common type of the input data. 
From ddfca5176dd0517a9160a9d17922ab7a32786e63 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 24 Jul 2017 20:13:56 -0400 Subject: [PATCH 5/9] PERF: Inline leaf classes' methods --- pandas/_libs/src/inference.pyx | 60 +++++++++++++++++----------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index e01973a4962a9..2b25b9029009c 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -612,7 +612,7 @@ cdef class Validator: self.dtype = dtype if dtype is not None else np.dtype(np.object_) self.skipna = skipna - cdef bint validate(self, object[:] values): + cdef inline bint validate(self, object[:] values): if not self.n: return False @@ -628,7 +628,7 @@ cdef class Validator: @cython.wraparound(False) @cython.boundscheck(False) - cdef bint _validate(self, object[:] values): + cdef inline bint _validate(self, object[:] values): cdef: Py_ssize_t i Py_ssize_t n = self.n @@ -641,7 +641,7 @@ cdef class Validator: @cython.wraparound(False) @cython.boundscheck(False) - cdef bint _validate_skipna(self, object[:] values): + cdef inline bint _validate_skipna(self, object[:] values): cdef: Py_ssize_t i Py_ssize_t n = self.n @@ -675,10 +675,10 @@ cdef class Validator: cdef class BoolValidator(Validator): - cdef bint is_value_typed(self, object value) except -1: + cdef inline bint is_value_typed(self, object value) except -1: return util.is_bool_object(value) - cdef bint is_array_typed(self) except -1: + cdef inline bint is_array_typed(self) except -1: return issubclass(self.dtype.type, np.bool_) @@ -694,10 +694,10 @@ cpdef bint is_bool_array(ndarray values, bint skipna=False): cdef class IntegerValidator(Validator): - cdef bint is_value_typed(self, object value) except -1: + cdef inline bint is_value_typed(self, object value) except -1: return util.is_integer_object(value) - cdef bint is_array_typed(self) except -1: + cdef inline bint is_array_typed(self) except 
-1: return issubclass(self.dtype.type, np.integer) @@ -712,10 +712,10 @@ cpdef bint is_integer_array(ndarray values): cdef class IntegerFloatValidator(Validator): - cdef bint is_value_typed(self, object value) except -1: + cdef inline bint is_value_typed(self, object value) except -1: return util.is_integer_object(value) or util.is_float_object(value) - cdef bint is_array_typed(self) except -1: + cdef inline bint is_array_typed(self) except -1: return issubclass(self.dtype.type, np.integer) @@ -730,10 +730,10 @@ cpdef bint is_integer_float_array(ndarray values): cdef class FloatValidator(Validator): - cdef bint is_value_typed(self, object value) except -1: + cdef inline bint is_value_typed(self, object value) except -1: return util.is_float_object(value) - cdef bint is_array_typed(self) except -1: + cdef inline bint is_array_typed(self) except -1: return issubclass(self.dtype.type, np.floating) @@ -744,10 +744,10 @@ cpdef bint is_float_array(ndarray values): cdef class StringValidator(Validator): - cdef bint is_value_typed(self, object value) except -1: + cdef inline bint is_value_typed(self, object value) except -1: return PyString_Check(value) - cdef bint is_array_typed(self) except -1: + cdef inline bint is_array_typed(self) except -1: return ( PY2 and issubclass(self.dtype.type, np.string_) ) or not PY2 and issubclass(self.dtype.type, np.unicode_) @@ -765,10 +765,10 @@ cpdef bint is_string_array(ndarray values, bint skipna=False): cdef class UnicodeValidator(Validator): - cdef bint is_value_typed(self, object value) except -1: + cdef inline bint is_value_typed(self, object value) except -1: return PyUnicode_Check(value) - cdef bint is_array_typed(self) except -1: + cdef inline bint is_array_typed(self) except -1: return issubclass(self.dtype.type, np.unicode_) @@ -784,10 +784,10 @@ cpdef bint is_unicode_array(ndarray values, bint skipna=False): cdef class BytesValidator(Validator): - cdef bint is_value_typed(self, object value) except -1: + cdef inline bint 
is_value_typed(self, object value) except -1: return PyBytes_Check(value) - cdef bint is_array_typed(self) except -1: + cdef inline bint is_array_typed(self) except -1: return issubclass(self.dtype.type, np.bytes_) @@ -811,7 +811,7 @@ cdef class TemporalValidator(Validator): self.skipna = skipna self.generic_null_count = 0 - cdef bint is_valid(self, object value) except -1: + cdef inline bint is_valid(self, object value) except -1: return self.is_value_typed(value) or self.is_valid_null(value) cdef bint is_value_typed(self, object value) except -1: @@ -820,14 +820,14 @@ cdef class TemporalValidator(Validator): cdef bint is_valid_null(self, object value) except -1: raise NotImplementedError() - cdef bint is_valid_skipna(self, object value) except -1: + cdef inline bint is_valid_skipna(self, object value) except -1: cdef: bint is_typed_null = self.is_valid_null(value) bint is_generic_null = util._checknull(value) self.generic_null_count += is_typed_null and is_generic_null return self.is_value_typed(value) or is_typed_null or is_generic_null - cdef bint finalize(self): + cdef inline bint finalize(self): return self.generic_null_count != self.n @@ -836,7 +836,7 @@ cdef class DatetimeValidator(TemporalValidator): cdef bint is_value_typed(self, object value) except -1: return is_datetime(value) - cdef bint is_valid_null(self, object value) except -1: + cdef inline bint is_valid_null(self, object value) except -1: return is_null_datetime64(value) @@ -851,7 +851,7 @@ cpdef bint is_datetime_array(ndarray[object] values): cdef class Datetime64Validator(DatetimeValidator): - cdef bint is_value_typed(self, object value) except -1: + cdef inline bint is_value_typed(self, object value) except -1: return util.is_datetime64_object(value) @@ -897,7 +897,7 @@ cdef class TimedeltaValidator(TemporalValidator): cdef bint is_value_typed(self, object value) except -1: return PyDelta_Check(value) - cdef bint is_valid_null(self, object value) except -1: + cdef inline bint 
is_valid_null(self, object value) except -1: return is_null_timedelta64(value) @@ -912,7 +912,7 @@ cpdef bint is_timedelta_array(ndarray values): cdef class Timedelta64Validator(TimedeltaValidator): - cdef bint is_value_typed(self, object value) except -1: + cdef inline bint is_value_typed(self, object value) except -1: return util.is_timedelta64_object(value) @@ -927,7 +927,7 @@ cpdef bint is_timedelta64_array(ndarray values): cdef class AnyTimedeltaValidator(TimedeltaValidator): - cdef bint is_value_typed(self, object value) except -1: + cdef inline bint is_value_typed(self, object value) except -1: return is_timedelta(value) @@ -943,7 +943,7 @@ cpdef bint is_timedelta_or_timedelta64_array(ndarray values): cdef class DateValidator(Validator): - cdef bint is_value_typed(self, object value) except -1: + cdef inline bint is_value_typed(self, object value) except -1: return is_date(value) @@ -954,7 +954,7 @@ cpdef bint is_date_array(ndarray[object] values, bint skipna=False): cdef class TimeValidator(Validator): - cdef bint is_value_typed(self, object value) except -1: + cdef inline bint is_value_typed(self, object value) except -1: return is_time(value) @@ -965,10 +965,10 @@ cpdef bint is_time_array(ndarray[object] values, bint skipna=False): cdef class PeriodValidator(TemporalValidator): - cdef bint is_value_typed(self, object value) except -1: + cdef inline bint is_value_typed(self, object value) except -1: return is_period(value) - cdef bint is_valid_null(self, object value) except -1: + cdef inline bint is_valid_null(self, object value) except -1: return is_null_period(value) @@ -979,7 +979,7 @@ cpdef bint is_period_array(ndarray[object] values): cdef class IntervalValidator(Validator): - cdef bint is_value_typed(self, object value) except -1: + cdef inline bint is_value_typed(self, object value) except -1: return is_interval(value) From 234110af84d121871f43730c7e67c68097afd6dc Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 24 Jul 2017 20:16:12 -0400 
Subject: [PATCH 6/9] PERF: Don't try to inline the larger methods --- pandas/_libs/src/inference.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 2b25b9029009c..78023bcc7f576 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -612,7 +612,7 @@ cdef class Validator: self.dtype = dtype if dtype is not None else np.dtype(np.object_) self.skipna = skipna - cdef inline bint validate(self, object[:] values): + cdef bint validate(self, object[:] values): if not self.n: return False @@ -628,7 +628,7 @@ cdef class Validator: @cython.wraparound(False) @cython.boundscheck(False) - cdef inline bint _validate(self, object[:] values): + cdef bint _validate(self, object[:] values): cdef: Py_ssize_t i Py_ssize_t n = self.n @@ -641,7 +641,7 @@ cdef class Validator: @cython.wraparound(False) @cython.boundscheck(False) - cdef inline bint _validate_skipna(self, object[:] values): + cdef bint _validate_skipna(self, object[:] values): cdef: Py_ssize_t i Py_ssize_t n = self.n From ca2e65ba36120b2684095419f674c2c2f3922d98 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Tue, 25 Jul 2017 09:57:29 -0400 Subject: [PATCH 7/9] DOC: typo --- doc/source/whatsnew/v0.21.0.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index d277e8d4b9434..5fd245cee7d88 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -24,8 +24,8 @@ New features `_ on most readers and writers (:issue:`13823`) - Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`, and :class:`~pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`) -- Added ``skipna`` parameter :func:`~pandas.api.types.infer_dtype` to support - type inference in the presence of missing values (:issue:`17059`). 
+- Added ``skipna`` parameter to :func:`~pandas.api.types.infer_dtype` to + support type inference in the presence of missing values (:issue:`17059`). .. _whatsnew_0210.enhancements.infer_objects: From 9a428360ecfda81a9f473c49c56ea33d6ff296c1 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Tue, 25 Jul 2017 10:32:12 -0400 Subject: [PATCH 8/9] REF: Address comments --- pandas/_libs/src/inference.pyx | 63 +++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 78023bcc7f576..ab19e66502848 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -232,7 +232,8 @@ def infer_dtype(object value, bint skipna=False): ---------- value : scalar, list, ndarray, or pandas type skipna : bool, default False - Ignore NaN values when inferring the type. + Ignore NaN values when inferring the type. The default of ``False`` + will be deprecated in a later version of pandas. .. 
versionadded:: 0.21.0 @@ -280,6 +281,9 @@ def infer_dtype(object value, bint skipna=False): >>> infer_dtype(['a', np.nan, 'b'], skipna=True) 'string' + >>> infer_dtype(['a', np.nan, 'b'], skipna=False) + 'mixed' + >>> infer_dtype([b'foo', b'bar']) 'bytes' @@ -323,7 +327,8 @@ def infer_dtype(object value, bint skipna=False): Py_ssize_t i, n object val ndarray values - bint seen_pdnat = False, seen_val = False + bint seen_pdnat = False + bint seen_val = False if isinstance(value, np.ndarray): values = value @@ -604,21 +609,26 @@ cdef class Validator: cdef: Py_ssize_t n - object dtype + np.dtype dtype bint skipna - def __cinit__(self, Py_ssize_t n, object dtype=None, bint skipna=False): + def __cinit__( + self, + Py_ssize_t n, + np.dtype dtype=np.dtype(np.object_), + bint skipna=False + ): self.n = n - self.dtype = dtype if dtype is not None else np.dtype(np.object_) + self.dtype = dtype self.skipna = skipna - cdef bint validate(self, object[:] values): + cdef bint validate(self, object[:] values) except -1: if not self.n: return False if self.is_array_typed(): return True - elif self.dtype.type == np.object_: + elif self.dtype.type_num == NPY_OBJECT: if self.skipna: return self._validate_skipna(values) else: @@ -628,7 +638,7 @@ cdef class Validator: @cython.wraparound(False) @cython.boundscheck(False) - cdef bint _validate(self, object[:] values): + cdef bint _validate(self, object[:] values) except -1: cdef: Py_ssize_t i Py_ssize_t n = self.n @@ -637,11 +647,11 @@ cdef class Validator: if not self.is_valid(values[i]): return False - return self.finalize() + return self.finalize_validate() @cython.wraparound(False) @cython.boundscheck(False) - cdef bint _validate_skipna(self, object[:] values): + cdef bint _validate_skipna(self, object[:] values) except -1: cdef: Py_ssize_t i Py_ssize_t n = self.n @@ -650,7 +660,7 @@ cdef class Validator: if not self.is_valid_skipna(values[i]): return False - return self.finalize() + return self.finalize_validate_skipna() cdef bint 
is_valid(self, object value) except -1: return self.is_value_typed(value) @@ -660,7 +670,9 @@ cdef class Validator: cdef bint is_value_typed(self, object value) except -1: raise NotImplementedError( - 'is_value_typed must be implemented in subclasses' + '{} child class must define is_value_typed'.format( + type(self).__name__ + ) ) cdef bint is_valid_null(self, object value) except -1: @@ -669,7 +681,12 @@ cdef class Validator: cdef bint is_array_typed(self) except -1: return False - cdef bint finalize(self): + cdef inline bint finalize_validate(self): + return True + + cdef bint finalize_validate_skipna(self): + # TODO(phillipc): Remove the existing validate methods and replace them + # with the skipna versions upon full deprecation of skipna=False return True @@ -805,20 +822,26 @@ cdef class TemporalValidator(Validator): cdef Py_ssize_t generic_null_count - def __cinit__(self, Py_ssize_t n, object dtype=None, bint skipna=False): + def __cinit__( + self, + Py_ssize_t n, + np.dtype dtype=np.dtype(np.object_), + bint skipna=False + ): self.n = n - self.dtype = dtype if dtype is not None else np.dtype(np.object_) + self.dtype = dtype self.skipna = skipna self.generic_null_count = 0 cdef inline bint is_valid(self, object value) except -1: return self.is_value_typed(value) or self.is_valid_null(value) - cdef bint is_value_typed(self, object value) except -1: - raise NotImplementedError() - cdef bint is_valid_null(self, object value) except -1: - raise NotImplementedError() + raise NotImplementedError( + '{} child class must define is_valid_null'.format( + type(self).__name__ + ) + ) cdef inline bint is_valid_skipna(self, object value) except -1: cdef: @@ -827,7 +850,7 @@ cdef class TemporalValidator(Validator): self.generic_null_count += is_typed_null and is_generic_null return self.is_value_typed(value) or is_typed_null or is_generic_null - cdef inline bint finalize(self): + cdef inline bint finalize_validate_skipna(self): return self.generic_null_count != self.n From 
18839270046b35e1f0bbd836723c479d37eaecbb Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Tue, 25 Jul 2017 12:04:05 -0400 Subject: [PATCH 9/9] CLN: Simplify is_array_typed for strings --- pandas/_libs/src/inference.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index ab19e66502848..6b5a8f20f0067 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -765,9 +765,7 @@ cdef class StringValidator(Validator): return PyString_Check(value) cdef inline bint is_array_typed(self) except -1: - return ( - PY2 and issubclass(self.dtype.type, np.string_) - ) or not PY2 and issubclass(self.dtype.type, np.unicode_) + return issubclass(self.dtype.type, np.str_) cpdef bint is_string_array(ndarray values, bint skipna=False):