Skip to content

Commit a9614fa

Browse files
committed
Merge remote-tracking branch 'upstream/master' into excel-fixture-cleanup
2 parents 8a53402 + 5b10e31 commit a9614fa

File tree

22 files changed

+517
-295
lines changed

22 files changed

+517
-295
lines changed

asv_bench/benchmarks/index_object.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,11 +196,20 @@ def setup(self, N):
196196
self.intv = IntervalIndex.from_arrays(left, right)
197197
self.intv._engine
198198

199+
self.left = IntervalIndex.from_breaks(np.arange(N))
200+
self.right = IntervalIndex.from_breaks(np.arange(N - 3, 2 * N - 3))
201+
199202
def time_monotonic_inc(self, N):
200203
self.intv.is_monotonic_increasing
201204

202205
def time_is_unique(self, N):
203206
self.intv.is_unique
204207

208+
def time_intersection(self, N):
209+
self.left.intersection(self.right)
210+
211+
def time_intersection_duplicate(self, N):
212+
self.intv.intersection(self.right)
213+
205214

206215
from .pandas_vb_common import setup # noqa: F401

doc/source/whatsnew/v0.25.0.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -502,14 +502,15 @@ Performance Improvements
502502
- Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is
503503
int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`)
504504
- Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`)
505-
- Improved performance when slicing :class:`RangeIndex` (:issue:`26565`)
505+
- Improved performance of slicing and other selected operations on a :class:`RangeIndex` (:issue:`26565`, :issue:`26617`)
506506
- Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`)
507507
- Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`)
508508
- Improved performance of :attr:`IntervalIndex.is_monotonic`, :attr:`IntervalIndex.is_monotonic_increasing` and :attr:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`)
509509
- Improved performance of :meth:`DataFrame.to_csv` when writing datetime dtypes (:issue:`25708`)
510510
- Improved performance of :meth:`read_csv` by much faster parsing of ``MM/YYYY`` and ``DD/MM/YYYY`` datetime formats (:issue:`25922`)
511511
- Improved performance of nanops for dtypes that cannot store NaNs. Speedup is particularly prominent for :meth:`Series.all` and :meth:`Series.any` (:issue:`25070`)
512512
- Improved performance of :meth:`Series.map` for dictionary mappers on categorical series by mapping the categories instead of mapping all values (:issue:`23785`)
513+
- Improved performance of :meth:`IntervalIndex.intersection` (:issue:`24813`)
513514
- Improved performance of :meth:`read_csv` by faster concatenating date columns without extra conversion to string for integer/float zero and float ``NaN``; by faster checking the string for the possibility of being a date (:issue:`25754`)
514515
- Improved performance of :attr:`IntervalIndex.is_unique` by removing conversion to ``MultiIndex`` (:issue:`24813`)
515516

@@ -537,6 +538,7 @@ Datetimelike
537538
- Bug in :func:`to_datetime` which does not replace the invalid argument with ``NaT`` when error is set to coerce (:issue:`26122`)
538539
- Bug in adding :class:`DateOffset` with nonzero month to :class:`DatetimeIndex` would raise ``ValueError`` (:issue:`26258`)
539540
- Bug in :func:`to_datetime` which raises unhandled ``OverflowError`` when called with mix of invalid dates and ``NaN`` values with ``format='%Y%m%d'`` and ``errors='coerce'`` (:issue:`25512`)
541+
- Bug in :func:`to_datetime` which raises ``TypeError`` for ``format='%Y%m%d'`` when called for invalid integer dates with length >= 6 digits with ``errors='ignore'``
540542

541543
Timedelta
542544
^^^^^^^^^

environment.yml

Lines changed: 53 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -9,59 +9,72 @@ dependencies:
99
- python-dateutil>=2.5.0
1010
- pytz
1111

12-
# development
12+
# benchmarks
1313
- asv
14+
15+
# building
1416
- cython>=0.28.2
17+
18+
# code checks
19+
- cpplint
1520
- flake8
16-
- flake8-comprehensions
17-
- flake8-rst>=0.6.0,<=0.7.0
18-
- gitpython
19-
- hypothesis>=3.82
20-
- ipywidgets
21-
- isort
22-
- moto
21+
- flake8-comprehensions # used by flake8, linting of unnecessary comprehensions
22+
- flake8-rst>=0.6.0,<=0.7.0 # linting of code blocks in rst files
23+
- isort # check that imports are in the right order
2324
- mypy
25+
- pycodestyle # used by flake8
26+
27+
# documentation
28+
- gitpython # obtain contributors from git for whatsnew
29+
- sphinx
30+
- numpydoc>=0.9.0
31+
32+
# documentation (jupyter notebooks)
2433
- nbconvert>=5.4.1
25-
- nbformat
26-
- notebook>=5.7.5
34+
- nbsphinx
2735
- pandoc
28-
- pycodestyle
29-
- pyqt
30-
- python-snappy
36+
37+
# testing
38+
- boto3
39+
- botocore>=1.11
40+
- hypothesis>=3.82
41+
- moto # mock S3
3142
- pytest>=4.0.2
43+
- pytest-cov
3244
- pytest-mock
33-
- sphinx
34-
- numpydoc>=0.9.0
45+
- pytest-xdist
46+
- seaborn
47+
- statsmodels
48+
49+
# unused (possibly required indirectly?)
50+
- ipywidgets
51+
- nbformat
52+
- notebook>=5.7.5
3553
- pip
3654

3755
# optional
38-
- beautifulsoup4>=4.2.1
3956
- blosc
40-
- botocore>=1.11
41-
- boto3
4257
- bottleneck>=1.2.1
43-
- fastparquet>=0.2.1
44-
- html5lib
45-
- ipython>=5.6.0
4658
- ipykernel
47-
- jinja2
48-
- lxml
49-
- matplotlib>=2.2.2
50-
- nbsphinx
59+
- ipython>=5.6.0
60+
- jinja2 # pandas.Styler
61+
- matplotlib>=2.2.2 # pandas.plotting, Series.plot, DataFrame.plot
5162
- numexpr>=2.6.8
52-
- openpyxl
53-
- pyarrow>=0.9.0
54-
- pytables>=3.4.2
55-
- pytest-cov
56-
- pytest-xdist
57-
- s3fs
5863
- scipy>=1.1
59-
- seaborn
60-
- sqlalchemy
61-
- statsmodels
62-
- xarray
63-
- xlrd
64-
- xlsxwriter
65-
- xlwt
66-
- pip:
67-
- cpplint
64+
65+
# optional for io
66+
- beautifulsoup4>=4.2.1 # pandas.read_html
67+
- fastparquet>=0.2.1 # pandas.read_parquet, DataFrame.to_parquet
68+
- html5lib # pandas.read_html
69+
- lxml # pandas.read_html
70+
- openpyxl # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
71+
- pyarrow>=0.9.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
72+
- pyqt # pandas.read_clipboard
73+
- pytables>=3.4.2 # pandas.read_hdf, DataFrame.to_hdf
74+
- python-snappy # required by pyarrow
75+
- s3fs # pandas.read_csv... when using 's3://...' path
76+
- sqlalchemy # pandas.read_sql, DataFrame.to_sql
77+
- xarray # DataFrame.to_xarray
78+
- xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
79+
- xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
80+
- xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile

pandas/_libs/tslibs/strptime.pyx

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -140,13 +140,13 @@ def array_strptime(object[:] values, object fmt,
140140
iresult[i] = NPY_NAT
141141
continue
142142
raise ValueError("time data %r does not match "
143-
"format %r (match)" % (values[i], fmt))
143+
"format %r (match)" % (val, fmt))
144144
if len(val) != found.end():
145145
if is_coerce:
146146
iresult[i] = NPY_NAT
147147
continue
148148
raise ValueError("unconverted data remains: %s" %
149-
values[i][found.end():])
149+
val[found.end():])
150150

151151
# search
152152
else:
@@ -156,7 +156,7 @@ def array_strptime(object[:] values, object fmt,
156156
iresult[i] = NPY_NAT
157157
continue
158158
raise ValueError("time data %r does not match format "
159-
"%r (search)" % (values[i], fmt))
159+
"%r (search)" % (val, fmt))
160160

161161
iso_year = -1
162162
year = 1900

pandas/conftest.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -378,12 +378,7 @@ def unique_nulls_fixture(request):
378378
FixedOffset(0), FixedOffset(-300), timezone.utc,
379379
timezone(timedelta(hours=1)),
380380
timezone(timedelta(hours=-1), name='foo')]
381-
TIMEZONE_IDS = ['None', 'UTC', 'US/Eastern', 'Asia/Tokyp',
382-
'dateutil/US/Pacific', 'dateutil/Asia/Singapore',
383-
'dateutil.tz.tzutz()', 'dateutil.tz.tzlocal()',
384-
'pytz.FixedOffset(300)', 'pytz.FixedOffset(0)',
385-
'pytz.FixedOffset(-300)', 'datetime.timezone.utc',
386-
'datetime.timezone.+1', 'datetime.timezone.-1.named']
381+
TIMEZONE_IDS = [repr(i) for i in TIMEZONES]
387382

388383

389384
@td.parametrize_fixture_doc(str(TIMEZONE_IDS))

pandas/core/arrays/integer.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -78,17 +78,6 @@ def construct_array_type(cls):
7878
"""
7979
return IntegerArray
8080

81-
@classmethod
82-
def construct_from_string(cls, string):
83-
"""
84-
Construction from a string, raise a TypeError if not
85-
possible
86-
"""
87-
if string == cls.name:
88-
return cls()
89-
raise TypeError("Cannot construct a '{}' from "
90-
"'{}'".format(cls, string))
91-
9281

9382
def integer_array(values, dtype=None, copy=False):
9483
"""

pandas/core/dtypes/base.py

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -172,17 +172,27 @@ def construct_array_type(cls):
172172
raise NotImplementedError
173173

174174
@classmethod
175-
def construct_from_string(cls, string):
176-
"""
177-
Attempt to construct this type from a string.
175+
def construct_from_string(cls, string: str):
176+
r"""
177+
Construct this type from a string.
178+
179+
This is useful mainly for data types that accept parameters.
180+
For example, a period dtype accepts a frequency parameter that
181+
can be set as ``period[H]`` (where H means hourly frequency).
182+
183+
By default, in the abstract class, just the name of the type is
184+
expected. But subclasses can override this method to accept
185+
parameters.
178186
179187
Parameters
180188
----------
181189
string : str
190+
The name of the type, for example ``category``.
182191
183192
Returns
184193
-------
185-
self : instance of 'cls'
194+
ExtensionDtype
195+
Instance of the dtype.
186196
187197
Raises
188198
------
@@ -191,21 +201,26 @@ def construct_from_string(cls, string):
191201
192202
Examples
193203
--------
194-
If the extension dtype can be constructed without any arguments,
195-
the following may be an adequate implementation.
204+
For extension dtypes with arguments the following may be an
205+
adequate implementation.
196206
197207
>>> @classmethod
198-
... def construct_from_string(cls, string)
199-
... if string == cls.name:
200-
... return cls()
208+
... def construct_from_string(cls, string):
209+
... pattern = re.compile(r"^my_type\[(?P<arg_name>.+)\]$")
210+
... match = pattern.match(string)
211+
... if match:
212+
... return cls(**match.groupdict())
201213
... else:
202214
... raise TypeError("Cannot construct a '{}' from "
203-
... "'{}'".format(cls, string))
215+
... "'{}'".format(cls.__name__, string))
204216
"""
205-
raise AbstractMethodError(cls)
217+
if string != cls.name:
218+
raise TypeError("Cannot construct a '{}' from '{}'".format(
219+
cls.__name__, string))
220+
return cls()
206221

207222
@classmethod
208-
def is_dtype(cls, dtype):
223+
def is_dtype(cls, dtype) -> bool:
209224
"""Check if we match 'dtype'.
210225
211226
Parameters

pandas/core/dtypes/common.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1931,8 +1931,6 @@ def _is_dtype_type(arr_or_dtype, condition):
19311931
if issubclass(arr_or_dtype, ExtensionDtype):
19321932
arr_or_dtype = arr_or_dtype.type
19331933
return condition(np.dtype(arr_or_dtype).type)
1934-
elif arr_or_dtype is None:
1935-
return condition(type(None))
19361934

19371935
# if we have an array-like
19381936
if hasattr(arr_or_dtype, 'dtype'):

pandas/core/dtypes/dtypes.py

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -440,19 +440,6 @@ def construct_array_type(cls):
440440
from pandas import Categorical
441441
return Categorical
442442

443-
@classmethod
444-
def construct_from_string(cls, string):
445-
"""
446-
attempt to construct this type from a string, raise a TypeError if
447-
it's not possible """
448-
try:
449-
if string == 'category':
450-
return cls()
451-
else:
452-
raise TypeError("cannot construct a CategoricalDtype")
453-
except AttributeError:
454-
pass
455-
456443
@staticmethod
457444
def validate_ordered(ordered):
458445
"""

pandas/core/indexes/base.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2440,9 +2440,7 @@ def _union(self, other, sort):
24402440
def _wrap_setop_result(self, other, result):
24412441
return self._constructor(result, name=get_op_result_name(self, other))
24422442

2443-
# TODO: standardize return type of non-union setops type(self vs other)
2444-
def intersection(self, other, sort=False):
2445-
"""
2443+
_index_shared_docs['intersection'] = """
24462444
Form the intersection of two Index objects.
24472445
24482446
This returns a new Index with elements common to the index and `other`.
@@ -2476,6 +2474,10 @@ def intersection(self, other, sort=False):
24762474
>>> idx1.intersection(idx2)
24772475
Int64Index([3, 4], dtype='int64')
24782476
"""
2477+
2478+
# TODO: standardize return type of non-union setops type(self vs other)
2479+
@Appender(_index_shared_docs['intersection'])
2480+
def intersection(self, other, sort=False):
24792481
self._validate_sort_keyword(sort)
24802482
self._assert_can_do_setop(other)
24812483
other = ensure_index(other)
@@ -4013,11 +4015,7 @@ def __contains__(self, key):
40134015

40144016
@Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
40154017
def contains(self, key):
4016-
hash(key)
4017-
try:
4018-
return key in self._engine
4019-
except (TypeError, ValueError):
4020-
return False
4018+
return key in self
40214019

40224020
def __hash__(self):
40234021
raise TypeError("unhashable type: %r" % type(self).__name__)

0 commit comments

Comments
 (0)