diff --git a/ci/azure-windows-27.yaml b/ci/azure-windows-27.yaml index bcd9ddee1715e..1bb8921df0f46 100644 --- a/ci/azure-windows-27.yaml +++ b/ci/azure-windows-27.yaml @@ -6,6 +6,7 @@ dependencies: - beautifulsoup4 - bottleneck - dateutil + - futures - gcsfs - html5lib - jinja2=2.8 diff --git a/ci/circle-27-compat.yaml b/ci/circle-27-compat.yaml index 84ec7e20fc8f1..2c2252713bdf2 100644 --- a/ci/circle-27-compat.yaml +++ b/ci/circle-27-compat.yaml @@ -5,6 +5,7 @@ channels: dependencies: - bottleneck=1.0.0 - cython=0.28.2 + - futures - jinja2=2.8 - numexpr=2.4.4 # we test that we correctly don't use an unsupported numexpr - numpy=1.9.3 diff --git a/ci/travis-27-locale.yaml b/ci/travis-27-locale.yaml index aca65f27d4187..d579ff0f297c3 100644 --- a/ci/travis-27-locale.yaml +++ b/ci/travis-27-locale.yaml @@ -5,6 +5,7 @@ channels: dependencies: - bottleneck=1.0.0 - cython=0.28.2 + - futures - lxml - matplotlib=1.4.3 - numpy=1.9.3 diff --git a/ci/travis-27.yaml b/ci/travis-27.yaml index cc0c5a3192188..3aef1caac73ac 100644 --- a/ci/travis-27.yaml +++ b/ci/travis-27.yaml @@ -8,6 +8,7 @@ dependencies: - cython=0.28.2 - fastparquet - feather-format + - futures - gcsfs - html5lib - ipython diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 3f76915655f58..6645f33c8d8b8 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -23,6 +23,18 @@ from pandas._libs import algos, hashtable as _hash from pandas._libs.tslibs import Timestamp, Timedelta, period as periodlib from pandas._libs.missing import checknull +# Python 2 vs Python 3 +try: + from thread import allocate_lock as _thread_allocate_lock +except ImportError: + try: + from _thread import allocate_lock as _thread_allocate_lock + except ImportError: + try: + from dummy_thread import allocate_lock as _thread_allocate_lock + except ImportError: + from _dummy_thread import allocate_lock as _thread_allocate_lock + cdef int64_t iNaT = util.get_nat() @@ -53,6 +65,9 @@ def get_value_box(arr: ndarray, 
loc: object) -> object: # Don't populate hash tables in monotonic indexes larger than this _SIZE_CUTOFF = 1000000 +# Used in _ensure_mapping_populated to ensure is_unique behaves correctly +# in multi-threaded code, see gh-21150 +_mapping_populated_lock = _thread_allocate_lock() cdef class IndexEngine: @@ -236,17 +251,17 @@ cdef class IndexEngine: cdef inline _ensure_mapping_populated(self): # this populates the mapping - # if its not already populated + # if it is not already populated # also satisfies the need_unique_check - if not self.is_mapping_populated: - - values = self._get_index_values() - self.mapping = self._make_hash_table(len(values)) - self._call_map_locations(values) + with _mapping_populated_lock: + if not self.is_mapping_populated: + values = self._get_index_values() + self.mapping = self._make_hash_table(len(values)) + self._call_map_locations(values) - if len(self.mapping) == len(values): - self.unique = 1 + if len(self.mapping) == len(values): + self.unique = 1 self.need_unique_check = 0 diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index a753e925b0ed8..00bbe9b90aadc 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -33,6 +33,8 @@ import pandas as pd from pandas._libs.tslib import Timestamp +from concurrent.futures import ThreadPoolExecutor + class TestIndex(Base): _holder = Index @@ -2509,6 +2511,21 @@ def test_ensure_index_from_sequences(self, data, names, expected): tm.assert_index_equal(result, expected) +class TestThreadSafety(object): + + @pytest.mark.slow + @pytest.mark.parametrize('execution_number', range(7)) + def test_isunique(self, execution_number): + """This test is executed seven times, each time it uses a pool of + two threads to run a test that is very likely to fail without the + fix for gh-21150. It is not a deterministic test, as there is + still a chance it will pass even though the bug exists. 
But + with the fix, it must always work with no issues.""" + x = pd.date_range('2001', '2020') + with ThreadPoolExecutor(2) as p: + assert all(p.map(lambda x: x.is_unique, [x] * 2)) + + @pytest.mark.parametrize('opname', ['eq', 'ne', 'le', 'lt', 'ge', 'gt', 'add', 'radd', 'sub', 'rsub', 'mul', 'rmul', 'truediv', 'rtruediv',