Skip to content

Commit 3414adf

Browse files
committed
Fix for issue #21150, using a simple lock to prevent an issue with multiple threads accessing an Index
1 parent c8ce3d0 commit 3414adf

File tree

6 files changed

+44
-8
lines changed

6 files changed

+44
-8
lines changed

ci/azure-windows-27.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ dependencies:
66
- beautifulsoup4
77
- bottleneck
88
- dateutil
9+
- futures
910
- gcsfs
1011
- html5lib
1112
- jinja2=2.8

ci/circle-27-compat.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ channels:
55
dependencies:
66
- bottleneck=1.0.0
77
- cython=0.28.2
8+
- futures
89
- jinja2=2.8
910
- numexpr=2.4.4 # we test that we correctly don't use an unsupported numexpr
1011
- numpy=1.9.3

ci/travis-27-locale.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ channels:
55
dependencies:
66
- bottleneck=1.0.0
77
- cython=0.28.2
8+
- futures
89
- lxml
910
- matplotlib=1.4.3
1011
- numpy=1.9.3

ci/travis-27.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ dependencies:
88
- cython=0.28.2
99
- fastparquet
1010
- feather-format
11+
- futures
1112
- gcsfs
1213
- html5lib
1314
- ipython

pandas/_libs/index.pyx

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,18 @@ from pandas._libs import algos, hashtable as _hash
2323
from pandas._libs.tslibs import Timestamp, Timedelta, period as periodlib
2424
from pandas._libs.missing import checknull
2525

26+
# Python 2 vs Python 3
27+
try:
28+
from thread import allocate_lock as _thread_allocate_lock
29+
except ImportError:
30+
try:
31+
from _thread import allocate_lock as _thread_allocate_lock
32+
except ImportError:
33+
try:
34+
from dummy_thread import allocate_lock as _thread_allocate_lock
35+
except ImportError:
36+
from _dummy_thread import allocate_lock as _thread_allocate_lock
37+
2638
cdef int64_t iNaT = util.get_nat()
2739

2840

@@ -53,6 +65,9 @@ def get_value_box(arr: ndarray, loc: object) -> object:
5365
# Don't populate hash tables in monotonic indexes larger than this
5466
_SIZE_CUTOFF = 1000000
5567

68+
# Used in _ensure_mapping_populated to ensure is_unique behaves correctly
69+
# in multi-threaded code, see gh-21150
70+
_mapping_populated_lock = _thread_allocate_lock()
5671

5772
cdef class IndexEngine:
5873

@@ -236,17 +251,17 @@ cdef class IndexEngine:
236251

237252
cdef inline _ensure_mapping_populated(self):
238253
# this populates the mapping
239-
# if its not already populated
254+
# if it is not already populated
240255
# also satisfies the need_unique_check
241256

242-
if not self.is_mapping_populated:
243-
244-
values = self._get_index_values()
245-
self.mapping = self._make_hash_table(len(values))
246-
self._call_map_locations(values)
257+
with _mapping_populated_lock:
258+
if not self.is_mapping_populated:
259+
values = self._get_index_values()
260+
self.mapping = self._make_hash_table(len(values))
261+
self._call_map_locations(values)
247262

248-
if len(self.mapping) == len(values):
249-
self.unique = 1
263+
if len(self.mapping) == len(values):
264+
self.unique = 1
250265

251266
self.need_unique_check = 0
252267

pandas/tests/indexes/test_base.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@
3333
import pandas as pd
3434
from pandas._libs.tslib import Timestamp
3535

36+
from concurrent.futures import ThreadPoolExecutor
37+
3638

3739
class TestIndex(Base):
3840
_holder = Index
@@ -2509,6 +2511,21 @@ def test_ensure_index_from_sequences(self, data, names, expected):
25092511
tm.assert_index_equal(result, expected)
25102512

25112513

2514+
class TestThreadSafety(object):
2515+
2516+
@pytest.mark.slow
2517+
@pytest.mark.parametrize('execution_number', range(7))
2518+
def test_isunique(self, execution_number):
2519+
"""This test is executed seven times; each time it uses a pool of
2520+
two threads to run a test that is very likely to fail without the
2521+
fix for gh-21150. It is not a deterministic test, as there is
2522+
still a chance it will pass even though the bug exists. But
2523+
with the fix, it must always work with no issues."""
2524+
x = pd.date_range('2001', '2020')
2525+
with ThreadPoolExecutor(2) as p:
2526+
assert all(p.map(lambda x: x.is_unique, [x] * 2))
2527+
2528+
25122529
@pytest.mark.parametrize('opname', ['eq', 'ne', 'le', 'lt', 'ge', 'gt',
25132530
'add', 'radd', 'sub', 'rsub',
25142531
'mul', 'rmul', 'truediv', 'rtruediv',

0 commit comments

Comments
 (0)