Skip to content

Commit cc9f49b

Browse files
committed
Fix for issue #21150, using a simple lock to prevent an issue with multiple threads accessing an Index
1 parent 114f415 commit cc9f49b

File tree

6 files changed

+44
-8
lines changed

6 files changed

+44
-8
lines changed

ci/appveyor-27.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ dependencies:
66
- beautifulsoup4
77
- bottleneck
88
- dateutil
9+
- futures
910
- gcsfs
1011
- html5lib
1112
- jinja2=2.8

ci/circle-27-compat.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ channels:
55
dependencies:
66
- bottleneck=1.0.0
77
- cython=0.28.2
8+
- futures
89
- jinja2=2.8
910
- numexpr=2.4.4 # we test that we correctly don't use an unsupported numexpr
1011
- numpy=1.9.2

ci/travis-27-locale.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ channels:
55
dependencies:
66
- bottleneck=1.0.0
77
- cython=0.28.2
8+
- futures
89
- lxml
910
- matplotlib=1.4.3
1011
- numpy=1.9.2

ci/travis-27.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ dependencies:
99
- fastparquet
1010
- feather-format
1111
- flake8=3.4.1
12+
- futures
1213
- gcsfs
1314
- html5lib
1415
- ipython

pandas/_libs/index.pyx

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,18 @@ from pandas._libs import algos, hashtable as _hash
2626
from pandas._libs.tslibs import Timestamp, Timedelta, period as periodlib
2727
from pandas._libs.missing import checknull
2828

29+
# Python 2 vs Python 3
30+
try:
31+
from thread import allocate_lock as _thread_allocate_lock
32+
except:
33+
try:
34+
from _thread import allocate_lock as _thread_allocate_lock
35+
except:
36+
try:
37+
from dummy_thread import allocate_lock as _thread_allocate_lock
38+
except:
39+
from _dummy_thread import allocate_lock as _thread_allocate_lock
40+
2941
cdef int64_t iNaT = util.get_nat()
3042

3143

@@ -72,6 +84,9 @@ cpdef object get_value_box(ndarray arr, object loc):
7284
# Don't populate hash tables in monotonic indexes larger than this
7385
_SIZE_CUTOFF = 1000000
7486

87+
# Used in _ensure_mapping_populated to ensure is_unique behaves correctly
88+
# in multi-threaded code, see gh-21150
89+
_mapping_populated_lock = _thread_allocate_lock()
7590

7691
cdef class IndexEngine:
7792

@@ -258,17 +273,17 @@ cdef class IndexEngine:
258273

259274
cdef inline _ensure_mapping_populated(self):
260275
# this populates the mapping
261-
# if its not already populated
276+
# if it is not already populated
262277
# also satisfies the need_unique_check
263278

264-
if not self.is_mapping_populated:
265-
266-
values = self._get_index_values()
267-
self.mapping = self._make_hash_table(len(values))
268-
self._call_map_locations(values)
279+
with _mapping_populated_lock:
280+
if not self.is_mapping_populated:
281+
values = self._get_index_values()
282+
self.mapping = self._make_hash_table(len(values))
283+
self._call_map_locations(values)
269284

270-
if len(self.mapping) == len(values):
271-
self.unique = 1
285+
if len(self.mapping) == len(values):
286+
self.unique = 1
272287

273288
self.need_unique_check = 0
274289

pandas/tests/indexes/test_base.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
import pandas as pd
3333
from pandas._libs.tslib import Timestamp
3434

35+
from concurrent.futures import ThreadPoolExecutor
36+
3537

3638
class TestIndex(Base):
3739
_holder = Index
@@ -2493,6 +2495,21 @@ def test_ensure_index_from_sequences(self, data, names, expected):
24932495
tm.assert_index_equal(result, expected)
24942496

24952497

2498+
class TestThreadSafety(object):
2499+
2500+
@pytest.mark.slow
2501+
@pytest.mark.parametrize('execution_number', range(7))
2502+
def test_isunique(self, execution_number):
2503+
"""This test is executed seven times, each time it uses a pool of
2504+
two threads to run a test that is very likely to fail without the
2505+
fix for gh-21150. It is not a deterministic test, as there is
2506+
still a chance it will pass even though the bug exists. But
2507+
with the fix, it must always work with no issues."""
2508+
x = pd.date_range('2001', '2020')
2509+
with ThreadPoolExecutor(2) as p:
2510+
assert all(p.map(lambda x: x.is_unique, [x] * 2))
2511+
2512+
24962513
@pytest.mark.parametrize('opname', ['eq', 'ne', 'le', 'lt', 'ge', 'gt',
24972514
'add', 'radd', 'sub', 'rsub',
24982515
'mul', 'rmul', 'truediv', 'rtruediv',

0 commit comments

Comments
 (0)