Skip to content

Commit 00ddd40

Browse files
committed
Fix for issue pandas-dev#21150, using a simple lock to prevent an issue with multiple threads accessing an Index
1 parent 322dbf4 commit 00ddd40

File tree

2 files changed

+47
-8
lines changed

2 files changed

+47
-8
lines changed

pandas/_libs/index.pyx

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,18 @@ from pandas._libs import algos, hashtable as _hash
2626
from pandas._libs.tslibs import Timestamp, Timedelta, period as periodlib
2727
from pandas._libs.missing import checknull
2828

29+
# Python 2 vs Python 3
30+
try:
31+
from thread import allocate_lock as _thread_allocate_lock
32+
except:
33+
try:
34+
from _thread import allocate_lock as _thread_allocate_lock
35+
except:
36+
try:
37+
from dummy_thread import allocate_lock as _thread_allocate_lock
38+
except:
39+
from _dummy_thread import allocate_lock as _thread_allocate_lock
40+
2941
cdef int64_t iNaT = util.get_nat()
3042

3143

@@ -72,6 +84,9 @@ cpdef object get_value_box(ndarray arr, object loc):
7284
# Don't populate hash tables in monotonic indexes larger than this
7385
_SIZE_CUTOFF = 1000000
7486

87+
# Used in _ensure_mapping_populated to ensure is_unique behaves correctly
88+
# in multi-threaded code, see gh-21150
89+
_mapping_populated_lock = _thread_allocate_lock()
7590

7691
cdef class IndexEngine:
7792

@@ -258,17 +273,17 @@ cdef class IndexEngine:
258273

259274
cdef inline _ensure_mapping_populated(self):
260275
# this populates the mapping
261-
# if its not already populated
276+
# if it is not already populated
262277
# also satisfies the need_unique_check
263278

264-
if not self.is_mapping_populated:
265-
266-
values = self._get_index_values()
267-
self.mapping = self._make_hash_table(len(values))
268-
self._call_map_locations(values)
279+
with _mapping_populated_lock:
280+
if not self.is_mapping_populated:
281+
values = self._get_index_values()
282+
self.mapping = self._make_hash_table(len(values))
283+
self._call_map_locations(values)
269284

270-
if len(self.mapping) == len(values):
271-
self.unique = 1
285+
if len(self.mapping) == len(values):
286+
self.unique = 1
272287

273288
self.need_unique_check = 0
274289

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Testing that indexes (and their engines) work correctly with multiple threads
4+
"""
5+
6+
import pytest
7+
import pandas as pd
8+
9+
from concurrent.futures import ThreadPoolExecutor
10+
11+
12+
class TestThreadSafety(object):
13+
14+
@pytest.mark.slow
15+
@pytest.mark.parametrize('execution_number', range(7))
16+
def test_isunique(self, execution_number):
17+
"""This test is executed seven times, each time it uses a pool of
18+
two threads to run a test that is very likely to fail without the
19+
fix for gh-21150. It is not a deterministic test, as there is
20+
still a chance it will pass even though the bug exists. But
21+
with the fix, it must always work with no issues."""
22+
x = pd.date_range('2001', '2020')
23+
with ThreadPoolExecutor(2) as p:
24+
assert all(p.map(lambda x: x.is_unique, [x] * 2))

0 commit comments

Comments
 (0)