Skip to content

Commit 356997c

Browse files
author
Ma Lin
authored
bpo-35859: Fix a few long-standing bugs in re engine (GH-12427)
In rare cases, capturing group could get wrong result. Regular expression engines in Perl and Java have similar bugs. The new behavior now matches the behavior of more modern RE engines: in the regex module and in PHP, Ruby and Node.js.
1 parent 7881549 commit 356997c

File tree

5 files changed

+152
-19
lines changed

5 files changed

+152
-19
lines changed

Doc/whatsnew/3.11.rst

+5
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,11 @@ Changes in the Python API
718718
deprecated since Python 3.6.
719719
(Contributed by Serhiy Storchaka in :issue:`47066`.)
720720

721+
* :mod:`re` module: Fix a few long-standing bugs where, in rare cases,
722+
capturing group could get wrong result. So the result may be different than
723+
before.
724+
(Contributed by Ma Lin in :issue:`35859`.)
725+
721726
* The *population* parameter of :func:`random.sample` must be a sequence.
722727
Automatic conversion of sets to lists is no longer supported. If the sample size
723728
is larger than the population size, a :exc:`ValueError` is raised.

Lib/test/test_re.py

+69
Original file line numberDiff line numberDiff line change
@@ -2033,6 +2033,75 @@ def test_bug_34294(self):
20332033
{'tag': 'foo', 'text': None},
20342034
{'tag': 'foo', 'text': None}])
20352035

2036+
def test_MARK_PUSH_macro_bug(self):
2037+
# issue35859, MARK_PUSH() macro didn't protect MARK-0 if it
2038+
# was the only available mark.
2039+
self.assertEqual(re.match(r'(ab|a)*?b', 'ab').groups(), ('a',))
2040+
self.assertEqual(re.match(r'(ab|a)+?b', 'ab').groups(), ('a',))
2041+
self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('a',))
2042+
self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('a',))
2043+
2044+
def test_MIN_UNTIL_mark_bug(self):
2045+
# Fixed in issue35859, reported in issue9134.
2046+
# JUMP_MIN_UNTIL_2 should MARK_PUSH() if in a repeat
2047+
s = 'axxzbcz'
2048+
p = r'(?:(?:a|bc)*?(xx)??z)*'
2049+
self.assertEqual(re.match(p, s).groups(), ('xx',))
2050+
2051+
# test-case provided by issue9134
2052+
s = 'xtcxyzxc'
2053+
p = r'((x|yz)+?(t)??c)*'
2054+
m = re.match(p, s)
2055+
self.assertEqual(m.span(), (0, 8))
2056+
self.assertEqual(m.span(2), (6, 7))
2057+
self.assertEqual(m.groups(), ('xyzxc', 'x', 't'))
2058+
2059+
def test_REPEAT_ONE_mark_bug(self):
2060+
# issue35859
2061+
# JUMP_REPEAT_ONE_1 should MARK_PUSH() if in a repeat
2062+
s = 'aabaab'
2063+
p = r'(?:[^b]*a(?=(b)|(a))ab)*'
2064+
m = re.match(p, s)
2065+
self.assertEqual(m.span(), (0, 6))
2066+
self.assertEqual(m.span(2), (4, 5))
2067+
self.assertEqual(m.groups(), (None, 'a'))
2068+
2069+
# JUMP_REPEAT_ONE_2 should MARK_PUSH() if in a repeat
2070+
s = 'abab'
2071+
p = r'(?:[^b]*(?=(b)|(a))ab)*'
2072+
m = re.match(p, s)
2073+
self.assertEqual(m.span(), (0, 4))
2074+
self.assertEqual(m.span(2), (2, 3))
2075+
self.assertEqual(m.groups(), (None, 'a'))
2076+
2077+
self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('a',))
2078+
2079+
def test_MIN_REPEAT_ONE_mark_bug(self):
2080+
# issue35859
2081+
# JUMP_MIN_REPEAT_ONE should MARK_PUSH() if in a repeat
2082+
s = 'abab'
2083+
p = r'(?:.*?(?=(a)|(b))b)*'
2084+
m = re.match(p, s)
2085+
self.assertEqual(m.span(), (0, 4))
2086+
self.assertEqual(m.span(2), (3, 4))
2087+
self.assertEqual(m.groups(), (None, 'b'))
2088+
2089+
s = 'axxzaz'
2090+
p = r'(?:a*?(xx)??z)*'
2091+
self.assertEqual(re.match(p, s).groups(), ('xx',))
2092+
2093+
def test_ASSERT_NOT_mark_bug(self):
2094+
# Fixed in issue35859, reported in issue725149.
2095+
# JUMP_ASSERT_NOT should LASTMARK_SAVE()
2096+
self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), (None,))
2097+
2098+
# JUMP_ASSERT_NOT should MARK_PUSH() if in a repeat
2099+
m = re.match(r'((?!(ab)c)(.))*', 'abab')
2100+
self.assertEqual(m.span(), (0, 4))
2101+
self.assertEqual(m.span(1), (3, 4))
2102+
self.assertEqual(m.span(3), (3, 4))
2103+
self.assertEqual(m.groups(), ('b', None, 'b'))
2104+
20362105
def test_bug_40736(self):
20372106
with self.assertRaisesRegex(TypeError, "got 'int'"):
20382107
re.search("x*", 5)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
:mod:`re` module, fix a few bugs about capturing group. In rare cases,
2+
capturing group gets an incorrect string. Patch by Ma Lin.

Modules/_sre.c

+17
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,14 @@ state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
532532
} else {
533533
i = STATE_OFFSET(state, state->mark[index]);
534534
j = STATE_OFFSET(state, state->mark[index+1]);
535+
536+
/* check wrong span */
537+
if (i > j) {
538+
PyErr_SetString(PyExc_SystemError,
539+
"The span of capturing group is wrong,"
540+
" please report a bug for the re module.");
541+
return NULL;
542+
}
535543
}
536544

537545
return getslice(state->isbytes, state->beginning, string, i, j);
@@ -2477,6 +2485,15 @@ pattern_new_match(_sremodulestate* module_state,
24772485
if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
24782486
match->mark[j+2] = ((char*) state->mark[j] - base) / n;
24792487
match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
2488+
2489+
/* check wrong span */
2490+
if (match->mark[j+2] > match->mark[j+3]) {
2491+
PyErr_SetString(PyExc_SystemError,
2492+
"The span of capturing group is wrong,"
2493+
" please report a bug for the re module.");
2494+
Py_DECREF(match);
2495+
return NULL;
2496+
}
24802497
} else
24812498
match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
24822499

Modules/sre_lib.h

+59-19
Original file line numberDiff line numberDiff line change
@@ -449,20 +449,20 @@ do { \
449449
DATA_STACK_LOOKUP_AT(state,t,p,pos)
450450

451451
#define MARK_PUSH(lastmark) \
452-
do if (lastmark > 0) { \
452+
do if (lastmark >= 0) { \
453453
i = lastmark; /* ctx->lastmark may change if reallocated */ \
454454
DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
455455
} while (0)
456456
#define MARK_POP(lastmark) \
457-
do if (lastmark > 0) { \
457+
do if (lastmark >= 0) { \
458458
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
459459
} while (0)
460460
#define MARK_POP_KEEP(lastmark) \
461-
do if (lastmark > 0) { \
461+
do if (lastmark >= 0) { \
462462
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
463463
} while (0)
464464
#define MARK_POP_DISCARD(lastmark) \
465-
do if (lastmark > 0) { \
465+
do if (lastmark >= 0) { \
466466
DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
467467
} while (0)
468468

@@ -770,8 +770,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
770770
/* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
771771
TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
772772
LASTMARK_SAVE();
773-
ctx->u.rep = state->repeat;
774-
if (ctx->u.rep)
773+
if (state->repeat)
775774
MARK_PUSH(ctx->lastmark);
776775
for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
777776
if (ctx->pattern[1] == SRE_OP_LITERAL &&
@@ -786,16 +785,16 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
786785
state->ptr = ctx->ptr;
787786
DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
788787
if (ret) {
789-
if (ctx->u.rep)
788+
if (state->repeat)
790789
MARK_POP_DISCARD(ctx->lastmark);
791790
RETURN_ON_ERROR(ret);
792791
RETURN_SUCCESS;
793792
}
794-
if (ctx->u.rep)
793+
if (state->repeat)
795794
MARK_POP_KEEP(ctx->lastmark);
796795
LASTMARK_RESTORE();
797796
}
798-
if (ctx->u.rep)
797+
if (state->repeat)
799798
MARK_POP_DISCARD(ctx->lastmark);
800799
RETURN_FAILURE;
801800

@@ -841,6 +840,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
841840
}
842841

843842
LASTMARK_SAVE();
843+
if (state->repeat)
844+
MARK_PUSH(ctx->lastmark);
844845

845846
if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
846847
/* tail starts with a literal. skip positions where
@@ -858,30 +859,41 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
858859
DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
859860
ctx->pattern+ctx->pattern[0]);
860861
if (ret) {
862+
if (state->repeat)
863+
MARK_POP_DISCARD(ctx->lastmark);
861864
RETURN_ON_ERROR(ret);
862865
RETURN_SUCCESS;
863866
}
864-
867+
if (state->repeat)
868+
MARK_POP_KEEP(ctx->lastmark);
865869
LASTMARK_RESTORE();
866870

867871
ctx->ptr--;
868872
ctx->count--;
869873
}
870-
874+
if (state->repeat)
875+
MARK_POP_DISCARD(ctx->lastmark);
871876
} else {
872877
/* general case */
873878
while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
874879
state->ptr = ctx->ptr;
875880
DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
876881
ctx->pattern+ctx->pattern[0]);
877882
if (ret) {
883+
if (state->repeat)
884+
MARK_POP_DISCARD(ctx->lastmark);
878885
RETURN_ON_ERROR(ret);
879886
RETURN_SUCCESS;
880887
}
888+
if (state->repeat)
889+
MARK_POP_KEEP(ctx->lastmark);
890+
LASTMARK_RESTORE();
891+
881892
ctx->ptr--;
882893
ctx->count--;
883-
LASTMARK_RESTORE();
884894
}
895+
if (state->repeat)
896+
MARK_POP_DISCARD(ctx->lastmark);
885897
}
886898
RETURN_FAILURE;
887899

@@ -930,15 +942,24 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
930942
} else {
931943
/* general case */
932944
LASTMARK_SAVE();
945+
if (state->repeat)
946+
MARK_PUSH(ctx->lastmark);
947+
933948
while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
934949
|| ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
935950
state->ptr = ctx->ptr;
936951
DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
937952
ctx->pattern+ctx->pattern[0]);
938953
if (ret) {
954+
if (state->repeat)
955+
MARK_POP_DISCARD(ctx->lastmark);
939956
RETURN_ON_ERROR(ret);
940957
RETURN_SUCCESS;
941958
}
959+
if (state->repeat)
960+
MARK_POP_KEEP(ctx->lastmark);
961+
LASTMARK_RESTORE();
962+
942963
state->ptr = ctx->ptr;
943964
ret = SRE(count)(state, ctx->pattern+3, 1);
944965
RETURN_ON_ERROR(ret);
@@ -948,8 +969,9 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
948969
assert(ret == 1);
949970
ctx->ptr++;
950971
ctx->count++;
951-
LASTMARK_RESTORE();
952972
}
973+
if (state->repeat)
974+
MARK_POP_DISCARD(ctx->lastmark);
953975
}
954976
RETURN_FAILURE;
955977

@@ -1098,8 +1120,9 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
10981120
tail matches */
10991121
state->repeat = ctx->u.rep->prev;
11001122
DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1123+
state->repeat = ctx->u.rep; // restore repeat before return
1124+
11011125
RETURN_ON_SUCCESS(ret);
1102-
state->repeat = ctx->u.rep;
11031126
state->ptr = ctx->ptr;
11041127
RETURN_FAILURE;
11051128

@@ -1132,21 +1155,29 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
11321155
RETURN_FAILURE;
11331156
}
11341157

1135-
LASTMARK_SAVE();
1136-
11371158
/* see if the tail matches */
11381159
state->repeat = ctx->u.rep->prev;
1160+
1161+
LASTMARK_SAVE();
1162+
if (state->repeat)
1163+
MARK_PUSH(ctx->lastmark);
1164+
11391165
DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1166+
SRE_REPEAT *repeat_of_tail = state->repeat;
1167+
state->repeat = ctx->u.rep; // restore repeat before return
1168+
11401169
if (ret) {
1170+
if (repeat_of_tail)
1171+
MARK_POP_DISCARD(ctx->lastmark);
11411172
RETURN_ON_ERROR(ret);
11421173
RETURN_SUCCESS;
11431174
}
1175+
if (repeat_of_tail)
1176+
MARK_POP(ctx->lastmark);
1177+
LASTMARK_RESTORE();
11441178

1145-
state->repeat = ctx->u.rep;
11461179
state->ptr = ctx->ptr;
11471180

1148-
LASTMARK_RESTORE();
1149-
11501181
if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
11511182
&& ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
11521183
state->ptr == ctx->u.rep->last_ptr)
@@ -1444,11 +1475,20 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
14441475
ctx->ptr, ctx->pattern[1]));
14451476
if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) {
14461477
state->ptr = ctx->ptr - ctx->pattern[1];
1478+
LASTMARK_SAVE();
1479+
if (state->repeat)
1480+
MARK_PUSH(ctx->lastmark);
1481+
14471482
DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
14481483
if (ret) {
1484+
if (state->repeat)
1485+
MARK_POP_DISCARD(ctx->lastmark);
14491486
RETURN_ON_ERROR(ret);
14501487
RETURN_FAILURE;
14511488
}
1489+
if (state->repeat)
1490+
MARK_POP(ctx->lastmark);
1491+
LASTMARK_RESTORE();
14521492
}
14531493
ctx->pattern += ctx->pattern[0];
14541494
break;

0 commit comments

Comments
 (0)