Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
f738245
1. add asserts
wjssz Mar 8, 2019
0d1e658
2. macro MARK_PUSH() bug
wjssz Mar 8, 2019
cbc10fc
3. restore state->repeat at the first opportunity
wjssz Mar 14, 2019
1043bc6
4. JUMP_MIN_UNTIL_2 should MARK_PUSH() if in a repeat
wjssz Mar 14, 2019
8e1407e
5. JUMP_REPEAT_ONE_[12] should MARK_PUSH() if in a repeat
wjssz Mar 14, 2019
5772bb1
6. JUMP_MIN_REPEAT_ONE should MARK_PUSH() if in a repeat
wjssz Mar 14, 2019
738ed31
7. JUMP_ASSERT_NOT should LASTMARK_SAVE()
wjssz Mar 14, 2019
abde0e9
8. JUMP_ASSERT_NOT should MARK_PUSH() if in a repeat
wjssz Mar 14, 2019
32b9797
9. reduce sizeof(match_context)
wjssz Mar 14, 2019
7ff501b
10. limit max group to 1,073,741,823
wjssz Mar 14, 2019
2b359f3
11. raise RuntimeError if the span of capturing group is wrong
wjssz Mar 14, 2019
3ae8e69
12. clean error process code
wjssz Mar 14, 2019
5b3e43d
Merge branch 'master' into Solution_A_repeat
animalize Apr 13, 2020
c84d7ed
Address review comments
wjssz May 1, 2020
b00c6e3
Merge branch 'main' into Solution_A_repeat
serhiy-storchaka Mar 21, 2022
d671c44
fix NEWS file
wjssz Mar 21, 2022
ccbdb92
Revert "10. limit max group to 1,073,741,823"
wjssz Mar 21, 2022
18bb63b
Revert "9. reduce sizeof(match_context)"
wjssz Mar 21, 2022
e084e64
split unit-tests
wjssz Mar 21, 2022
94a4fdc
Revert "1. add asserts"
wjssz Mar 21, 2022
49d00d2
add .span(group_num) to unit-tests
wjssz Mar 22, 2022
785d9bf
Revert "12. clean error process code"
wjssz Mar 28, 2022
3fe33a8
Porting to Python 3.11
wjssz Mar 29, 2022
d6c07b6
Merge branch 'main' into Solution_A_repeat
animalize Mar 29, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -2104,6 +2104,59 @@ def test_bug_34294(self):
{'tag': 'foo', 'text': None},
{'tag': 'foo', 'text': None}])

def test_bug_35859(self):
# MARK_PUSH() macro didn't protect MARK-0
# if it was the only available mark.
self.assertEqual(re.match(r'(ab|a)*?b', 'ab').groups(), ('a',))
self.assertEqual(re.match(r'(ab|a)+?b', 'ab').groups(), ('a',))
self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('a',))
self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('a',))

# JUMP_MIN_UNTIL_2 should MARK_PUSH() if in a repeat
s = 'axxzbcz'
p = r'(?:(?:a|bc)*?(xx)??z)*'
self.assertEqual(re.match(p, s).groups(), ('xx',))
# test-case provided by issue9134
s = 'xtcxyzxc'
p = r'((x|yz)+?(t)??c)*'
self.assertEqual(re.match(p, s).groups(), ('xyzxc', 'x', 't'))

# JUMP_REPEAT_ONE_1 should MARK_PUSH() if in a repeat
s = 'aabaab'
p = r'(?:[^b]*a(?=(b)|(a))ab)*'
m = re.match(p, s)
self.assertEqual(m.span(), (0, 6))
self.assertEqual(m.groups(), (None, 'a'))

# JUMP_REPEAT_ONE_2 should MARK_PUSH() if in a repeat
s = 'abab'
p = r'(?:[^b]*(?=(b)|(a))ab)*'
m = re.match(p, s)
self.assertEqual(m.span(), (0, 4))
self.assertEqual(m.groups(), (None, 'a'))

self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('a',))

# JUMP_MIN_REPEAT_ONE should MARK_PUSH() if in a repeat
s = 'abab'
p = r'(?:.*?(?=(a)|(b))b)*'
m = re.match(p, s)
self.assertEqual(m.span(), (0, 4))
self.assertEqual(m.groups(), (None, 'b'))

s = 'axxzaz'
p = r'(?:a*?(xx)??z)*'
self.assertEqual(re.match(p, s).groups(), ('xx',))

# JUMP_ASSERT_NOT should LASTMARK_SAVE()
# reported in issue725149
self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), (None,))

# JUMP_ASSERT_NOT should MARK_PUSH() if in a repeat
m = re.match(r'((?!(ab)c)(.))*', 'abab')
self.assertEqual(m.span(), (0, 4))
self.assertEqual(m.groups(), ('b', None, 'b'))


class PatternReprTests(unittest.TestCase):
def check(self, pattern, expected):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix a few bugs in the re module related to capturing groups. In rare cases, a
capturing group could get an incorrect string. Patch by Ma Lin.
25 changes: 17 additions & 8 deletions Modules/_sre.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ static const char copyright[] =
/* error codes */
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
#define SRE_ERROR_STATE -2 /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
#define SRE_ERROR_MEMORY -9 /* out of memory */
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */

Expand Down Expand Up @@ -511,6 +510,14 @@ state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
} else {
i = STATE_OFFSET(state, state->mark[index]);
j = STATE_OFFSET(state, state->mark[index+1]);

/* check wrong span */
if (i > j) {
PyErr_SetString(PyExc_RuntimeError,
"the span of capturing group is wrong,"
" please report a bug.");
return NULL;
}
}

return getslice(state->isbytes, state->beginning, string, i, j);
Expand All @@ -520,13 +527,6 @@ static void
pattern_error(Py_ssize_t status)
{
switch (status) {
case SRE_ERROR_RECURSION_LIMIT:
/* This error code seems to be unused. */
PyErr_SetString(
PyExc_RecursionError,
"maximum recursion limit exceeded"
);
break;
case SRE_ERROR_MEMORY:
PyErr_NoMemory();
break;
Expand Down Expand Up @@ -2353,6 +2353,15 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state, Py_ssize_t status)
if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
match->mark[j+2] = ((char*) state->mark[j] - base) / n;
match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;

/* check wrong span */
if (match->mark[j+2] > match->mark[j+3]) {
PyErr_SetString(PyExc_RuntimeError,
"the span of capturing group is wrong,"
" please report a bug.");
Py_DECREF(match);
return NULL;
}
} else
match->mark[j+2] = match->mark[j+3] = -1; /* undefined */

Expand Down
14 changes: 8 additions & 6 deletions Modules/sre.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@
/* size of a code word (must be unsigned short or larger, and
large enough to hold a UCS4 character) */
#define SRE_CODE Py_UCS4

/* SRE_MAXGROUPS is 1,073,741,823 */
#define SRE_MAXGROUPS INT_MAX / 2

#if SIZEOF_SIZE_T > 4
# define SRE_MAXREPEAT (~(SRE_CODE)0)
# define SRE_MAXGROUPS ((~(SRE_CODE)0) / 2)
#else
# define SRE_MAXREPEAT ((SRE_CODE)PY_SSIZE_T_MAX)
# define SRE_MAXGROUPS ((SRE_CODE)PY_SSIZE_T_MAX / SIZEOF_SIZE_T / 2)
#endif

typedef struct {
Expand Down Expand Up @@ -71,18 +73,18 @@ typedef struct {
Py_ssize_t pos, endpos;
int isbytes;
int charsize; /* character size */
/* current repeat context */
SRE_REPEAT *repeat;
/* registers */
Py_ssize_t lastindex;
Py_ssize_t lastmark;
int32_t lastmark;
int32_t lastindex;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why? If you just want to save some memory, it is better to leave it for another issue. Some local variables can change their type too.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it can be changed in another issue.

const void** mark;
int match_all;
int must_advance;
/* dynamically allocated stuff */
char* data_stack;
size_t data_stack_size;
size_t data_stack_base;
/* current repeat context */
SRE_REPEAT *repeat;
} SRE_STATE;

typedef struct {
Expand Down
Loading