Skip to content

Commit 19a498a

Browse files
committed
fix #131649 - extended charclass can trigger assert
The extended charclass parser makes some assumptions during the first pass which are only true on well-structured input, and it does not properly catch various errors. Later on, the code assumes that anything the first pass lets through is valid, when in fact such input should trigger errors.
1 parent af05e4c commit 19a498a

File tree

6 files changed

+68
-32
lines changed

6 files changed

+68
-32
lines changed

pod/perldiag.pod

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5945,7 +5945,7 @@ yourself.
59455945
a perl4 interpreter, especially if the next 2 tokens are "use strict"
59465946
or "my $var" or "our $var".
59475947

5948-
=item Syntax error in (?[...]) in regex m/%s/
5948+
=item Syntax error in (?[...]) in regex; marked by <-- HERE in m/%s/
59495949

59505950
(F) Perl could not figure out what you meant inside this construct; this
59515951
notifies you that it is giving up trying.
@@ -6441,6 +6441,31 @@ to find out why that isn't happening.
64416441
(F) The unexec() routine failed for some reason. See your local FSF
64426442
representative, who probably put it there in the first place.
64436443

6444+
=item Unexpected ']' with no following ')' in (?[... in regex; marked by <-- HERE in m/%s/
6445+
6446+
(F) While parsing an extended character class a ']' character was encountered
6447+
at a point in the definition where the only legal use of ']' is to close the
6448+
character class definition as part of a '])'; you may have forgotten the close
6449+
paren, or otherwise confused the parser.
6450+
6451+
=item Expecting close paren for nested extended charclass in regex; marked by <-- HERE in m/%s/
6452+
6453+
(F) While parsing a nested extended character class like:
6454+
6455+
    (?[ ... (?flags:(?[ ... ])) ... ])
6456+
                             ^
6457+
6458+
we expected to see a close paren ')' (marked by ^) but did not.
6459+
6460+
=item Expecting close paren for wrapper for nested extended charclass in regex; marked by <-- HERE in m/%s/
6461+
6462+
(F) While parsing a nested extended character class like:
6463+
6464+
    (?[ ... (?flags:(?[ ... ])) ... ])
6465+
                              ^
6466+
6467+
we expected to see a close paren ')' (marked by ^) but did not.
6468+
64446469
=item Unexpected binary operator '%c' with no preceding operand in regex;
64456470
marked by S<<-- HERE> in m/%s/
64466471

pod/perlrecharclass.pod

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1128,8 +1128,8 @@ hence both of the following work:
11281128
Any contained POSIX character classes, including things like C<\w> and C<\D>
11291129
respect the C<E<sol>a> (and C<E<sol>aa>) modifiers.
11301130

1131-
C<< (?[ ]) >> is a regex-compile-time construct. Any attempt to use
1132-
something which isn't knowable at the time the containing regular
1131+
Note that C<< (?[ ]) >> is a regex-compile-time construct. Any attempt
1132+
to use something which isn't knowable at the time the containing regular
11331133
expression is compiled is a fatal error. In practice, this means
11341134
just three limitations:
11351135

regcomp.c

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14947,8 +14947,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
1494714947
TRUE /* Force /x */ );
1494814948

1494914949
switch (*RExC_parse) {
14950-
case '?':
14951-
if (RExC_parse[1] == '[') nest_depth++, RExC_parse++;
14950+
case '(':
14951+
if (RExC_parse[1] == '?' && RExC_parse[2] == '[')
14952+
nest_depth++, RExC_parse+=2;
1495214953
/* FALLTHROUGH */
1495314954
default:
1495414955
break;
@@ -15005,9 +15006,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
1500515006
}
1500615007

1500715008
case ']':
15008-
if (nest_depth--) break;
15009-
RExC_parse++;
15010-
if (*RExC_parse == ')') {
15009+
if (RExC_parse[1] == ')') {
15010+
RExC_parse++;
15011+
if (nest_depth--) break;
1501115012
node = reganode(pRExC_state, ANYOF, 0);
1501215013
RExC_size += ANYOF_SKIP;
1501315014
nextchar(pRExC_state);
@@ -15019,20 +15020,25 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
1501915020

1502015021
return node;
1502115022
}
15022-
goto no_close;
15023+
/* We output the messages even if warnings are off, because we'll fail
15024+
* the very next thing, and these give a likely diagnosis for that */
15025+
if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
15026+
output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
15027+
}
15028+
RExC_parse++;
15029+
vFAIL("Unexpected ']' with no following ')' in (?[...");
1502315030
}
1502415031

1502515032
RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
1502615033
}
1502715034

15028-
no_close:
1502915035
/* We output the messages even if warnings are off, because we'll fail
1503015036
* the very next thing, and these give a likely diagnosis for that */
1503115037
if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
1503215038
output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
1503315039
}
1503415040

15035-
FAIL("Syntax error in (?[...])");
15041+
vFAIL("Syntax error in (?[...])");
1503615042
}
1503715043

1503815044
/* Pass 2 only after this. */
@@ -15212,12 +15218,14 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
1521215218
* inversion list, and RExC_parse points to the trailing
1521315219
* ']'; the next character should be the ')' */
1521415220
RExC_parse++;
15215-
assert(UCHARAT(RExC_parse) == ')');
15221+
if (UCHARAT(RExC_parse) != ')')
15222+
vFAIL("Expecting close paren for nested extended charclass");
1521615223

1521715224
/* Then the ')' matching the original '(' handled by this
1521815225
* case: statement */
1521915226
RExC_parse++;
15220-
assert(UCHARAT(RExC_parse) == ')');
15227+
if (UCHARAT(RExC_parse) != ')')
15228+
vFAIL("Expecting close paren for wrapper for nested extended charclass");
1522115229

1522215230
RExC_parse++;
1522315231
RExC_flags = save_flags;

t/lib/warnings/regcomp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,21 +59,21 @@ Unmatched [ in regex; marked by <-- HERE in m/abc[ <-- HERE fi[.00./ at - line
5959
qr/(?[[[:word]]])/;
6060
EXPECT
6161
Assuming NOT a POSIX class since there is no terminating ':' in regex; marked by <-- HERE in m/(?[[[:word <-- HERE ]]])/ at - line 2.
62-
syntax error in (?[...]) in regex m/(?[[[:word]]])/ at - line 2.
62+
Unexpected ']' with no following ')' in (?[... in regex; marked by <-- HERE in m/(?[[[:word]] <-- HERE ])/ at - line 2.
6363
########
6464
# NAME qr/(?[ [[:digit: ])/
6565
# OPTION fatal
6666
qr/(?[[[:digit: ])/;
6767
EXPECT
6868
Assuming NOT a POSIX class since no blanks are allowed in one in regex; marked by <-- HERE in m/(?[[[:digit: ] <-- HERE )/ at - line 2.
69-
syntax error in (?[...]) in regex m/(?[[[:digit: ])/ at - line 2.
69+
syntax error in (?[...]) in regex; marked by <-- HERE in m/(?[[[:digit: ]) <-- HERE / at - line 2.
7070
########
7171
# NAME qr/(?[ [:digit: ])/
7272
# OPTION fatal
7373
qr/(?[[:digit: ])/
7474
EXPECT
7575
Assuming NOT a POSIX class since no blanks are allowed in one in regex; marked by <-- HERE in m/(?[[:digit: ] <-- HERE )/ at - line 2.
76-
syntax error in (?[...]) in regex m/(?[[:digit: ])/ at - line 2.
76+
syntax error in (?[...]) in regex; marked by <-- HERE in m/(?[[:digit: ]) <-- HERE / at - line 2.
7777
########
7878
# NAME [perl #126141]
7979
# OPTION fatal

t/re/reg_mesg.t

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -234,8 +234,9 @@ my @death =
234234
'/\b{gc}/' => "'gc' is an unknown bound type {#} m/\\b{gc{#}}/",
235235
'/\B{gc}/' => "'gc' is an unknown bound type {#} m/\\B{gc{#}}/",
236236

237-
'/(?[[[::]]])/' => "Syntax error in (?[...]) in regex m/(?[[[::]]])/",
238-
'/(?[[[:w:]]])/' => "Syntax error in (?[...]) in regex m/(?[[[:w:]]])/",
237+
238+
'/(?[[[::]]])/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[[[::]]{#}])/",
239+
'/(?[[[:w:]]])/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[[[:w:]]{#}])/",
239240
'/(?[[:w:]])/' => "",
240241
'/([.].*)[.]/' => "", # [perl #127582]
241242
'/[.].*[.]/' => "", # [perl #127604]
@@ -258,11 +259,12 @@ my @death =
258259
'/(?[ \p{foo} ])/' => 'Can\'t find Unicode property definition "foo" {#} m/(?[ \p{foo}{#} ])/',
259260
'/(?[ \p{ foo = bar } ])/' => 'Can\'t find Unicode property definition "foo = bar" {#} m/(?[ \p{ foo = bar }{#} ])/',
260261
'/(?[ \8 ])/' => 'Unrecognized escape \8 in character class {#} m/(?[ \8{#} ])/',
261-
'/(?[ \t ]/' => 'Syntax error in (?[...]) in regex m/(?[ \t ]/',
262-
'/(?[ [ \t ]/' => 'Syntax error in (?[...]) in regex m/(?[ [ \t ]/',
263-
'/(?[ \t ] ]/' => 'Syntax error in (?[...]) in regex m/(?[ \t ] ]/',
264-
'/(?[ [ ] ]/' => 'Syntax error in (?[...]) in regex m/(?[ [ ] ]/',
265-
'/(?[ \t + \e # This was supposed to be a comment ])/' => 'Syntax error in (?[...]) in regex m/(?[ \t + \e # This was supposed to be a comment ])/',
262+
'/(?[ \t ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[ \\t ]{#}/",
263+
'/(?[ [ \t ]/' => "Syntax error in (?[...]) {#} m/(?[ [ \\t ]{#}/",
264+
'/(?[ \t ] ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[ \\t ]{#} ]/",
265+
'/(?[ [ ] ]/' => "Syntax error in (?[...]) {#} m/(?[ [ ] ]{#}/",
266+
'/(?[ \t + \e # This was supposed to be a comment ])/' =>
267+
"Syntax error in (?[...]) {#} m/(?[ \\t + \\e # This was supposed to be a comment ]){#}/",
266268
'/(?[ ])/' => 'Incomplete expression within \'(?[ ])\' {#} m/(?[ {#}])/',
267269
'm/(?[[a-\d]])/' => 'False [] range "a-\d" {#} m/(?[[a-\d{#}]])/',
268270
'm/(?[[\w-x]])/' => 'False [] range "\w-" {#} m/(?[[\w-{#}x]])/',
@@ -452,10 +454,10 @@ my @death_utf8 = mark_as_utf8(
452454

453455
'/ネ\p{}ネ/' => 'Empty \p{} {#} m/ネ\p{{#}}ネ/',
454456

455-
'/ネ(?[[[:ネ]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ]]])ネ/",
456-
'/ネ(?[[[:ネ: ])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ: ])ネ/",
457-
'/ネ(?[[[::]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[::]]])ネ/",
458-
'/ネ(?[[[:ネ:]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ:]]])ネ/",
457+
'/ネ(?[[[:ネ]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[:ネ]]{#}])ネ/",
458+
'/ネ(?[[[:ネ: ])ネ/' => "Syntax error in (?[...]) {#} m/ネ(?[[[:ネ: ])ネ{#}/",
459+
'/ネ(?[[[::]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[::]]{#}])ネ/",
460+
'/ネ(?[[[:ネ:]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[:ネ:]]{#}])ネ/",
459461
'/ネ(?[[:ネ:]])ネ/' => "",
460462
'/ネ(?[ネ])ネ/' => 'Unexpected character {#} m/ネ(?[ネ{#}])ネ/',
461463
'/ネ(?[ + [ネ] ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/ネ(?[ +{#} [ネ] ])/',
@@ -468,8 +470,9 @@ my @death_utf8 = mark_as_utf8(
468470
'/(?[ \x{ネ} ])ネ/' => 'Non-hex character {#} m/(?[ \x{ネ{#}} ])ネ/',
469471
'/(?[ \p{ネ} ])/' => 'Can\'t find Unicode property definition "ネ" {#} m/(?[ \p{ネ}{#} ])/',
470472
'/(?[ \p{ ネ = bar } ])/' => 'Can\'t find Unicode property definition "ネ = bar" {#} m/(?[ \p{ ネ = bar }{#} ])/',
471-
'/ネ(?[ \t ]/' => 'Syntax error in (?[...]) in regex m/ネ(?[ \t ]/',
472-
'/(?[ \t + \e # ネ This was supposed to be a comment ])/' => 'Syntax error in (?[...]) in regex m/(?[ \t + \e # ネ This was supposed to be a comment ])/',
473+
'/ネ(?[ \t ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[ \\t ]{#}/",
474+
'/(?[ \t + \e # ネ This was supposed to be a comment ])/' =>
475+
"Syntax error in (?[...]) {#} m/(?[ \\t + \\e # ネ This was supposed to be a comment ]){#}/",
473476
'm/(*ネ)ネ/' => q<Unknown verb pattern 'ネ' {#} m/(*ネ){#}ネ/>,
474477
'/\cネ/' => "Character following \"\\c\" must be printable ASCII",
475478
'/\b{ネ}/' => "'ネ' is an unknown bound type {#} m/\\b{ネ{#}}/",

t/re/regex_sets.t

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -158,13 +158,13 @@ for my $char ("٠", "٥", "٩") {
158158
eval { $_ = '/(?[(\c]) /'; qr/$_/ };
159159
like($@, qr/^Syntax error/, '/(?[(\c]) / should not panic');
160160
eval { $_ = '(?[\c#]' . "\n])"; qr/$_/ };
161-
like($@, qr/^Syntax error/, '/(?[(\c]) / should not panic');
161+
like($@, qr/^Unexpected/, '/(?[(\c]) / should not panic');
162162
eval { $_ = '(?[(\c])'; qr/$_/ };
163163
like($@, qr/^Syntax error/, '/(?[(\c])/ should be a syntax error');
164164
eval { $_ = '(?[(\c]) ]\b'; qr/$_/ };
165-
like($@, qr/^Syntax error/, '/(?[(\c]) ]\b/ should be a syntax error');
165+
like($@, qr/^Unexpected/, '/(?[(\c]) ]\b/ should be a syntax error');
166166
eval { $_ = '(?[\c[]](])'; qr/$_/ };
167-
like($@, qr/^Syntax error/, '/(?[\c[]](])/ should be a syntax error');
167+
like($@, qr/^Unexpected/, '/(?[\c[]](])/ should be a syntax error');
168168
like("\c#", qr/(?[\c#])/, '\c# should match itself');
169169
like("\c[", qr/(?[\c[])/, '\c[ should match itself');
170170
like("\c\ ", qr/(?[\c\])/, '\c\ should match itself');

0 commit comments

Comments
(0)