Skip to content

Commit bb58640

Browse files
committed
Fix GH #17278
This was an assertion failure in regexec.c under rare circumstances. A reduction of the fuzzed test case is now in pat_advanced.t. The root cause was that the pattern being compiled was encoded in UTF-8 while 'use locale' was in effect (equivalent to the /l charset), and the charset was then reset inside the pattern to /d. But /d in a UTF-8 pattern is illegal, hence the later assertion failure. The solution is to reset to /u instead when the pattern is UTF-8.
1 parent 28df11c commit bb58640

File tree

2 files changed

+24
-5
lines changed

2 files changed

+24
-5
lines changed

regcomp.c

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,11 @@ struct RExC_state_t {
414414
} \
415415
} STMT_END
416416

417+
/* /u is to be chosen if we are supposed to use Unicode rules, or if the
418+
* pattern is in UTF-8. This latter condition is in case the outermost rules
419+
* are locale. See GH #17278 */
420+
#define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF)
421+
417422
/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
418423
* a flag that indicates we need to override /d with /u as a result of
419424
* something in the pattern. It should only be used in regard to calling
@@ -7736,7 +7741,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
77367741

77377742
rx_flags = orig_rx_flags;
77387743

7739-
if ( (UTF || RExC_uni_semantics)
7744+
if ( toUSE_UNI_CHARSET_NOT_DEPENDS
77407745
&& initial_charset == REGEX_DEPENDS_CHARSET)
77417746
{
77427747

@@ -10819,15 +10824,15 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
1081910824
RExC_parse++;
1082010825
has_use_defaults = TRUE;
1082110826
STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10822-
cs = (RExC_uni_semantics)
10827+
cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
1082310828
? REGEX_UNICODE_CHARSET
1082410829
: REGEX_DEPENDS_CHARSET;
1082510830
set_regex_charset(&RExC_flags, cs);
1082610831
}
1082710832
else {
1082810833
cs = get_regex_charset(RExC_flags);
1082910834
if ( cs == REGEX_DEPENDS_CHARSET
10830-
&& RExC_uni_semantics)
10835+
&& (toUSE_UNI_CHARSET_NOT_DEPENDS))
1083110836
{
1083210837
cs = REGEX_UNICODE_CHARSET;
1083310838
}
@@ -10911,7 +10916,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
1091110916
* pattern (or target, not known until runtime) are
1091210917
* utf8, or something in the pattern indicates unicode
1091310918
* semantics */
10914-
cs = (RExC_uni_semantics)
10919+
cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
1091510920
? REGEX_UNICODE_CHARSET
1091610921
: REGEX_DEPENDS_CHARSET;
1091710922
has_charset_modifier = DEPENDS_PAT_MOD;
@@ -12447,7 +12452,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
1244712452
/* restore original flags, but keep (?p) and, if we've encountered
1244812453
* something in the parse that changes /d rules into /u, keep the /u */
1244912454
RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12450-
if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
12455+
if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) {
1245112456
set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
1245212457
}
1245312458
if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {

t/re/pat_advanced.t

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2562,6 +2562,20 @@ EOF
25622562
{}, "GH #17734");
25632563
}
25642564

2565+
{ # GH #17278 assertion fails
2566+
fresh_perl_is('use locale;
2567+
my $A_grave = "\N{LATIN CAPITAL LETTER A WITH GRAVE}";
2568+
utf8::encode($A_grave);
2569+
my $a_grave = "\N{LATIN SMALL LETTER A WITH GRAVE}";
2570+
utf8::encode($a_grave);
2571+
2572+
my $z="q!$a_grave! =~ m!(?^i)[$A_grave]!";
2573+
utf8::decode($z);
2574+
print eval $z, "\n";',
2575+
1,
2576+
{}, "GH #17278");
2577+
}
2578+
25652579

25662580
# !!! NOTE that tests that aren't at all likely to crash perl should go
25672581
# a ways above, above these last ones. There's a comment there that, like

0 commit comments

Comments
 (0)