From e8c3bacf5f546a49f701d359f26afd77d1458f60 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 29 Apr 2020 10:47:44 -0600 Subject: [PATCH 1/2] regcomp.c: Add comments --- regcomp.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/regcomp.c b/regcomp.c index 6582a196d6b9..a4c7331f70c6 100644 --- a/regcomp.c +++ b/regcomp.c @@ -21491,7 +21491,9 @@ SV * Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ /* Returns an SV containing a string that must appear in the target for it - * to match */ + * to match, or NULL if nothing is known that must match. + * + * CAUTION: the SV can be freed during execution of the regex engine */ struct regexp *const prog = ReANY(r); DECLARE_AND_GET_RE_DEBUG_FLAGS; @@ -25078,6 +25080,12 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */ /* Compile the subpattern consisting of the name being looked for */ subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ ); must = re_intuit_string(subpattern_re); + + /* (Note: 'must' could contain a NUL. And yet we use strspn() below on it. + * This works because the NUL causes the function to return early, thus + * showing that there are characters in it other than the acceptable ones, + * which is our desired result.) */ + prog = ReANY(subpattern_re); /* If only nothing is matched, skip to where empty names are looked for */ From 4d3640109bda059b8482fd71a043ac4586a10a50 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 21 Apr 2020 17:30:42 -0600 Subject: [PATCH 2/2] regcomp.c: Avoid use after free It turns out that the SV returned by re_intuit_string() may be freed by future calls to re_intuit_start(). Thus, the caller doesn't get clear title to the returned SV. (This wasn't documented until the commit immediately prior to this one.) Cope with this situation by making a mortalized copy. This commit also changes to use the copy's PV directly, simplifying some 'if' statements. re_intuit_string() is effectively in the API, as it is an element in the regex engine structure, callable by anyone. It should not be returning a tenuous SV. That returned scalar should not freed before the pattern it is for is freed. It is too late in the development cycle to change this, so this workaround is presented instead for 5.32. This fixes #17734. --- regcomp.c | 31 ++++++++++++++++++------------- t/re/pat_advanced.t | 9 +++++++++ 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/regcomp.c b/regcomp.c index a4c7331f70c6..8dd5c50da816 100644 --- a/regcomp.c +++ b/regcomp.c @@ -25013,8 +25013,10 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */ where we are now */ bool found_matches = FALSE; /* Did any name match so far? */ SV * empty; /* For matching zero length names */ - SV * must; /* What substring, if any, must be in a name - for the subpattern to match */ + SV * must_sv; /* Contains the substring, if any, that must be + in a name for the subpattern to match */ + char * must; /* The PV of 'must' */ + STRLEN must_len; /* And its length */ SV * syllable_name = NULL; /* For Hangul syllables */ const char hangul_prefix[] = "HANGUL SYLLABLE "; const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1; @@ -25079,7 +25081,17 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */ /* Compile the subpattern consisting of the name being looked for */ subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ ); - must = re_intuit_string(subpattern_re); + + must_sv = re_intuit_string(subpattern_re); + if (must_sv) { + /* regexec.c can free the re_intuit_string() return. GH #17734 */ + must_sv = sv_2mortal(newSVsv(must_sv)); + must = SvPV(must_sv, must_len); + } + else { + must = ""; + must_len = 0; + } /* (Note: 'must' could contain a NUL. And yet we use strspn() below on it. * This works because the NUL causes the function to return early, thus @@ -25095,10 +25107,7 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */ /* And match against the string of all names /gc. Don't even try if it * must match a character not found in any name. */ - if ( ! must - || SvCUR(must) == 0 - || strspn(SvPVX(must), "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") - == SvCUR(must)) + if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len) { while (execute_wildcard(subpattern_re, cur_pos, @@ -25238,9 +25247,7 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */ * one of the characters in that isn't in any Hangul syllable. */ if ( prog->minlen <= (SSize_t) syl_max_len && prog->maxlen > 0 - && ( ! must - || SvCUR(must) == 0 - || strspn(SvPVX(must), "\n ABCDEGHIJKLMNOPRSTUWY") == SvCUR(must))) + && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len)) { /* These constants, names, values, and algorithm are adapted from the * Unicode standard, version 5.1, section 3.12, and should never @@ -25335,9 +25342,7 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */ * series */ if ( prog->minlen <= (SSize_t) SvCUR(algo_name) && prog->maxlen > 0 - && ( ! must - || SvCUR(must) == 0 - || strspn(SvPVX(must), legal) == SvCUR(must))) + && (strspn(must, legal) == must_len)) { for (j = low; j <= high; j++) { /* For each code point in the series */ diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t index 41f344ac9e46..21bdb8ca15df 100644 --- a/t/re/pat_advanced.t +++ b/t/re/pat_advanced.t @@ -2553,6 +2553,15 @@ EOF {}, "Too large negative relative group number"); } + { # GH #17734, ASAN use after free + fresh_perl_like('no warnings "experimental::uniprop_wildcards"; + my $re = q<[[\p{name=/[Y-]+Z/}]]>; + eval { "\N{BYZANTINE MUSICAL SYMBOL PSILI}" + =~ /$re/ }; print $@ if $@; print "Done\n";', + qr/Done/, + {}, "GH #17734"); + } + # !!! NOTE that tests that aren't at all likely to crash perl should go # a ways above, above these last ones. There's a comment there that, like