perl #77654: quotemeta quotes non-ASCII consistently

Karl Williamson · Karl Williamson · commit 2e2b25717dbd · 2012-02-15T18:02:35.000-07:00
As described in the pod changes in this commit, this changes quotemeta()
to consistently quote non-ASCII characters when used under
unicode_strings.  The behavior is changed for these and UTF-8 encoded
strings to more closely align with Unicode's recommendations.

The end result is that we *could* at some future point start using
characters other than the current 12 as metacharacters.
diff --git a/embed.fnc b/embed.fnc
@@ -608,6 +608,7 @@ p	|UV	|_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const bool flags
 #endif
 #if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
 p	|UV	|_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s
+ApR	|bool	|_is_utf8_quotemeta|NN const U8 *p
 #endif
 Ap	|UV	|to_uni_lower	|UV c|NN U8 *p|NN STRLEN *lenp
 Amp	|UV	|to_uni_fold	|UV c|NN U8 *p|NN STRLEN *lenp
diff --git a/embed.h b/embed.h
@@ -781,6 +781,9 @@
 #define warn_nocontext		Perl_warn_nocontext
 #define warner_nocontext	Perl_warner_nocontext
 #endif
+#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
+#define _is_utf8_quotemeta(a)	Perl__is_utf8_quotemeta(aTHX_ a)
+#endif
 #if defined(PERL_MAD)
 #define newFORM(a,b,c)		Perl_newFORM(aTHX_ a,b,c)
 #define newMYSUB(a,b,c,d,e)	Perl_newMYSUB(aTHX_ a,b,c,d,e)
diff --git a/embedvar.h b/embedvar.h
@@ -378,6 +378,7 @@
 #define PL_utf8_perl_idstart	(vTHX->Iutf8_perl_idstart)
 #define PL_utf8_print		(vTHX->Iutf8_print)
 #define PL_utf8_punct		(vTHX->Iutf8_punct)
+#define PL_utf8_quotemeta	(vTHX->Iutf8_quotemeta)
 #define PL_utf8_space		(vTHX->Iutf8_space)
 #define PL_utf8_tofold		(vTHX->Iutf8_tofold)
 #define PL_utf8_tolower		(vTHX->Iutf8_tolower)
diff --git a/intrpvar.h b/intrpvar.h
@@ -641,6 +641,7 @@ PERLVAR(I, utf8_toupper, SV *)
 PERLVAR(I, utf8_totitle, SV *)
 PERLVAR(I, utf8_tolower, SV *)
 PERLVAR(I, utf8_tofold,	SV *)
+PERLVAR(I, utf8_quotemeta, SV *)
 PERLVAR(I, last_swash_hv, HV *)
 PERLVAR(I, last_swash_tmps, U8 *)
 PERLVAR(I, last_swash_slen, STRLEN)
diff --git a/lib/feature.pm b/lib/feature.pm
@@ -145,8 +145,8 @@ L<perlunicode/The "Unicode Bug"> for details.)  For this reason, if you are
 potentially using Unicode in your program, the
 C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
 
-This feature is available starting with Perl 5.12, but was not fully
-implemented until Perl 5.14.
+This feature is available starting with Perl 5.12; was almost fully
+implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>.
 
 =head2 The 'unicode_eval' and 'evalbytes' features
 
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
@@ -226,6 +226,14 @@ cached version of it.
 
 See the documentation for L<$$|perlvar/$$> for details.
 
+=head2 Which Non-ASCII characters get quoted by C<quotemeta> and C<\Q> has changed
+
+This is unlikely to result in a real problem, as Perl does not attach
+special meaning to any non-ASCII character, so it is currently
+irrelevant which are quoted or not.  This change fixes bug [perl #77654] and
+brings Perl's behavior more into line with Unicode's recommendations.
+See L<perlfunc/quotemeta>.
+
 =head1 Deprecations
 
 XXX Any deprecated features, syntax, modules etc. should be listed here.
@@ -730,6 +738,16 @@ bracketed character class in a regular expression that consisted solely
 of a Unicode property, that property wasn't getting inverted outside the
 Latin1 range.
 
+=item *
+
+C<quotemeta> now quotes consistently the same non-ASCII characters under
+C<use feature 'unicode_strings'>, regardless of whether the string is
+encoded in UTF-8 or not, hence fixing the last vestiges (we hope) of the
+infamous L<perlunicode/The "Unicode Bug">.  [perl #77654].
+
+Which of these code points is quoted has changed, based on Unicode's
+recommendations.  See L<perlfunc/quotemeta> for details.
+
 =back
 
 =head1 Known Problems
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod
@@ -4964,8 +4964,52 @@ input from the user, quotemeta() or C<\Q> must be used.
 
 In Perl v5.14, all non-ASCII characters are quoted in non-UTF-8-encoded
 strings, but not quoted in UTF-8 strings.
-It is planned to change this behavior in v5.16, but the exact rules
-haven't been determined yet.
+
+Starting in Perl v5.16, Perl adopted a Unicode-defined strategy for
+quoting non-ASCII characters; the quoting of ASCII characters is
+unchanged.
+
+Also unchanged is the quoting of non-UTF-8 strings when outside the
+scope of a C<use feature 'unicode_strings'>, which is to quote all
+characters in the upper Latin1 range.  This provides complete backwards
+compatibility for old programs which do not use Unicode.  (Note that
+C<unicode_strings> is automatically enabled within the scope of a
+S<C<use v5.12>> or greater.)
+
+Otherwise, Perl quotes non-ASCII characters using an adaptation from
+Unicode (see L<http://www.unicode.org/reports/tr31/>).
+The only code points that are quoted are those that have any of the
+Unicode properties:  Pattern_Syntax, Pattern_White_Space, White_Space,
+Default_Ignorable_Code_Point, or General_Category=Control.
+
+Of these properties, the two important ones are Pattern_Syntax and
+Pattern_White_Space.  They have been set up by Unicode for exactly this
+purpose of deciding which characters in a regular expression pattern
+should be quoted.  No character that can be in an identifier has these
+properties.
+
+Perl promises that if we ever add regular expression pattern
+metacharacters to the dozen already defined
+(C<\ E<verbar> ( ) [ { ^ $ * + ? .>), we will only use ones that have the
+Pattern_Syntax property.  Perl also promises that if we ever add
+characters that are considered to be white space in regular expressions
+(currently mostly affected by C</x>), they will all have the
+Pattern_White_Space property.
+
+Unicode promises that the set of code points that have these two
+properties will never change, so something that is not quoted in v5.16
+will never need to be quoted in any future Perl release.  (Not all the
+code points that match Pattern_Syntax have actually had characters
+assigned to them; so there is room to grow, but they are quoted
+whether assigned or not.  Perl, of course, would never use an
+unassigned code point as an actual metacharacter.)
+
+Quoting characters that have the other 3 properties is done to enhance
+the readability of the regular expression and not because they actually
+need to be quoted for regular expression purposes (characters with the
+White_Space property are likely to be indistinguishable on the page or
+screen from those with the Pattern_White_Space property; and the other
+two properties contain non-printing characters).
 
 =item rand EXPR
 X<rand> X<random>
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
@@ -1371,49 +1371,69 @@ readdir, readlink
 
 =head2 The "Unicode Bug"
 
-The term, the "Unicode bug" has been applied to an inconsistency
+The term "Unicode bug" has been applied to an inconsistency
 on ASCII platforms with the
 Unicode code points in the Latin-1 Supplement block, that
 is, between 128 and 255.  Without a locale specified, unlike all other
 characters or code points, these characters have very different semantics in
 byte semantics versus character semantics, unless
-C<use feature 'unicode_strings'> is specified.
-(The lesson here is to specify C<unicode_strings> to avoid the
-headaches.)
+C<use feature 'unicode_strings'> is specified, directly or indirectly.
+(It is indirectly specified by a C<use v5.12> or higher.)
 
-In character semantics they are interpreted as Unicode code points, which means
+In character semantics these upper-Latin1 characters are interpreted as
+Unicode code points, which means
 they have the same semantics as Latin-1 (ISO-8859-1).
 
-In byte semantics, they are considered to be unassigned characters, meaning
-that the only semantics they have is their ordinal numbers, and that they are
+In byte semantics (without C<unicode_strings>), they are considered to
+be unassigned characters, meaning that the only semantics they have is
+their ordinal numbers, and that they are
 not members of various character classes.  None are considered to match C<\w>
 for example, but all match C<\W>.
 
-The behavior is known to have effects on these areas:
+Perl 5.12.0 added C<unicode_strings> to force character semantics on
+these code points in some circumstances, which fixed portions of the
+bug; Perl 5.14.0 fixed almost all of it; and Perl 5.16.0 fixed the
+remainder (so far as we know, anyway).  The lesson here is to enable
+C<unicode_strings> to avoid the headaches described below.
+
+The old, problematic behavior affects these areas:
 
 =over 4
 
 =item *
 
 Changing the case of a scalar, that is, using C<uc()>, C<ucfirst()>, C<lc()>,
-and C<lcfirst()>, or C<\L>, C<\U>, C<\u> and C<\l> in regular expression
-substitutions.
+and C<lcfirst()>, or C<\L>, C<\U>, C<\u> and C<\l> in double-quotish
+contexts, such as regular expression substitutions.
+Under C<unicode_strings> starting in Perl 5.12.0, character semantics are
+generally used.  See L<perlfunc/lc> for details on how this works
+in combination with various other pragmas.
 
 =item *
 
-Using caseless (C</i>) regular expression matching
+Using caseless (C</i>) regular expression matching.
+Starting in Perl 5.14.0, regular expressions compiled within
+the scope of C<unicode_strings> use character semantics
+even when executed or compiled into larger
+regular expressions outside the scope.
 
 =item *
 
 Matching any of several properties in regular expressions, namely C<\b>,
 C<\B>, C<\s>, C<\S>, C<\w>, C<\W>, and all the Posix character classes
 I<except> C<[[:ascii:]]>.
+Starting in Perl 5.14.0, regular expressions compiled within
+the scope of C<unicode_strings> use character semantics
+even when executed or compiled into larger
+regular expressions outside the scope.
 
 =item *
 
 In C<quotemeta> or its inline equivalent C<\Q>, no code points above 127
 are quoted in UTF-8 encoded strings, but in byte encoded strings, code
 points between 128-255 are always quoted.
+Starting in Perl 5.16.0, consistent quoting rules are used within the
+scope of C<unicode_strings>, as described in L<perlfunc/quotemeta>.
 
 =back
 
@@ -1442,21 +1462,9 @@ ASCII range (except in a locale), along with Perl's desire to add Unicode
 support seamlessly.  The result wasn't seamless: these characters were
 orphaned.
 
-Starting in Perl 5.14, C<use feature 'unicode_strings'> can be used to
-cause Perl to use Unicode semantics on all string operations within the
-scope of the feature subpragma.  Regular expressions compiled in its
-scope retain that behavior even when executed or compiled into larger
-regular expressions outside the scope.  (The pragma does not, however,
-affect the C<quotemeta> behavior.  Nor does it affect the deprecated
-user-defined case changing operations--these still require a UTF-8
-encoded string to operate.)
-
-In Perl 5.12, the subpragma affected casing changes, but not regular
-expressions.  See L<perlfunc/lc> for details on how this pragma works in
-combination with various others for casing.
-
-For earlier Perls, or when a string is passed to a function outside the
-subpragma's scope, a workaround is to always call C<utf8::upgrade($string)>,
+For Perls earlier than those described above, or when a string is passed
+to a function outside the subpragma's scope, a workaround is to always
+call C<utf8::upgrade($string)>,
 or to use the standard module L<Encode>.   Also, a scalar that has any characters
 whose ordinal is above 0x100, or which were specified using either of the
 C<\N{...}> notations, will automatically have character semantics.
diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod
@@ -152,7 +152,8 @@ problems of the initial Unicode implementation, but for example
 regular expressions still do not work with Unicode in 5.6.1.
 Perl 5.14.0 is the first release where Unicode support is
 (almost) seamlessly integrable without some gotchas (the exception being
-some differences in L<quotemeta|perlfunc/quotemeta>).   To enable this
+some differences in L<quotemeta|perlfunc/quotemeta>, which were fixed
+starting in Perl 5.16.0).   To enable this
 seamless support, you should C<use feature 'unicode_strings'> (which is
 automatically selected if you C<use 5.012> or higher).  See L<feature>.
 (5.14 also fixes a number of bugs and departures from the Unicode
diff --git a/pp.c b/pp.c
@@ -4088,24 +4088,45 @@ PP(pp_quotemeta)
 	d = SvPVX(TARG);
 	if (DO_UTF8(sv)) {
 	    while (len) {
-		if (UTF8_IS_CONTINUED(*s)) {
 		    STRLEN ulen = UTF8SKIP(s);
+		bool to_quote = FALSE;
+
+		if (UTF8_IS_INVARIANT(*s)) {
+		    if (_isQUOTEMETA(*s)) {
+			to_quote = TRUE;
+		    }
+		}
+		else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+		    if (_isQUOTEMETA(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1))))
+		    {
+			to_quote = TRUE;
+		    }
+		}
+		else if (_is_utf8_quotemeta(s)) {
+		    to_quote = TRUE;
+		}
+
+		if (to_quote) {
+		    *d++ = '\\';
+		}
 		    if (ulen > len)
 			ulen = len;
 		    len -= ulen;
 		    while (ulen--)
 			*d++ = *s++;
-		}
-		else {
-		    if (!isALNUM(*s))
-			*d++ = '\\';
-		    *d++ = *s++;
-		    len--;
-		}
 	    }
 	    SvUTF8_on(TARG);
 	}
+	else if (IN_UNI_8_BIT) {
+	    while (len--) {
+		if (_isQUOTEMETA(*s))
+		    *d++ = '\\';
+		*d++ = *s++;
+	    }
+	}
 	else {
+	    /* For non UNI_8_BIT (and hence in locale) just quote all \W
+	     * including everything above ASCII */
 	    while (len--) {
 		if (!isWORDCHAR_A(*s))
 		    *d++ = '\\';
diff --git a/proto.h b/proto.h
@@ -7154,6 +7154,12 @@ STATIC U8	S_to_lower_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp)
 
 #endif
 #if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
+PERL_CALLCONV bool	Perl__is_utf8_quotemeta(pTHX_ const U8 *p)
+			__attribute__warn_unused_result__
+			__attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT__IS_UTF8_QUOTEMETA	\
+	assert(p)
+
 PERL_CALLCONV UV	Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const char S_or_s)
 			__attribute__nonnull__(pTHX_2)
 			__attribute__nonnull__(pTHX_3);
diff --git a/regen/feature.pl b/regen/feature.pl
@@ -439,8 +439,8 @@ =head2 The 'unicode_strings' feature
 potentially using Unicode in your program, the
 C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
 
-This feature is available starting with Perl 5.12, but was not fully
-implemented until Perl 5.14.
+This feature is available starting with Perl 5.12; was almost fully
+implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>.
 
 =head2 The 'unicode_eval' and 'evalbytes' features
 
diff --git a/sv.c b/sv.c
@@ -13461,6 +13461,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
     PL_utf8_idcont	= sv_dup_inc(proto_perl->Iutf8_idcont, param);
     PL_utf8_xidcont	= sv_dup_inc(proto_perl->Iutf8_xidcont, param);
     PL_utf8_foldable	= sv_dup_inc(proto_perl->Iutf8_foldable, param);
+    PL_utf8_quotemeta	= sv_dup_inc(proto_perl->Iutf8_quotemeta, param);
     PL_ASCII		= sv_dup_inc(proto_perl->IASCII, param);
     PL_AboveLatin1	= sv_dup_inc(proto_perl->IAboveLatin1, param);
     PL_Latin1		= sv_dup_inc(proto_perl->ILatin1, param);
diff --git a/t/op/quotemeta.t b/t/op/quotemeta.t
@@ -7,7 +7,7 @@ BEGIN {
     require "test.pl";
 }
 
-plan tests => 22;
+plan tests => 40;
 
 if ($Config{ebcdic} eq 'define') {
     $_ = join "", map chr($_), 129..233;
@@ -44,8 +44,45 @@ is("\Q\l\UPe*x*r\El\E*", "pE\\*X\\*Rl*", '\Q\l\UPe*x*r\El\E*');
 is("\U\lPerl\E\E\E\E", "pERL", '\U\lPerl\E\E\E\E');
 is("\l\UPerl\E\E\E\E", "pERL", '\l\UPerl\E\E\E\E');
 
-is(quotemeta("\x{263a}"), "\x{263a}", "quotemeta Unicode");
-is(length(quotemeta("\x{263a}")), 1, "quotemeta Unicode length");
+is(quotemeta("\x{263a}"), "\\\x{263a}", "quotemeta Unicode quoted");
+is(length(quotemeta("\x{263a}")), 2, "quotemeta Unicode quoted length");
+is(quotemeta("\x{100}"), "\x{100}", "quotemeta Unicode nonquoted");
+is(length(quotemeta("\x{100}")), 1, "quotemeta Unicode nonquoted length");
+
+my $char = ":";    # ASCII punctuation: expected to be quoted when UTF-8 encoded
+utf8::upgrade($char);
+is(quotemeta($char), "\\$char", "quotemeta '$char' in UTF-8");
+is(length(quotemeta($char)), 2, "quotemeta '$char'  in UTF-8 length");
+
+$char = "M";    # ASCII word character: expected to be left unquoted
+utf8::upgrade($char);
+is(quotemeta($char), "$char", "quotemeta '$char' in UTF-8");
+is(length(quotemeta($char)), 1, "quotemeta '$char'  in UTF-8 length");
+
+$char = "\N{U+D7}";    # reuse $char (a second "my" would mask the one above); expected quoted
+utf8::upgrade($char);
+is(quotemeta($char), "\\$char", "quotemeta '\\N{U+D7}' in UTF-8");
+is(length(quotemeta($char)), 2, "quotemeta '\\N{U+D7}'  in UTF-8 length");
+
+$char = "\N{U+D8}";    # expected to be left unquoted
+utf8::upgrade($char);
+is(quotemeta($char), "$char", "quotemeta '\\N{U+D8}' in UTF-8");
+is(length(quotemeta($char)), 1, "quotemeta '\\N{U+D8}'  in UTF-8 length");
+
+{
+    no feature 'unicode_strings';
+    is(quotemeta("\x{d7}"), "\\\x{d7}", "quotemeta Latin1 no unicode_strings quoted");
+    is(length(quotemeta("\x{d7}")), 2, "quotemeta Latin1 no unicode_strings quoted length");
+    is(quotemeta("\x{d8}"), "\\\x{d8}", "quotemeta Latin1 no unicode_strings quoted");
+    is(length(quotemeta("\x{d8}")), 2, "quotemeta Latin1 no unicode_strings quoted length");
+}
+{
+    use feature 'unicode_strings';
+    is(quotemeta("\x{d7}"), "\\\x{d7}", "quotemeta Latin1 unicode_strings quoted");
+    is(length(quotemeta("\x{d7}")), 2, "quotemeta Latin1 unicode_strings quoted length");
+    is(quotemeta("\x{d8}"), "\x{d8}", "quotemeta Latin1 unicode_strings nonquoted");
+    is(length(quotemeta("\x{d8}")), 1, "quotemeta Latin1 unicode_strings nonquoted length");
+}
 
 $a = "foo|bar";
 is("a\Q\Ec$a", "acfoo|bar", '\Q\E');
diff --git a/utf8.c b/utf8.c