Skip to content

Commit 2e2b257

Browse files
author
Karl Williamson
committed
perl #77654: quotemeta quotes non-ASCII consistently
As described in the pod changes in this commit, this changes quotemeta() to consistenly quote non-ASCII characters when used under unicode_strings. The behavior is changed for these and UTF-8 encoded strings to more closely align with Unicode's recommendations. The end result is that we *could* at some future point start using other characters as metacharacters than the 12 we do now.
1 parent adfec83 commit 2e2b257

File tree

15 files changed

+198
-44
lines changed

15 files changed

+198
-44
lines changed

embed.fnc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,7 @@ p |UV |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const bool flags
608608
#endif
609609
#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
610610
p |UV |_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s
611+
ApR |bool |_is_utf8_quotemeta|NN const U8 *p
611612
#endif
612613
Ap |UV |to_uni_lower |UV c|NN U8 *p|NN STRLEN *lenp
613614
Amp |UV |to_uni_fold |UV c|NN U8 *p|NN STRLEN *lenp

embed.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -781,6 +781,9 @@
781781
#define warn_nocontext Perl_warn_nocontext
782782
#define warner_nocontext Perl_warner_nocontext
783783
#endif
784+
#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
785+
#define _is_utf8_quotemeta(a) Perl__is_utf8_quotemeta(aTHX_ a)
786+
#endif
784787
#if defined(PERL_MAD)
785788
#define newFORM(a,b,c) Perl_newFORM(aTHX_ a,b,c)
786789
#define newMYSUB(a,b,c,d,e) Perl_newMYSUB(aTHX_ a,b,c,d,e)

embedvar.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,7 @@
378378
#define PL_utf8_perl_idstart (vTHX->Iutf8_perl_idstart)
379379
#define PL_utf8_print (vTHX->Iutf8_print)
380380
#define PL_utf8_punct (vTHX->Iutf8_punct)
381+
#define PL_utf8_quotemeta (vTHX->Iutf8_quotemeta)
381382
#define PL_utf8_space (vTHX->Iutf8_space)
382383
#define PL_utf8_tofold (vTHX->Iutf8_tofold)
383384
#define PL_utf8_tolower (vTHX->Iutf8_tolower)

intrpvar.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,7 @@ PERLVAR(I, utf8_toupper, SV *)
641641
PERLVAR(I, utf8_totitle, SV *)
642642
PERLVAR(I, utf8_tolower, SV *)
643643
PERLVAR(I, utf8_tofold, SV *)
644+
PERLVAR(I, utf8_quotemeta, SV *)
644645
PERLVAR(I, last_swash_hv, HV *)
645646
PERLVAR(I, last_swash_tmps, U8 *)
646647
PERLVAR(I, last_swash_slen, STRLEN)

lib/feature.pm

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,8 @@ L<perlunicode/The "Unicode Bug"> for details.) For this reason, if you are
145145
potentially using Unicode in your program, the
146146
C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
147147
148-
This feature is available starting with Perl 5.12, but was not fully
149-
implemented until Perl 5.14.
148+
This feature is available starting with Perl 5.12; was almost fully
149+
implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>.
150150
151151
=head2 The 'unicode_eval' and 'evalbytes' features
152152

pod/perldelta.pod

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,14 @@ cached version of it.
226226

227227
See the documentation for L<$$|perlvar/$$> for details.
228228

229+
=head2 Which Non-ASCII characters get quoted by C<quotemeta> and C<\Q> has changed
230+
231+
This is unlikely to result in a real problem, as Perl does not attach
232+
special meaning to any non-ASCII character, so it is currently
233+
irrelevant which are quoted or not. This change fixes bug [perl #77654] and
234+
bring Perl's behavior more into line with Unicode's recommendations.
235+
See L<perlfunc/quotemeta>.
236+
229237
=head1 Deprecations
230238

231239
XXX Any deprecated features, syntax, modules etc. should be listed here.
@@ -730,6 +738,16 @@ bracketed character class in a regular expression that consisted solely
730738
of a Unicode property, that property wasn't getting inverted outside the
731739
Latin1 range.
732740

741+
=item *
742+
743+
C<quotemeta> now quotes consistently the same non-ASCII characters under
744+
C<use feature 'unicode_strings'>, regardless of whether the string is
745+
encoded in UTF-8 or not, hence fixing the last vestiges (we hope) of the
746+
infamous L<perlunicode/The "Unicode Bug">. [perl #77654].
747+
748+
Which of these code points is quoted has changed, based on Unicode's
749+
recommendations. See L<perlfunc/quotemeta> for details.
750+
733751
=back
734752

735753
=head1 Known Problems

pod/perlfunc.pod

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4964,8 +4964,52 @@ input from the user, quotemeta() or C<\Q> must be used.
49644964

49654965
In Perl v5.14, all non-ASCII characters are quoted in non-UTF-8-encoded
49664966
strings, but not quoted in UTF-8 strings.
4967-
It is planned to change this behavior in v5.16, but the exact rules
4968-
haven't been determined yet.
4967+
4968+
Starting in Perl v5.16, Perl adopted a Unicode-defined strategy for
4969+
quoting non-ASCII characters; the quoting of ASCII characters is
4970+
unchanged.
4971+
4972+
Also unchanged is the quoting of non-UTF-8 strings when outside the
4973+
scope of a C<use feature 'unicode_strings'>, which is to quote all
4974+
characters in the upper Latin1 range. This provides complete backwards
4975+
compatibility for old programs which do not use Unicode. (Note that
4976+
C<unicode_strings> is automatically enabled within the scope of a
4977+
S<C<use v5.12>> or greater.)
4978+
4979+
Otherwise, Perl quotes non-ASCII characters using an adaptation from
4980+
Unicode (see L<http://www.unicode.org/reports/tr31/>.)
4981+
The only code points that are quoted are those that have any of the
4982+
Unicode properties: Pattern_Syntax, Pattern_White_Space, White_Space,
4983+
Default_Ignorable_Code_Point, or General_Category=Control.
4984+
4985+
Of these properties, the two important ones are Pattern_Syntax and
4986+
Pattern_White_Space. They have been set up by Unicode for exactly this
4987+
purpose of deciding which characters in a regular expression pattern
4988+
should be quoted. No character that can be in an identifier has these
4989+
properties.
4990+
4991+
Perl promises, that if we ever add regular expression pattern
4992+
metacharacters to the dozen already defined
4993+
(C<\ E<verbar> ( ) [ { ^ $ * + ? .>), that we will only use ones that have the
4994+
Pattern_Syntax property. Perl also promises, that if we ever add
4995+
characters that are considered to be white space in regular expressions
4996+
(currently mostly affected by C</x>), they will all have the
4997+
Pattern_White_Space property.
4998+
4999+
Unicode promises that the set of code points that have these two
5000+
properties will never change, so something that is not quoted in v5.16
5001+
will never need to be quoted in any future Perl release. (Not all the
5002+
code points that match Pattern_Syntax have actually had characters
5003+
assigned to them; so there is room to grow, but they are quoted
5004+
whether assigned or not. Perl, of course, would never use an
5005+
unassigned code point as an actual metacharacter.)
5006+
5007+
Quoting characters that have the other 3 properties is done to enhance
5008+
the readability of the regular expression and not because they actually
5009+
need to be quoted for regular expression purposes (characters with the
5010+
White_Space property are likely to be indistinguishable on the page or
5011+
screen from those with the Pattern_White_Space property; and the other
5012+
two properties contain non-printing characters).
49695013

49705014
=item rand EXPR
49715015
X<rand> X<random>

pod/perlunicode.pod

Lines changed: 34 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1371,49 +1371,69 @@ readdir, readlink
13711371

13721372
=head2 The "Unicode Bug"
13731373

1374-
The term, the "Unicode bug" has been applied to an inconsistency
1374+
The term, "Unicode bug" has been applied to an inconsistency
13751375
on ASCII platforms with the
13761376
Unicode code points in the Latin-1 Supplement block, that
13771377
is, between 128 and 255. Without a locale specified, unlike all other
13781378
characters or code points, these characters have very different semantics in
13791379
byte semantics versus character semantics, unless
1380-
C<use feature 'unicode_strings'> is specified.
1381-
(The lesson here is to specify C<unicode_strings> to avoid the
1382-
headaches.)
1380+
C<use feature 'unicode_strings'> is specified, directly or indirectly.
1381+
(It is indirectly specified by a C<use v5.12> or higher.)
13831382

1384-
In character semantics they are interpreted as Unicode code points, which means
1383+
In character semantics these upper-Latin1 characters are interpreted as
1384+
Unicode code points, which means
13851385
they have the same semantics as Latin-1 (ISO-8859-1).
13861386

1387-
In byte semantics, they are considered to be unassigned characters, meaning
1388-
that the only semantics they have is their ordinal numbers, and that they are
1387+
In byte semantics (without C<unicode_strings>), they are considered to
1388+
be unassigned characters, meaning that the only semantics they have is
1389+
their ordinal numbers, and that they are
13891390
not members of various character classes. None are considered to match C<\w>
13901391
for example, but all match C<\W>.
13911392

1392-
The behavior is known to have effects on these areas:
1393+
Perl 5.12.0 added C<unicode_strings> to force character semantics on
1394+
these code points in some circumstances, which fixed portions of the
1395+
bug; Perl 5.14.0 fixed almost all of it; and Perl 5.16.0 fixed the
1396+
remainder (so far as we know, anyway). The lesson here is to enable
1397+
C<unicode_strings> to avoid the headaches described below.
1398+
1399+
The old, problematic behavior affects these areas:
13931400

13941401
=over 4
13951402

13961403
=item *
13971404

13981405
Changing the case of a scalar, that is, using C<uc()>, C<ucfirst()>, C<lc()>,
1399-
and C<lcfirst()>, or C<\L>, C<\U>, C<\u> and C<\l> in regular expression
1400-
substitutions.
1406+
and C<lcfirst()>, or C<\L>, C<\U>, C<\u> and C<\l> in double-quotish
1407+
contexts, such as regular expression substitutions.
1408+
Under C<unicode_strings> starting in Perl 5.12.0, character semantics are
1409+
generally used. See L<perlfunc/lc> for details on how this works
1410+
in combination with various other pragmas.
14011411

14021412
=item *
14031413

1404-
Using caseless (C</i>) regular expression matching
1414+
Using caseless (C</i>) regular expression matching.
1415+
Starting in Perl 5.14.0, regular expressions compiled within
1416+
the scope of C<unicode_semantics> use character semantics
1417+
even when executed or compiled into larger
1418+
regular expressions outside the scope.
14051419

14061420
=item *
14071421

14081422
Matching any of several properties in regular expressions, namely C<\b>,
14091423
C<\B>, C<\s>, C<\S>, C<\w>, C<\W>, and all the Posix character classes
14101424
I<except> C<[[:ascii:]]>.
1425+
Starting in Perl 5.14.0, regular expressions compiled within
1426+
the scope of C<unicode_semantics> use character semantics
1427+
even when executed or compiled into larger
1428+
regular expressions outside the scope.
14111429

14121430
=item *
14131431

14141432
In C<quotemeta> or its inline equivalent C<\Q>, no code points above 127
14151433
are quoted in UTF-8 encoded strings, but in byte encoded strings, code
14161434
points between 128-255 are always quoted.
1435+
Starting in Perl 5.16.0, consistent quoting rules are used within the
1436+
scope of C<unicode_strings>, as described in L<perlfunc/quotemeta>.
14171437

14181438
=back
14191439

@@ -1442,21 +1462,9 @@ ASCII range (except in a locale), along with Perl's desire to add Unicode
14421462
support seamlessly. The result wasn't seamless: these characters were
14431463
orphaned.
14441464

1445-
Starting in Perl 5.14, C<use feature 'unicode_strings'> can be used to
1446-
cause Perl to use Unicode semantics on all string operations within the
1447-
scope of the feature subpragma. Regular expressions compiled in its
1448-
scope retain that behavior even when executed or compiled into larger
1449-
regular expressions outside the scope. (The pragma does not, however,
1450-
affect the C<quotemeta> behavior. Nor does it affect the deprecated
1451-
user-defined case changing operations--these still require a UTF-8
1452-
encoded string to operate.)
1453-
1454-
In Perl 5.12, the subpragma affected casing changes, but not regular
1455-
expressions. See L<perlfunc/lc> for details on how this pragma works in
1456-
combination with various others for casing.
1457-
1458-
For earlier Perls, or when a string is passed to a function outside the
1459-
subpragma's scope, a workaround is to always call C<utf8::upgrade($string)>,
1465+
For Perls earlier than those described above, or when a string is passed
1466+
to a function outside the subpragma's scope, a workaround is to always
1467+
call C<utf8::upgrade($string)>,
14601468
or to use the standard module L<Encode>. Also, a scalar that has any characters
14611469
whose ordinal is above 0x100, or which were specified using either of the
14621470
C<\N{...}> notations, will automatically have character semantics.

pod/perluniintro.pod

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,8 @@ problems of the initial Unicode implementation, but for example
152152
regular expressions still do not work with Unicode in 5.6.1.
153153
Perl 5.14.0 is the first release where Unicode support is
154154
(almost) seamlessly integrable without some gotchas (the exception being
155-
some differences in L<quotemeta|perlfunc/quotemeta>). To enable this
155+
some differences in L<quotemeta|perlfunc/quotemeta>, which is fixed
156+
starting in Perl 5.16.0). To enable this
156157
seamless support, you should C<use feature 'unicode_strings'> (which is
157158
automatically selected if you C<use 5.012> or higher). See L<feature>.
158159
(5.14 also fixes a number of bugs and departures from the Unicode

pp.c

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4088,24 +4088,45 @@ PP(pp_quotemeta)
40884088
d = SvPVX(TARG);
40894089
if (DO_UTF8(sv)) {
40904090
while (len) {
4091-
if (UTF8_IS_CONTINUED(*s)) {
40924091
STRLEN ulen = UTF8SKIP(s);
4092+
bool to_quote = FALSE;
4093+
4094+
if (UTF8_IS_INVARIANT(*s)) {
4095+
if (_isQUOTEMETA(*s)) {
4096+
to_quote = TRUE;
4097+
}
4098+
}
4099+
else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
4100+
if (_isQUOTEMETA(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1))))
4101+
{
4102+
to_quote = TRUE;
4103+
}
4104+
}
4105+
else if (_is_utf8_quotemeta(s)) {
4106+
to_quote = TRUE;
4107+
}
4108+
4109+
if (to_quote) {
4110+
*d++ = '\\';
4111+
}
40934112
if (ulen > len)
40944113
ulen = len;
40954114
len -= ulen;
40964115
while (ulen--)
40974116
*d++ = *s++;
4098-
}
4099-
else {
4100-
if (!isALNUM(*s))
4101-
*d++ = '\\';
4102-
*d++ = *s++;
4103-
len--;
4104-
}
41054117
}
41064118
SvUTF8_on(TARG);
41074119
}
4120+
else if (IN_UNI_8_BIT) {
4121+
while (len--) {
4122+
if (_isQUOTEMETA(*s))
4123+
*d++ = '\\';
4124+
*d++ = *s++;
4125+
}
4126+
}
41084127
else {
4128+
/* For non UNI_8_BIT (and hence in locale) just quote all \W
4129+
* including everything above ASCII */
41094130
while (len--) {
41104131
if (!isWORDCHAR_A(*s))
41114132
*d++ = '\\';

proto.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7154,6 +7154,12 @@ STATIC U8 S_to_lower_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp)
71547154

71557155
#endif
71567156
#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
7157+
PERL_CALLCONV bool Perl__is_utf8_quotemeta(pTHX_ const U8 *p)
7158+
__attribute__warn_unused_result__
7159+
__attribute__nonnull__(pTHX_1);
7160+
#define PERL_ARGS_ASSERT__IS_UTF8_QUOTEMETA \
7161+
assert(p)
7162+
71577163
PERL_CALLCONV UV Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const char S_or_s)
71587164
__attribute__nonnull__(pTHX_2)
71597165
__attribute__nonnull__(pTHX_3);

regen/feature.pl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -439,8 +439,8 @@ =head2 The 'unicode_strings' feature
439439
potentially using Unicode in your program, the
440440
C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
441441
442-
This feature is available starting with Perl 5.12, but was not fully
443-
implemented until Perl 5.14.
442+
This feature is available starting with Perl 5.12; was almost fully
443+
implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>.
444444
445445
=head2 The 'unicode_eval' and 'evalbytes' features
446446

sv.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13461,6 +13461,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
1346113461
PL_utf8_idcont = sv_dup_inc(proto_perl->Iutf8_idcont, param);
1346213462
PL_utf8_xidcont = sv_dup_inc(proto_perl->Iutf8_xidcont, param);
1346313463
PL_utf8_foldable = sv_dup_inc(proto_perl->Iutf8_foldable, param);
13464+
PL_utf8_quotemeta = sv_dup_inc(proto_perl->Iutf8_quotemeta, param);
1346413465
PL_ASCII = sv_dup_inc(proto_perl->IASCII, param);
1346513466
PL_AboveLatin1 = sv_dup_inc(proto_perl->IAboveLatin1, param);
1346613467
PL_Latin1 = sv_dup_inc(proto_perl->ILatin1, param);

t/op/quotemeta.t

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ BEGIN {
77
require "test.pl";
88
}
99

10-
plan tests => 22;
10+
plan tests => 40;
1111

1212
if ($Config{ebcdic} eq 'define') {
1313
$_ = join "", map chr($_), 129..233;
@@ -44,8 +44,45 @@ is("\Q\l\UPe*x*r\El\E*", "pE\\*X\\*Rl*", '\Q\l\UPe*x*r\El\E*');
4444
is("\U\lPerl\E\E\E\E", "pERL", '\U\lPerl\E\E\E\E');
4545
is("\l\UPerl\E\E\E\E", "pERL", '\l\UPerl\E\E\E\E');
4646

47-
is(quotemeta("\x{263a}"), "\x{263a}", "quotemeta Unicode");
48-
is(length(quotemeta("\x{263a}")), 1, "quotemeta Unicode length");
47+
is(quotemeta("\x{263a}"), "\\\x{263a}", "quotemeta Unicode quoted");
48+
is(length(quotemeta("\x{263a}")), 2, "quotemeta Unicode quoted length");
49+
is(quotemeta("\x{100}"), "\x{100}", "quotemeta Unicode nonquoted");
50+
is(length(quotemeta("\x{100}")), 1, "quotemeta Unicode nonquoted length");
51+
52+
my $char = ":";
53+
utf8::upgrade($char);
54+
is(quotemeta($char), "\\$char", "quotemeta '$char' in UTF-8");
55+
is(length(quotemeta($char)), 2, "quotemeta '$char' in UTF-8 length");
56+
57+
$char = "M";
58+
utf8::upgrade($char);
59+
is(quotemeta($char), "$char", "quotemeta '$char' in UTF-8");
60+
is(length(quotemeta($char)), 1, "quotemeta '$char' in UTF-8 length");
61+
62+
my $char = "\N{U+D7}";
63+
utf8::upgrade($char);
64+
is(quotemeta($char), "\\$char", "quotemeta '\\N{U+D7}' in UTF-8");
65+
is(length(quotemeta($char)), 2, "quotemeta '\\N{U+D7}' in UTF-8 length");
66+
67+
$char = "\N{U+D8}";
68+
utf8::upgrade($char);
69+
is(quotemeta($char), "$char", "quotemeta '\\N{U+D8}' in UTF-8");
70+
is(length(quotemeta($char)), 1, "quotemeta '\\N{U+D8}' in UTF-8 length");
71+
72+
{
73+
no feature 'unicode_strings';
74+
is(quotemeta("\x{d7}"), "\\\x{d7}", "quotemeta Latin1 no unicode_strings quoted");
75+
is(length(quotemeta("\x{d7}")), 2, "quotemeta Latin1 no unicode_strings quoted length");
76+
is(quotemeta("\x{d8}"), "\\\x{d8}", "quotemeta Latin1 no unicode_strings quoted");
77+
is(length(quotemeta("\x{d8}")), 2, "quotemeta Latin1 no unicode_strings quoted length");
78+
}
79+
{
80+
use feature 'unicode_strings';
81+
is(quotemeta("\x{d7}"), "\\\x{d7}", "quotemeta Latin1 unicode_strings quoted");
82+
is(length(quotemeta("\x{d7}")), 2, "quotemeta Latin1 unicode_strings quoted length");
83+
is(quotemeta("\x{d8}"), "\x{d8}", "quotemeta Latin1 unicode_strings nonquoted");
84+
is(length(quotemeta("\x{d8}")), 1, "quotemeta Latin1 unicode_strings nonquoted length");
85+
}
4986

5087
$a = "foo|bar";
5188
is("a\Q\Ec$a", "acfoo|bar", '\Q\E');

0 commit comments

Comments
 (0)