Skip to content

Commit 2aeb63f

Browse files
author
Kenji Fukuda
committed
[1.10>master] [MERGE #5418 @kfukuda2] Fixing RegExp parsing for character classes interacting with ranges.
Merge pull request #5418 from kfukuda2:RegExpCharacterClassRangeFix Fixes #258
2 parents c567e4e + 288d7fe commit 2aeb63f

File tree

7 files changed

+190
-33
lines changed

7 files changed

+190
-33
lines changed

lib/Parser/DebugWriter.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ namespace UnifiedRegex
7272
CheckForNewline();
7373
if (c > 0xff)
7474
Output::Print(_u("\\u%lc%lc%lc%lc"), hex[c >> 12], hex[(c >> 8) & 0xf], hex[(c >> 4) & 0xf], hex[c & 0xf]);
75+
else if (c == '-')
76+
Output::Print(_u("\\x2d"));
7577
else if (c < ' ' || c > '~')
7678
Output::Print(_u("\\x%lc%lc"), hex[c >> 4], hex[c & 0xf]);
7779
else

lib/Parser/RegexParser.cpp

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1931,6 +1931,7 @@ namespace UnifiedRegex
19311931
codepoint_t pendingRangeStart = INVALID_CODEPOINT;
19321932
codepoint_t pendingRangeEnd = INVALID_CODEPOINT;
19331933
bool previousSurrogatePart = false;
1934+
19341935
while(nextChar != ']')
19351936
{
19361937
current = next;
@@ -2034,7 +2035,7 @@ namespace UnifiedRegex
20342035

20352036
lastCodepoint = INVALID_CODEPOINT;
20362037
}
2037-
// If we the next character is the end of range ']', then we can't have a surrogate pair.
2038+
// If the next character is the end of range ']', then we can't have a surrogate pair.
20382039
// The current character is the range end, if we don't already have a candidate.
20392040
else if (ECLookahead() == ']' && pendingRangeEnd == INVALID_CODEPOINT)
20402041
{
@@ -2124,6 +2125,10 @@ namespace UnifiedRegex
21242125
codepoint_t pendingRangeStart = INVALID_CODEPOINT;
21252126
EncodedChar nextChar = ECLookahead();
21262127
bool previousWasASurrogate = false;
2128+
bool currIsACharSet = false;
2129+
bool prevWasACharSetAndPartOfRange = false;
2130+
bool prevprevWasACharSetAndPartOfRange = false;
2131+
21272132
while(nextChar != ']')
21282133
{
21292134
codepoint_t codePointToSet = INVALID_CODEPOINT;
@@ -2133,6 +2138,7 @@ namespace UnifiedRegex
21332138
{
21342139
ECConsume();
21352140
}
2141+
21362142
// These if-blocks are the logical ClassAtomPass1, they weren't grouped into a method to simplify dealing with multiple out parameters.
21372143
if (containsSurrogates && this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
21382144
{
@@ -2147,22 +2153,30 @@ namespace UnifiedRegex
21472153
else if (nextChar == '\\')
21482154
{
21492155
Node* returnedNode = ClassEscapePass1(&deferredCharNode, &deferredSetNode, previousWasASurrogate);
2156+
codePointToSet = pendingCodePoint;
21502157

21512158
if (returnedNode->tag == Node::MatchSet)
21522159
{
2153-
codePointToSet = pendingCodePoint;
2154-
pendingCodePoint = INVALID_CODEPOINT;
21552160
if (pendingRangeStart != INVALID_CODEPOINT)
21562161
{
2162+
if (unicodeFlagPresent)
2163+
{
2164+
//We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
2165+
//This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
2166+
Fail(JSERR_UnicodeRegExpRangeContainsCharClass); //From #sec-patterns-static-semantics-early-errors-annexb
2167+
}
2168+
21572169
codePointSet.Set(ctAllocator, '-');
21582170
}
2171+
2172+
pendingCodePoint = INVALID_CODEPOINT;
21592173
pendingRangeStart = INVALID_CODEPOINT;
21602174
codePointSet.UnionInPlace(ctAllocator, deferredSetNode.set);
2175+
currIsACharSet = true;
21612176
}
21622177
else
21632178
{
21642179
// Just a character
2165-
codePointToSet = pendingCodePoint;
21662180
pendingCodePoint = deferredCharNode.cs[0];
21672181
}
21682182
}
@@ -2188,9 +2202,26 @@ namespace UnifiedRegex
21882202
pendingCodePoint = NextChar();
21892203
}
21902204

2191-
if (codePointToSet != INVALID_CODEPOINT)
2205+
if (codePointToSet != INVALID_CODEPOINT || prevprevWasACharSetAndPartOfRange)
21922206
{
2193-
if (pendingRangeStart != INVALID_CODEPOINT)
2207+
if (prevprevWasACharSetAndPartOfRange)
2208+
{
2209+
//We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
2210+
//This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
2211+
if (unicodeFlagPresent)
2212+
{
2213+
Fail(JSERR_UnicodeRegExpRangeContainsCharClass);
2214+
}
2215+
2216+
if (pendingCodePoint != INVALID_CODEPOINT)
2217+
{
2218+
codePointSet.Set(ctAllocator, pendingCodePoint);
2219+
}
2220+
2221+
codePointSet.Set(ctAllocator, '-'); //Add '-' to set because a range was detected but turned out to be a union of character set with '-' and another atom.
2222+
pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
2223+
}
2224+
else if (pendingRangeStart != INVALID_CODEPOINT)
21942225
{
21952226
if (pendingRangeStart > pendingCodePoint)
21962227
{
@@ -2199,6 +2230,7 @@ namespace UnifiedRegex
21992230
Assert(!unicodeFlagPresent);
22002231
Fail(JSERR_RegExpBadRange);
22012232
}
2233+
22022234
codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
22032235
pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
22042236
}
@@ -2209,6 +2241,9 @@ namespace UnifiedRegex
22092241
}
22102242

22112243
nextChar = ECLookahead();
2244+
prevprevWasACharSetAndPartOfRange = prevWasACharSetAndPartOfRange;
2245+
prevWasACharSetAndPartOfRange = currIsACharSet && nextChar == '-';
2246+
currIsACharSet = false;
22122247
}
22132248

22142249
if (pendingCodePoint != INVALID_CODEPOINT)

lib/Parser/rterrors.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ RT_ERROR_MSG(JSERR_NoAccessors, 5673, "Invalid property descriptor: accessors no
366366
RT_ERROR_MSG(JSERR_RegExpInvalidEscape, 5674, "", "Invalid regular expression: invalid escape in unicode pattern", kjstSyntaxError, 0)
367367
RT_ERROR_MSG(JSERR_RegExpTooManyCapturingGroups, 5675, "", "Regular expression cannot have more than 32,767 capturing groups", kjstRangeError, 0)
368368
RT_ERROR_MSG(JSERR_ProxyHandlerReturnedFalse, 5676, "Proxy %s handler returned false", "Proxy handler returned false", kjstTypeError, 0)
369+
RT_ERROR_MSG(JSERR_UnicodeRegExpRangeContainsCharClass, 5677, "%s", "Character classes not allowed in a RegExp class range.", kjstSyntaxError, 0)
369370

370371
//Host errors
371372
RT_ERROR_MSG(JSERR_HostMaybeMissingPromiseContinuationCallback, 5700, "", "Host may not have set any promise continuation callback. Promises may not be executed.", kjstTypeError, 0)
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
//-------------------------------------------------------------------------------------------------------
2+
// Copyright (C) Microsoft. All rights reserved.
3+
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
4+
//-------------------------------------------------------------------------------------------------------
5+
6+
WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
7+
8+
function matchRegExp(str, regexp, expectedResult)
9+
{
10+
matchResult = str.match(regexp);//regexp.test(str);
11+
errorMsg = "Expected result of match between string: '" + str + "' and regular expression: " + regexp + " to be " +
12+
expectedResult + " but was " + matchResult;
13+
14+
actualResult = matchResult == null ? null : matchResult[0];
15+
assert.areEqual(expectedResult, actualResult, errorMsg);
16+
}
17+
18+
var tests = [
19+
{
20+
name : "RegExp tests with no flags",
21+
body : function ()
22+
{
23+
let re = /[\s-a-z]/;
24+
matchRegExp("b", re, null);
25+
matchRegExp("a", re, "a");
26+
matchRegExp(" ", re, " ");
27+
matchRegExp("z", re, "z");
28+
matchRegExp("\t", re, "\t");
29+
matchRegExp("q", re, null);
30+
matchRegExp("\\", re, null);
31+
matchRegExp("\u2028", re, "\u2028");
32+
matchRegExp("\u2009", re, "\u2009");
33+
}
34+
},
35+
{
36+
name : "RegExp tests with IgnoreCase flag set",
37+
body : function ()
38+
{
39+
let reIgnoreCase = /^[\s-a-z]$/i;
40+
matchRegExp("O", reIgnoreCase, null);
41+
matchRegExp("A", reIgnoreCase, "A");
42+
matchRegExp(" ", reIgnoreCase, " ");
43+
matchRegExp("z", reIgnoreCase, "z");
44+
matchRegExp("\t", reIgnoreCase, "\t");
45+
matchRegExp("\u2028", reIgnoreCase, "\u2028");
46+
matchRegExp("\u2009", reIgnoreCase, "\u2009");
47+
}
48+
},
49+
{
50+
name : "RegExp tests with Unicode flag set",
51+
body : function ()
52+
{
53+
let reUnicode = /^[a-d]$/u;
54+
matchRegExp("a", reUnicode, "a");
55+
matchRegExp("c", reUnicode, "c");
56+
matchRegExp("d", reUnicode, "d");
57+
matchRegExp("C", reUnicode, null);
58+
matchRegExp("g", reUnicode, null);
59+
matchRegExp("\u2028", reUnicode, null);
60+
matchRegExp("\u2009", reUnicode, null);
61+
assert.throws(() => eval("/^[\\s-z]$/u.exec(\"-\")"), SyntaxError, "Expected an error due to character sets not being allowed in ranges when unicode flag is set.", "Character classes not allowed in a RegExp class range.");
62+
assert.throws(() => eval("/^[z-\\s]$/u.exec(\"-\")"), SyntaxError, "Expected an error due to character sets not being allowed in ranges when unicode flag is set.", "Character classes not allowed in a RegExp class range.");
63+
64+
}
65+
},
66+
{
67+
name : "Non-character class tests",
68+
body : function ()
69+
{
70+
let reNoCharClass = /^[a-c-z]$/;
71+
matchRegExp("b", reNoCharClass, "b");
72+
matchRegExp("-", reNoCharClass, "-");
73+
matchRegExp("z", reNoCharClass, "z");
74+
matchRegExp("y", reNoCharClass, null);
75+
}
76+
},
77+
{
78+
name : "Regression tests from bugFixRegression",
79+
body : function ()
80+
{
81+
matchRegExp(" -abc", /[\s-a-c]*/, " -a");
82+
matchRegExp(" -abc", /[\s\-a-c]*/, " -abc");
83+
matchRegExp(" -ab", /[a-\s-b]*/, " -ab");
84+
matchRegExp(" -ab", /[a\-\s\-b]*/, " -ab");
85+
assert.throws(() => eval("/^[\\s--c-!]$/.exec(\"-./0Abc!\")"), SyntaxError, "Expected an error due to 'c-!' being an invalid range.", "Invalid range in character set");
86+
}
87+
},
88+
{
89+
name : "Special character tests",
90+
body : function ()
91+
{
92+
let re = /^[\s][a\sb][\s--c-f]$/;
93+
matchRegExp(' \\', re, null);
94+
matchRegExp(' \\ ', re, null);
95+
matchRegExp('\\ ', re, null);
96+
re = /[-][\d\-]/;
97+
matchRegExp('--', re, '--');
98+
matchRegExp('-9', re, '-9');
99+
matchRegExp(' ', re, null);
100+
matchRegExp('-\\', re, null);
101+
}
102+
},
103+
{
104+
name : "Negation character set tests",
105+
body : function ()
106+
{
107+
let reNegationCharSet = /[\D-\s]+/;
108+
matchRegExp('555686', reNegationCharSet, null);
109+
matchRegExp('555-686', reNegationCharSet, '-');
110+
matchRegExp('alphabet-123', reNegationCharSet, 'alphabet-');
111+
}
112+
},
113+
{
114+
name : "Non-range tests",
115+
body : function ()
116+
{
117+
let reNonRange = /[-\w]/
118+
matchRegExp('-', reNonRange, '-');
119+
matchRegExp('g', reNonRange, 'g');
120+
matchRegExp('5', reNonRange, '5');
121+
matchRegExp(' ', reNonRange, null);
122+
matchRegExp('\t', reNonRange, null);
123+
matchRegExp('\u2028', reNonRange, null);
124+
matchRegExp('\\', reNonRange, null);
125+
126+
reNonRange = /[\w-]/
127+
matchRegExp('-', reNonRange, '-');
128+
matchRegExp('g', reNonRange, 'g');
129+
matchRegExp('5', reNonRange, '5');
130+
matchRegExp(' ', reNonRange, null);
131+
matchRegExp('\t', reNonRange, null);
132+
matchRegExp('\u2028', reNonRange, null);
133+
matchRegExp('\\', reNonRange, null);
134+
}
135+
}
136+
];
137+
138+
testRunner.runTests(tests, {
139+
verbose : WScript.Arguments[0] != "summary"
140+
});

test/Regex/rlexe.xml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,4 +229,10 @@
229229
<compile-flags>-args summary -endargs</compile-flags>
230230
</default>
231231
</test>
232+
<test>
233+
<default>
234+
<files>characterclass_with_range.js</files>
235+
<compile-flags>-args summary -endargs</compile-flags>
236+
</default>
237+
</test>
232238
</regress-exe>

test/UnifiedRegex/bugFixRegression.baseline

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -632,26 +632,6 @@ exec(/(?:a||b)?/ /*lastIndex=0*/ , "b");
632632
["b"] /*input="b", index=0*/
633633
r.lastIndex=0
634634
RegExp.${_,1,...,9}=["b","","","","","","","","",""]
635-
exec(/[\s-a-c]*/ /*lastIndex=0*/ , " -abc");
636-
[" -abc"] /*input=" -abc", index=0*/
637-
r.lastIndex=0
638-
RegExp.${_,1,...,9}=[" -abc","","","","","","","","",""]
639-
exec(/[\s\-a-c]*/ /*lastIndex=0*/ , " -abc");
640-
[" -abc"] /*input=" -abc", index=0*/
641-
r.lastIndex=0
642-
RegExp.${_,1,...,9}=[" -abc","","","","","","","","",""]
643-
exec(/[a-\s-b]*/ /*lastIndex=0*/ , " -ab");
644-
[" -ab"] /*input=" -ab", index=0*/
645-
r.lastIndex=0
646-
RegExp.${_,1,...,9}=[" -ab","","","","","","","","",""]
647-
exec(/[a\-\s\-b]*/ /*lastIndex=0*/ , " -ab");
648-
[" -ab"] /*input=" -ab", index=0*/
649-
r.lastIndex=0
650-
RegExp.${_,1,...,9}=[" -ab","","","","","","","","",""]
651-
exec(/[\s--c-!]*/ /*lastIndex=0*/ , " -./0Abc!");
652-
[" -./0Abc!"] /*input=" -./0Abc!", index=0*/
653-
r.lastIndex=0
654-
RegExp.${_,1,...,9}=[" -./0Abc!","","","","","","","","",""]
655635
EXCEPTION
656636
exec(/x*(?:(?=x(y*)+)y|\1x)/ /*lastIndex=0*/ , "xxy");
657637
["xx",undefined] /*input="xxy", index=0*/

test/UnifiedRegex/bugFixRegression.js

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -501,13 +501,6 @@ exec(/(?:a*)?/, "");
501501
exec(/(?:a+)?/, "");
502502
exec(/(?:a||b)?/, "b");
503503

504-
// WOOB1145588
505-
exec(/[\s-a-c]*/, " -abc");
506-
exec(/[\s\-a-c]*/, " -abc");
507-
exec(/[a-\s-b]*/, " -ab");
508-
exec(/[a\-\s\-b]*/, " -ab");
509-
exec(/[\s--c-!]*/, " -./0Abc!");
510-
511504
try {
512505
var r = new RegExp("[\\s-c-a]*", "");
513506
exec(r, " -abc");

0 commit comments

Comments
 (0)