[1.10>master] [MERGE #5418 @kfukuda2] Fixing RegExp parsing for character classes interacting with ranges.

Kenji Fukuda · Kenji Fukuda · commit 2aeb63fd0977 · 2018-07-10T14:51:23.000-07:00
Merge pull request #5418 from kfukuda2:RegExpCharacterClassRangeFix Fixes #258
diff --git a/lib/Parser/DebugWriter.cpp b/lib/Parser/DebugWriter.cpp
@@ -72,6 +72,8 @@ namespace UnifiedRegex
         CheckForNewline();
         if (c > 0xff)
             Output::Print(_u("\\u%lc%lc%lc%lc"), hex[c >> 12], hex[(c >> 8) & 0xf], hex[(c >> 4) & 0xf], hex[c & 0xf]);
+        else if (c == '-')
+            Output::Print(_u("\\x2d"));
         else if (c < ' ' || c > '~')
             Output::Print(_u("\\x%lc%lc"), hex[c >> 4], hex[c & 0xf]);
         else
diff --git a/lib/Parser/RegexParser.cpp b/lib/Parser/RegexParser.cpp
@@ -1931,6 +1931,7 @@ namespace UnifiedRegex
         codepoint_t pendingRangeStart = INVALID_CODEPOINT;
         codepoint_t pendingRangeEnd = INVALID_CODEPOINT;
         bool previousSurrogatePart = false;
+
         while(nextChar != ']')
         {
             current = next;
@@ -2034,7 +2035,7 @@ namespace UnifiedRegex
 
                     lastCodepoint = INVALID_CODEPOINT;
                 }
-                // If we the next character is the end of range ']', then we can't have a surrogate pair.
+                // If the next character is the end of range ']', then we can't have a surrogate pair.
                 // The current character is the range end, if we don't already have a candidate.
                 else if (ECLookahead() == ']' && pendingRangeEnd == INVALID_CODEPOINT)
                 {
@@ -2124,6 +2125,10 @@ namespace UnifiedRegex
         codepoint_t pendingRangeStart = INVALID_CODEPOINT;
         EncodedChar nextChar = ECLookahead();
         bool previousWasASurrogate = false;
+        bool currIsACharSet = false;
+        bool prevWasACharSetAndPartOfRange = false;
+        bool prevprevWasACharSetAndPartOfRange = false;
+
         while(nextChar != ']')
         {
             codepoint_t codePointToSet = INVALID_CODEPOINT;
@@ -2133,6 +2138,7 @@ namespace UnifiedRegex
             {
                 ECConsume();
             }
+
             // These if-blocks are the logical ClassAtomPass1, they weren't grouped into a method to simplify dealing with multiple out parameters.
             if (containsSurrogates && this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
             {
@@ -2147,22 +2153,30 @@ namespace UnifiedRegex
             else if (nextChar == '\\')
             {
                 Node* returnedNode = ClassEscapePass1(&deferredCharNode, &deferredSetNode, previousWasASurrogate);
+                codePointToSet = pendingCodePoint;
 
                 if (returnedNode->tag == Node::MatchSet)
                 {
-                    codePointToSet = pendingCodePoint;
-                    pendingCodePoint = INVALID_CODEPOINT;
                     if (pendingRangeStart != INVALID_CODEPOINT)
                     {
+                        if (unicodeFlagPresent)
+                        {
+                            //We have a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
+                            //This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
+                            Fail(JSERR_UnicodeRegExpRangeContainsCharClass); //From #sec-patterns-static-semantics-early-errors-annexb
+                        }
+
                         codePointSet.Set(ctAllocator, '-');
                     }
+
+                    pendingCodePoint = INVALID_CODEPOINT;
                     pendingRangeStart = INVALID_CODEPOINT;
                     codePointSet.UnionInPlace(ctAllocator, deferredSetNode.set);
+                    currIsACharSet = true;
                 }
                 else
                 {
                     // Just a character
-                    codePointToSet = pendingCodePoint;
                     pendingCodePoint = deferredCharNode.cs[0];
                 }
             }
@@ -2188,9 +2202,26 @@ namespace UnifiedRegex
                 pendingCodePoint = NextChar();
             }
 
-            if (codePointToSet != INVALID_CODEPOINT)
+            if (codePointToSet != INVALID_CODEPOINT || prevprevWasACharSetAndPartOfRange)
             {
-                if (pendingRangeStart != INVALID_CODEPOINT)
+                if (prevprevWasACharSetAndPartOfRange)
+                {
+                    //We have a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
+                    //This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
+                    if (unicodeFlagPresent)
+                    {
+                        Fail(JSERR_UnicodeRegExpRangeContainsCharClass);
+                    }
+
+                    if (pendingCodePoint != INVALID_CODEPOINT)
+                    {
+                        codePointSet.Set(ctAllocator, pendingCodePoint);
+                    }
+
+                    codePointSet.Set(ctAllocator, '-'); //Add '-' to set because a range was detected but turned out to be a union of character set with '-' and another atom.
+                    pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
+                }
+                else if (pendingRangeStart != INVALID_CODEPOINT)
                 {
                     if (pendingRangeStart > pendingCodePoint)
                     {
@@ -2199,6 +2230,7 @@ namespace UnifiedRegex
                         Assert(!unicodeFlagPresent);
                         Fail(JSERR_RegExpBadRange);
                     }
+                    
                     codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
                     pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
                 }
@@ -2209,6 +2241,9 @@ namespace UnifiedRegex
             }
 
             nextChar = ECLookahead();
+            prevprevWasACharSetAndPartOfRange = prevWasACharSetAndPartOfRange;
+            prevWasACharSetAndPartOfRange = currIsACharSet && nextChar == '-';
+            currIsACharSet = false;
         }
 
         if (pendingCodePoint != INVALID_CODEPOINT)
diff --git a/lib/Parser/rterrors.h b/lib/Parser/rterrors.h
@@ -366,6 +366,7 @@ RT_ERROR_MSG(JSERR_NoAccessors, 5673, "Invalid property descriptor: accessors no
 RT_ERROR_MSG(JSERR_RegExpInvalidEscape, 5674, "", "Invalid regular expression: invalid escape in unicode pattern", kjstSyntaxError, 0)
 RT_ERROR_MSG(JSERR_RegExpTooManyCapturingGroups, 5675, "", "Regular expression cannot have more than 32,767 capturing groups", kjstRangeError, 0)
 RT_ERROR_MSG(JSERR_ProxyHandlerReturnedFalse, 5676, "Proxy %s handler returned false", "Proxy handler returned false", kjstTypeError, 0)
+RT_ERROR_MSG(JSERR_UnicodeRegExpRangeContainsCharClass, 5677, "%s", "Character classes not allowed in a RegExp class range.", kjstSyntaxError, 0)
 
 //Host errors
 RT_ERROR_MSG(JSERR_HostMaybeMissingPromiseContinuationCallback, 5700, "", "Host may not have set any promise continuation callback. Promises may not be executed.", kjstTypeError, 0)
diff --git a/test/Regex/characterclass_with_range.js b/test/Regex/characterclass_with_range.js
@@ -0,0 +1,140 @@
+//-------------------------------------------------------------------------------------------------------
+// Copyright (C) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
+//-------------------------------------------------------------------------------------------------------
+
+WScript.LoadScriptFile("..\\UnitTestFramework\\UnitTestFramework.js");
+
+function matchRegExp(str, regexp, expectedResult)
+{
+    matchResult = str.match(regexp);//regexp.test(str);
+    errorMsg = "Expected result of match between string: '" + str + "' and regular expression: " + regexp + " to be " + 
+                    expectedResult + " but was " + matchResult;
+
+    actualResult = matchResult == null ? null : matchResult[0];
+    assert.areEqual(expectedResult, actualResult, errorMsg); 
+}
+
+var tests = [
+    {
+        name : "RegExp tests with no flags",
+        body : function () 
+        {
+            let re = /[\s-a-z]/;
+            matchRegExp("b", re, null);
+            matchRegExp("a", re, "a");
+            matchRegExp(" ", re, " ");
+            matchRegExp("z", re, "z");
+            matchRegExp("\t", re, "\t");
+            matchRegExp("q", re, null);
+            matchRegExp("\\", re, null);
+            matchRegExp("\u2028", re, "\u2028");
+            matchRegExp("\u2009", re, "\u2009");
+        }
+    },
+    {
+        name : "RegExp tests with IgnoreCase flag set",
+        body : function () 
+        {
+            let reIgnoreCase = /^[\s-a-z]$/i;
+            matchRegExp("O", reIgnoreCase, null);
+            matchRegExp("A", reIgnoreCase, "A");
+            matchRegExp(" ", reIgnoreCase, " ");
+            matchRegExp("z", reIgnoreCase, "z");
+            matchRegExp("\t", reIgnoreCase, "\t");
+            matchRegExp("\u2028", reIgnoreCase, "\u2028");
+            matchRegExp("\u2009", reIgnoreCase, "\u2009");
+        }
+    },
+    {
+        name : "RegExp tests with Unicode flag set",
+        body : function () 
+        {
+            let reUnicode = /^[a-d]$/u;
+            matchRegExp("a", reUnicode, "a");
+            matchRegExp("c", reUnicode, "c");
+            matchRegExp("d", reUnicode, "d");
+            matchRegExp("C", reUnicode, null);
+            matchRegExp("g", reUnicode, null);
+            matchRegExp("\u2028", reUnicode, null);
+            matchRegExp("\u2009", reUnicode, null);
+            assert.throws(() => eval("/^[\\s-z]$/u.exec(\"-\")"), SyntaxError, "Expected an error due to character sets not being allowed in ranges when unicode flag is set.", "Character classes not allowed in a RegExp class range.");
+            assert.throws(() => eval("/^[z-\\s]$/u.exec(\"-\")"), SyntaxError, "Expected an error due to character sets not being allowed in ranges when unicode flag is set.", "Character classes not allowed in a RegExp class range.");
+        
+        }
+    },
+    {
+        name : "Non-character class tests",
+        body : function () 
+        {
+            let reNoCharClass = /^[a-c-z]$/;
+            matchRegExp("b", reNoCharClass, "b");
+            matchRegExp("-", reNoCharClass, "-");
+            matchRegExp("z", reNoCharClass, "z");
+            matchRegExp("y", reNoCharClass, null);
+        }
+    },
+    {
+        name : "Regression tests from bugFixRegression",
+        body : function () 
+        {
+            matchRegExp(" -abc", /[\s-a-c]*/, " -a");
+            matchRegExp(" -abc", /[\s\-a-c]*/, " -abc");
+            matchRegExp(" -ab", /[a-\s-b]*/, " -ab");
+            matchRegExp(" -ab", /[a\-\s\-b]*/, " -ab");
+            assert.throws(() => eval("/^[\\s--c-!]$/.exec(\"-./0Abc!\")"), SyntaxError, "Expected an error due to 'c-!' being an invalid range.", "Invalid range in character set");
+        }
+    },
+    {
+        name : "Special character tests",
+        body : function () 
+        {
+                let re = /^[\s][a\sb][\s--c-f]$/;
+                matchRegExp('  \\', re, null);
+                matchRegExp(' \\ ', re, null);
+                matchRegExp('\\  ', re, null);
+                re = /[-][\d\-]/;
+                matchRegExp('--', re, '--');
+                matchRegExp('-9', re, '-9');
+                matchRegExp('  ', re, null);
+                matchRegExp('-\\', re, null);
+        }
+    },
+    {
+        name : "Negation character set tests",
+        body : function () 
+        {
+                let reNegationCharSet = /[\D-\s]+/;
+                matchRegExp('555686', reNegationCharSet, null);
+                matchRegExp('555-686', reNegationCharSet, '-');
+                matchRegExp('alphabet-123', reNegationCharSet, 'alphabet-');
+        }
+    },
+    {
+        name : "Non-range tests",
+        body : function () 
+        {
+                let reNonRange = /[-\w]/
+                matchRegExp('-', reNonRange, '-');
+                matchRegExp('g', reNonRange, 'g');
+                matchRegExp('5', reNonRange, '5');
+                matchRegExp(' ', reNonRange, null);
+                matchRegExp('\t', reNonRange, null);
+                matchRegExp('\u2028', reNonRange, null);
+                matchRegExp('\\', reNonRange, null);
+                
+                reNonRange = /[\w-]/
+                matchRegExp('-', reNonRange, '-');
+                matchRegExp('g', reNonRange, 'g');
+                matchRegExp('5', reNonRange, '5');
+                matchRegExp(' ', reNonRange, null);
+                matchRegExp('\t', reNonRange, null);
+                matchRegExp('\u2028', reNonRange, null);
+                matchRegExp('\\', reNonRange, null); 
+        }
+    }
+];
+
+testRunner.runTests(tests, {
+    verbose : WScript.Arguments[0] != "summary"
+});
diff --git a/test/Regex/rlexe.xml b/test/Regex/rlexe.xml
@@ -229,4 +229,10 @@
       <compile-flags>-args summary -endargs</compile-flags>
     </default>
   </test>
+    <test>
+    <default>
+      <files>characterclass_with_range.js</files>
+      <compile-flags>-args summary -endargs</compile-flags>
+    </default>
+  </test>
 </regress-exe>
diff --git a/test/UnifiedRegex/bugFixRegression.baseline b/test/UnifiedRegex/bugFixRegression.baseline
@@ -632,26 +632,6 @@ exec(/(?:a||b)?/ /*lastIndex=0*/ , "b");
 ["b"] /*input="b", index=0*/ 
 r.lastIndex=0
 RegExp.${_,1,...,9}=["b","","","","","","","","",""]
-exec(/[\s-a-c]*/ /*lastIndex=0*/ , " -abc");
-[" -abc"] /*input=" -abc", index=0*/ 
-r.lastIndex=0
-RegExp.${_,1,...,9}=[" -abc","","","","","","","","",""]
-exec(/[\s\-a-c]*/ /*lastIndex=0*/ , " -abc");
-[" -abc"] /*input=" -abc", index=0*/ 
-r.lastIndex=0
-RegExp.${_,1,...,9}=[" -abc","","","","","","","","",""]
-exec(/[a-\s-b]*/ /*lastIndex=0*/ , " -ab");
-[" -ab"] /*input=" -ab", index=0*/ 
-r.lastIndex=0
-RegExp.${_,1,...,9}=[" -ab","","","","","","","","",""]
-exec(/[a\-\s\-b]*/ /*lastIndex=0*/ , " -ab");
-[" -ab"] /*input=" -ab", index=0*/ 
-r.lastIndex=0
-RegExp.${_,1,...,9}=[" -ab","","","","","","","","",""]
-exec(/[\s--c-!]*/ /*lastIndex=0*/ , " -./0Abc!");
-[" -./0Abc!"] /*input=" -./0Abc!", index=0*/ 
-r.lastIndex=0
-RegExp.${_,1,...,9}=[" -./0Abc!","","","","","","","","",""]
 EXCEPTION
 exec(/x*(?:(?=x(y*)+)y|\1x)/ /*lastIndex=0*/ , "xxy");
 ["xx",undefined] /*input="xxy", index=0*/ 
diff --git a/test/UnifiedRegex/bugFixRegression.js b/test/UnifiedRegex/bugFixRegression.js
@@ -501,13 +501,6 @@ exec(/(?:a*)?/, "");
 exec(/(?:a+)?/, "");
 exec(/(?:a||b)?/, "b");
 
-// WOOB1145588
-exec(/[\s-a-c]*/, " -abc");
-exec(/[\s\-a-c]*/, " -abc");
-exec(/[a-\s-b]*/, " -ab");
-exec(/[a\-\s\-b]*/, " -ab");
-exec(/[\s--c-!]*/, " -./0Abc!");
-
 try {
     var r = new RegExp("[\\s-c-a]*", "");
     exec(r, " -abc");

Original file line number	Diff line number	Diff line change
`@@ -1931,6 +1931,7 @@ namespace UnifiedRegex`
`1931`	`1931`	`codepoint_t pendingRangeStart = INVALID_CODEPOINT;`
`1932`	`1932`	`codepoint_t pendingRangeEnd = INVALID_CODEPOINT;`
`1933`	`1933`	`bool previousSurrogatePart = false;`
	`1934`	`+`
`1934`	`1935`	`while(nextChar != ']')`
`1935`	`1936`	`{`
`1936`	`1937`	`current = next;`
`@@ -2034,7 +2035,7 @@ namespace UnifiedRegex`
`2034`	`2035`
`2035`	`2036`	`lastCodepoint = INVALID_CODEPOINT;`
`2036`	`2037`	`}`
`2037`		`- // If we the next character is the end of range ']', then we can't have a surrogate pair.`
	`2038`	`+ // If the next character is the end of range ']', then we can't have a surrogate pair.`
`2038`	`2039`	`// The current character is the range end, if we don't already have a candidate.`
`2039`	`2040`	`else if (ECLookahead() == ']' && pendingRangeEnd == INVALID_CODEPOINT)`
`2040`	`2041`	`{`
`@@ -2124,6 +2125,10 @@ namespace UnifiedRegex`
`2124`	`2125`	`codepoint_t pendingRangeStart = INVALID_CODEPOINT;`
`2125`	`2126`	`EncodedChar nextChar = ECLookahead();`
`2126`	`2127`	`bool previousWasASurrogate = false;`
	`2128`	`+ bool currIsACharSet = false;`
	`2129`	`+ bool prevWasACharSetAndPartOfRange = false;`
	`2130`	`+ bool prevprevWasACharSetAndPartOfRange = false;`
	`2131`	`+`
`2127`	`2132`	`while(nextChar != ']')`
`2128`	`2133`	`{`
`2129`	`2134`	`codepoint_t codePointToSet = INVALID_CODEPOINT;`
`@@ -2133,6 +2138,7 @@ namespace UnifiedRegex`
`2133`	`2138`	`{`
`2134`	`2139`	`ECConsume();`
`2135`	`2140`	`}`
	`2141`	`+`
`2136`	`2142`	`// These if-blocks are the logical ClassAtomPass1, they weren't grouped into a method to simplify dealing with multiple out parameters.`
`2137`	`2143`	`if (containsSurrogates && this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)`
`2138`	`2144`	`{`
`@@ -2147,22 +2153,30 @@ namespace UnifiedRegex`
`2147`	`2153`	`else if (nextChar == '\\')`
`2148`	`2154`	`{`
`2149`	`2155`	`Node* returnedNode = ClassEscapePass1(&deferredCharNode, &deferredSetNode, previousWasASurrogate);`
	`2156`	`+ codePointToSet = pendingCodePoint;`
`2150`	`2157`
`2151`	`2158`	`if (returnedNode->tag == Node::MatchSet)`
`2152`	`2159`	`{`
`2153`		`- codePointToSet = pendingCodePoint;`
`2154`		`- pendingCodePoint = INVALID_CODEPOINT;`
`2155`	`2160`	`if (pendingRangeStart != INVALID_CODEPOINT)`
`2156`	`2161`	`{`
	`2162`	`+ if (unicodeFlagPresent)`
	`2163`	`+ {`
	`2164`	`+ //We have a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here`
	`2165`	`+ //This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.`
	`2166`	`+ Fail(JSERR_UnicodeRegExpRangeContainsCharClass); //From #sec-patterns-static-semantics-early-errors-annexb`
	`2167`	`+ }`
	`2168`	`+`
`2157`	`2169`	`codePointSet.Set(ctAllocator, '-');`
`2158`	`2170`	`}`
	`2171`	`+`
	`2172`	`+ pendingCodePoint = INVALID_CODEPOINT;`
`2159`	`2173`	`pendingRangeStart = INVALID_CODEPOINT;`
`2160`	`2174`	`codePointSet.UnionInPlace(ctAllocator, deferredSetNode.set);`
	`2175`	`+ currIsACharSet = true;`
`2161`	`2176`	`}`
`2162`	`2177`	`else`
`2163`	`2178`	`{`
`2164`	`2179`	`// Just a character`
`2165`		`- codePointToSet = pendingCodePoint;`
`2166`	`2180`	`pendingCodePoint = deferredCharNode.cs[0];`
`2167`	`2181`	`}`
`2168`	`2182`	`}`
`@@ -2188,9 +2202,26 @@ namespace UnifiedRegex`
`2188`	`2202`	`pendingCodePoint = NextChar();`
`2189`	`2203`	`}`
`2190`	`2204`
`2191`		`- if (codePointToSet != INVALID_CODEPOINT)`
	`2205`	`+ if (codePointToSet != INVALID_CODEPOINT || prevprevWasACharSetAndPartOfRange)`
`2192`	`2206`	`{`
`2193`		`- if (pendingRangeStart != INVALID_CODEPOINT)`
	`2207`	`+ if (prevprevWasACharSetAndPartOfRange)`
	`2208`	`+ {`
	`2209`	`+ //We have a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here`
	`2210`	`+ //This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.`
	`2211`	`+ if (unicodeFlagPresent)`
	`2212`	`+ {`
	`2213`	`+ Fail(JSERR_UnicodeRegExpRangeContainsCharClass);`
	`2214`	`+ }`
	`2215`	`+`
	`2216`	`+ if (pendingCodePoint != INVALID_CODEPOINT)`
	`2217`	`+ {`
	`2218`	`+ codePointSet.Set(ctAllocator, pendingCodePoint);`
	`2219`	`+ }`
	`2220`	`+`
	`2221`	`+ codePointSet.Set(ctAllocator, '-'); //Add '-' to set because a range was detected but turned out to be a union of character set with '-' and another atom.`
	`2222`	`+ pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;`
	`2223`	`+ }`
	`2224`	`+ else if (pendingRangeStart != INVALID_CODEPOINT)`
`2194`	`2225`	`{`
`2195`	`2226`	`if (pendingRangeStart > pendingCodePoint)`
`2196`	`2227`	`{`
`@@ -2199,6 +2230,7 @@ namespace UnifiedRegex`
`2199`	`2230`	`Assert(!unicodeFlagPresent);`
`2200`	`2231`	`Fail(JSERR_RegExpBadRange);`
`2201`	`2232`	`}`
	`2233`	`+`
`2202`	`2234`	`codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);`
`2203`	`2235`	`pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;`
`2204`	`2236`	`}`
`@@ -2209,6 +2241,9 @@ namespace UnifiedRegex`
`2209`	`2241`	`}`
`2210`	`2242`
`2211`	`2243`	`nextChar = ECLookahead();`
	`2244`	`+ prevprevWasACharSetAndPartOfRange = prevWasACharSetAndPartOfRange;`
	`2245`	`+ prevWasACharSetAndPartOfRange = currIsACharSet && nextChar == '-';`
	`2246`	`+ currIsACharSet = false;`
`2212`	`2247`	`}`
`2213`	`2248`
`2214`	`2249`	`if (pendingCodePoint != INVALID_CODEPOINT)`