From c6515d0473a09f0a0a37b2d400474598fb564811 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Fri, 7 Apr 2023 17:20:20 -0400 Subject: [PATCH 1/5] Fix downlevel builds with a project reference to regex generator --- src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj | 3 +++ src/libraries/System.Data.OleDb/src/System.Data.OleDb.csproj | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj b/src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj index 5efd5553d36853..da7a4b5e51642a 100644 --- a/src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj +++ b/src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj @@ -134,6 +134,9 @@ System.Data.Odbc.OdbcTransaction Link="Common\DisableRuntimeMarshalling.cs" /> + + + - + Date: Tue, 4 Apr 2023 12:57:54 -0400 Subject: [PATCH 2/5] Improve char class canonicalization for complete and almost empty sets - Remove categories from a set whose ranges make it already complete (when there's no subtraction). We have code paths that explicitly recognize the Any char class, and these extra categories knock these sets off those fast paths. - Remove categories from a set where a single char is missing from the ranges, by checking whether that char is contained in the categories. If the char is present, the set can be morphed into Any. If the char isn't present, the categories can be removed and the set becomes a standard NotOne form. Both of these are unlikely to be written explicitly by a developer but result from analysis producing search sets, in particular when alternations or nullable loops are involved. Also fixed textual description of sets that both contain the last character (\uFFFF) and have categories. We were sometimes skipping the first category in this case. This is only relevant to the source generator, as these descriptions are output in comments. --- .../Text/RegularExpressions/RegexCharClass.cs | 78 +++++++++++++++---- 1 file changed, 64 insertions(+), 14 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index ab0dd5554313a5..71a3004af4940f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -1327,23 +1327,21 @@ private static bool CharInClassInternal(char ch, string set, int start, int setL return false; } - return CharInCategory(ch, set, start, setLength, categoryLength); + return CharInCategory(ch, set.AsSpan(SetStartIndex + start + setLength, categoryLength)); } - private static bool CharInCategory(char ch, string set, int start, int setLength, int categoryLength) + private static bool CharInCategory(char ch, ReadOnlySpan categorySetSegment) { UnicodeCategory chcategory = char.GetUnicodeCategory(ch); - int i = start + SetStartIndex + setLength; - int end = i + categoryLength; - while (i < end) + for (int i = 0; i < categorySetSegment.Length; i++) { - int curcat = (short)set[i]; + int curcat = (short)categorySetSegment[i]; if (curcat == 0) { // zero is our marker for a group of categories - treated as a unit - if (CharInCategoryGroup(chcategory, set, ref i)) + if (CharInCategoryGroup(chcategory, categorySetSegment, ref i)) { return true; } @@ -1379,8 +1377,6 @@ private static bool CharInCategory(char ch, string set, int start, int setLength return true; } } - - i++; } return false; @@ -1390,7 +1386,7 @@ private static bool CharInCategory(char ch, string set, int start, int setLength /// This is used for categories which are composed of other categories - L, N, Z, W... /// These groups need special treatment when they are negated /// - private static bool CharInCategoryGroup(UnicodeCategory chcategory, string category, ref int i) + private static bool CharInCategoryGroup(UnicodeCategory chcategory, ReadOnlySpan category, ref int i) { int pos = i + 1; int curcat = (short)category[pos]; @@ -1717,6 +1713,52 @@ _subtractor is null && } } } + + // If the class now has a range that includes everything, and if it doesn't have subtraction, + // we can remove all of its categories, as they're duplicative (the set already includes everything). + if (!_negate && + _subtractor is null && + _categories?.Length > 0 && + rangelist.Count == 1 && rangelist[0].First == 0 && rangelist[0].Last == LastChar) + { + _categories.Clear(); + } + + // If there's only a single character omitted from ranges, if there's no subtractor, and if there are categories, + // see if that character is in the categories. If it is, then we can replace whole thing with a complete "any" range. + // If it's not, then we can remove the categories, as they're only duplicating the rest of the range, turning the set + // into a "not one". This primarily helps in the case of a synthesized set from analysis that ends up combining '.' with + // categories, as we want to reduce that set down to either [^\n] or [\0-\uFFFF]. (This can be extrapolated to any number + // of missing characters; in fact, categories in general are superfluous and the entire set can be represented as ranges. + // But categories serve as a space optimization, and we strike a balance between testing many characters and the time/complexity + // it takes to do so. Thus, we limit this to the common case of a single missing character.) + if (!_negate && + _subtractor is null && + _categories?.Length > 0 && + rangelist.Count == 2 && rangelist[0].First == 0 && rangelist[0].Last + 2 == rangelist[1].First && rangelist[1].Last == LastChar) + { + var vsb = new ValueStringBuilder(stackalloc char[256]); + foreach (ReadOnlyMemory chunk in _categories!.GetChunks()) + { + vsb.Append(chunk.Span); + } + + if (CharInCategory((char)(rangelist[0].Last + 1), vsb.AsSpan())) + { + rangelist.RemoveAt(1); + rangelist[0] = ('\0', LastChar); + } + else + { + _negate = true; + rangelist.RemoveAt(1); + char notOne = (char)(rangelist[0].Last + 1); + rangelist[0] = (notOne, notOne); + } + _categories.Clear(); + + vsb.Dispose(); + } } } @@ -1792,12 +1834,20 @@ public static string DescribeSet(string set) void RenderRanges() { - for (; index < SetStartIndex + set[SetLengthIndex]; index += 2) + int rangesEnd = SetStartIndex + set[SetLengthIndex]; + while (index < rangesEnd) { ch1 = set[index]; - ch2 = index + 1 < set.Length ? - (char)(set[index + 1] - 1) : - LastChar; + if (index + 1 < rangesEnd) + { + ch2 = (char)(set[index + 1] - 1); + index += 2; + } + else + { + ch2 = LastChar; + index++; + } desc.Append(DescribeChar(ch1)); From 88d3f417d465238b2859dd49540086df92e761b8 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Tue, 4 Apr 2023 13:01:17 -0400 Subject: [PATCH 3/5] Avoid using a IndexOf for the any set We needn't search for anything, as everything matches. --- .../gen/RegexGenerator.Emitter.cs | 8 ++++++-- .../src/System/Text/RegularExpressions/RegexCompiler.cs | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index f0c15fa7aa0d07..a03e2997535efa 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -1004,9 +1004,13 @@ void EmitFixedSet_LeftToRight() // Use IndexOf{Any} to accelerate the skip loop via vectorization to match the first prefix. // But we avoid using it for the relatively common case of the starting set being '.', aka anything other than - // a newline, as it's very rare to have long, uninterrupted sequences of newlines. + // a newline, as it's very rare to have long, uninterrupted sequences of newlines. And we avoid using it + // for the case of the starting set being anything (e.g. '.' with SingleLine), as in that case it'll always match + // the first char. int setIndex = 0; - bool canUseIndexOf = primarySet.Set != RegexCharClass.NotNewLineClass; + bool canUseIndexOf = + primarySet.Set != RegexCharClass.NotNewLineClass && + primarySet.Set != RegexCharClass.AnyClass; bool needLoop = !canUseIndexOf || setsToUse > 1; FinishEmitBlock loopBlock = default; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index d44a7e63b5b723..42df1b9ba5e0cc 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -834,9 +834,13 @@ void EmitFixedSet_LeftToRight() // Use IndexOf{Any} to accelerate the skip loop via vectorization to match the first prefix. // But we avoid using it for the relatively common case of the starting set being '.', aka anything other than - // a newline, as it's very rare to have long, uninterrupted sequences of newlines. + // a newline, as it's very rare to have long, uninterrupted sequences of newlines. And we avoid using it + // for the case of the starting set being anything (e.g. '.' with SingleLine), as in that case it'll always match + // the first char. int setIndex = 0; - bool canUseIndexOf = primarySet.Set != RegexCharClass.NotNewLineClass; + bool canUseIndexOf = + primarySet.Set != RegexCharClass.NotNewLineClass && + primarySet.Set != RegexCharClass.AnyClass; bool needLoop = !canUseIndexOf || setsToUse > 1; Label checkSpanLengthLabel = default; From 7df88525f3dfd82749cb221d013f589312556aef Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Tue, 4 Apr 2023 13:16:29 -0400 Subject: [PATCH 4/5] Improve regex source gen IndexOfAny naming for Unicode categories When we're otherwise unable to come up with a good name for the custom IndexOfAny helper, if the set is just a handful of UnicodeCategory values, derive a name from those categories. --- .../gen/RegexGenerator.Emitter.cs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index a03e2997535efa..7f3c41f0d794ac 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -458,7 +458,7 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary(); - for (int i = 0; i <= 0x7f; i++) + for (int i = 0; i < 128; i++) { if (!RegexCharClass.CharInClass((char)i, set)) { @@ -466,6 +466,7 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary "IndexOfAnyDigit", @@ -496,6 +497,18 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary null, }; + + // If this set is just from a few Unicode categories, derive a name from the categories. + if (helperName is null) + { + Span categories = stackalloc UnicodeCategory[5]; // arbitrary limit to keep names from being too unwieldy + if (RegexCharClass.TryGetOnlyCategories(set, categories, out int numCategories, out bool negatedCategory)) + { + helperName = $"IndexOfAny{(negatedCategory ? "Except" : "")}{string.Concat(categories.Slice(0, numCategories).ToArray().Select(c => c.ToString()))}"; + } + } + + // As a final fallback, manufacture a name unique to the full set description. if (helperName is null) { using (SHA256 sha = SHA256.Create()) @@ -522,7 +535,7 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary Date: Wed, 5 Apr 2023 11:14:41 -0400 Subject: [PATCH 5/5] Reduce RegexCompiler cost of using IndexOfAnyValues With the source generator, each IndexOfAnyValues is stored in its own static readonly field. This makes it cheap to access and allows the JIT to devirtualize calls to it. With RegexCompiler, we use a DynamicMethod and thus can't introduce new static fields, so instead we maintain an array of IndexOfAnyValues. That means that every time we need one, we're loading the object out of the array. This incurs both bounds checks and doesn't devirtualize. This commit changes the implementation to avoid the bounds check and to also enable devirtualization. --- .../Text/RegularExpressions/RegexCompiler.cs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 42df1b9ba5e0cc..f9be28acceeeb2 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -8,6 +8,7 @@ using System.Globalization; using System.Reflection; using System.Reflection.Emit; +using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Threading; @@ -97,6 +98,7 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!; private static readonly MethodInfo s_arrayResize = typeof(Array).GetMethod("Resize")!.MakeGenericMethod(typeof(int)); private static readonly MethodInfo s_mathMinIntInt = typeof(Math).GetMethod("Min", new Type[] { typeof(int), typeof(int) })!; + private static readonly MethodInfo s_memoryMarshalGetArrayDataReferenceIndexOfAnyValues = typeof(MemoryMarshal).GetMethod("GetArrayDataReference", new Type[] { Type.MakeGenericMethodParameter(0).MakeArrayType() })!.MakeGenericMethod(typeof(IndexOfAnyValues))!; // Note: // Single-range helpers like IsAsciiLetterLower, IsAsciiLetterUpper, IsAsciiDigit, and IsBetween aren't used here, as the IL generated for those // single-range checks is as cheap as the method call, and there's no readability issue as with the source generator. @@ -6102,10 +6104,19 @@ private void LoadIndexOfAnyValues(ReadOnlySpan chars) int index = list.Count; list.Add(IndexOfAnyValues.Create(chars)); - // this._indexOfAnyValues[index] + // Logically do _indexOfAnyValues[index], but avoid the bounds check on accessing the array, + // and cast to the known derived sealed type to enable devirtualization. + + // DerivedIndexOfAnyValues d = Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(this._indexOfAnyValues), index); + // ... = d; Ldthisfld(s_indexOfAnyValuesArrayField); - Ldc(index); - _ilg!.Emit(OpCodes.Ldelem_Ref); + Call(s_memoryMarshalGetArrayDataReferenceIndexOfAnyValues); + Ldc(index * IntPtr.Size); + Add(); + _ilg!.Emit(OpCodes.Ldind_Ref); + LocalBuilder ioavLocal = _ilg!.DeclareLocal(list[index].GetType()); + Stloc(ioavLocal); + Ldloc(ioavLocal); } } }