diff --git a/src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj b/src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj index 5efd5553d36853..da7a4b5e51642a 100644 --- a/src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj +++ b/src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj @@ -134,6 +134,9 @@ System.Data.Odbc.OdbcTransaction Link="Common\DisableRuntimeMarshalling.cs" /> + + + - + (); - for (int i = 0; i <= 0x7f; i++) + for (int i = 0; i < 128; i++) { if (!RegexCharClass.CharInClass((char)i, set)) { @@ -466,6 +466,7 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary "IndexOfAnyDigit", @@ -496,6 +497,18 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary null, }; + + // If this set is just from a few Unicode categories, derive a name from the categories. + if (helperName is null) + { + Span categories = stackalloc UnicodeCategory[5]; // arbitrary limit to keep names from being too unwieldy + if (RegexCharClass.TryGetOnlyCategories(set, categories, out int numCategories, out bool negatedCategory)) + { + helperName = $"IndexOfAny{(negatedCategory ? "Except" : "")}{string.Concat(categories.Slice(0, numCategories).ToArray().Select(c => c.ToString()))}"; + } + } + + // As a final fallback, manufacture a name unique to the full set description. if (helperName is null) { using (SHA256 sha = SHA256.Create()) @@ -522,7 +535,7 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary 1; FinishEmitBlock loopBlock = default; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index ab0dd5554313a5..71a3004af4940f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -1327,23 +1327,21 @@ private static bool CharInClassInternal(char ch, string set, int start, int setL return false; } - return CharInCategory(ch, set, start, setLength, categoryLength); + return CharInCategory(ch, set.AsSpan(SetStartIndex + start + setLength, categoryLength)); } - private static bool CharInCategory(char ch, string set, int start, int setLength, int categoryLength) + private static bool CharInCategory(char ch, ReadOnlySpan categorySetSegment) { UnicodeCategory chcategory = char.GetUnicodeCategory(ch); - int i = start + SetStartIndex + setLength; - int end = i + categoryLength; - while (i < end) + for (int i = 0; i < categorySetSegment.Length; i++) { - int curcat = (short)set[i]; + int curcat = (short)categorySetSegment[i]; if (curcat == 0) { // zero is our marker for a group of categories - treated as a unit - if (CharInCategoryGroup(chcategory, set, ref i)) + if (CharInCategoryGroup(chcategory, categorySetSegment, ref i)) { return true; } @@ -1379,8 +1377,6 @@ private static bool CharInCategory(char ch, string set, int start, int setLength return true; } } - - i++; } return false; @@ -1390,7 +1386,7 @@ private static bool CharInCategory(char ch, string set, int start, int setLength /// This is used for categories which are composed of other categories - L, N, Z, W... /// These groups need special treatment when they are negated /// - private static bool CharInCategoryGroup(UnicodeCategory chcategory, string category, ref int i) + private static bool CharInCategoryGroup(UnicodeCategory chcategory, ReadOnlySpan category, ref int i) { int pos = i + 1; int curcat = (short)category[pos]; @@ -1717,6 +1713,52 @@ _subtractor is null && } } } + + // If the class now has a range that includes everything, and if it doesn't have subtraction, + // we can remove all of its categories, as they're duplicative (the set already includes everything). + if (!_negate && + _subtractor is null && + _categories?.Length > 0 && + rangelist.Count == 1 && rangelist[0].First == 0 && rangelist[0].Last == LastChar) + { + _categories.Clear(); + } + + // If there's only a single character omitted from ranges, if there's no subtractor, and if there are categories, + // see if that character is in the categories. If it is, then we can replace whole thing with a complete "any" range. + // If it's not, then we can remove the categories, as they're only duplicating the rest of the range, turning the set + // into a "not one". This primarily helps in the case of a synthesized set from analysis that ends up combining '.' with + // categories, as we want to reduce that set down to either [^\n] or [\0-\uFFFF]. (This can be extrapolated to any number + // of missing characters; in fact, categories in general are superfluous and the entire set can be represented as ranges. + // But categories serve as a space optimization, and we strike a balance between testing many characters and the time/complexity + // it takes to do so. Thus, we limit this to the common case of a single missing character.) + if (!_negate && + _subtractor is null && + _categories?.Length > 0 && + rangelist.Count == 2 && rangelist[0].First == 0 && rangelist[0].Last + 2 == rangelist[1].First && rangelist[1].Last == LastChar) + { + var vsb = new ValueStringBuilder(stackalloc char[256]); + foreach (ReadOnlyMemory chunk in _categories!.GetChunks()) + { + vsb.Append(chunk.Span); + } + + if (CharInCategory((char)(rangelist[0].Last + 1), vsb.AsSpan())) + { + rangelist.RemoveAt(1); + rangelist[0] = ('\0', LastChar); + } + else + { + _negate = true; + rangelist.RemoveAt(1); + char notOne = (char)(rangelist[0].Last + 1); + rangelist[0] = (notOne, notOne); + } + _categories.Clear(); + + vsb.Dispose(); + } } } @@ -1792,12 +1834,20 @@ public static string DescribeSet(string set) void RenderRanges() { - for (; index < SetStartIndex + set[SetLengthIndex]; index += 2) + int rangesEnd = SetStartIndex + set[SetLengthIndex]; + while (index < rangesEnd) { ch1 = set[index]; - ch2 = index + 1 < set.Length ? - (char)(set[index + 1] - 1) : - LastChar; + if (index + 1 < rangesEnd) + { + ch2 = (char)(set[index + 1] - 1); + index += 2; + } + else + { + ch2 = LastChar; + index++; + } desc.Append(DescribeChar(ch1)); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index d44a7e63b5b723..f9be28acceeeb2 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -8,6 +8,7 @@ using System.Globalization; using System.Reflection; using System.Reflection.Emit; +using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Threading; @@ -97,6 +98,7 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!; private static readonly MethodInfo s_arrayResize = typeof(Array).GetMethod("Resize")!.MakeGenericMethod(typeof(int)); private static readonly MethodInfo s_mathMinIntInt = typeof(Math).GetMethod("Min", new Type[] { typeof(int), typeof(int) })!; + private static readonly MethodInfo s_memoryMarshalGetArrayDataReferenceIndexOfAnyValues = typeof(MemoryMarshal).GetMethod("GetArrayDataReference", new Type[] { Type.MakeGenericMethodParameter(0).MakeArrayType() })!.MakeGenericMethod(typeof(IndexOfAnyValues))!; // Note: // Single-range helpers like IsAsciiLetterLower, IsAsciiLetterUpper, IsAsciiDigit, and IsBetween aren't used here, as the IL generated for those // single-range checks is as cheap as the method call, and there's no readability issue as with the source generator. @@ -834,9 +836,13 @@ void EmitFixedSet_LeftToRight() // Use IndexOf{Any} to accelerate the skip loop via vectorization to match the first prefix. // But we avoid using it for the relatively common case of the starting set being '.', aka anything other than - // a newline, as it's very rare to have long, uninterrupted sequences of newlines. + // a newline, as it's very rare to have long, uninterrupted sequences of newlines. And we avoid using it + // for the case of the starting set being anything (e.g. '.' with SingleLine), as in that case it'll always match + // the first char. int setIndex = 0; - bool canUseIndexOf = primarySet.Set != RegexCharClass.NotNewLineClass; + bool canUseIndexOf = + primarySet.Set != RegexCharClass.NotNewLineClass && + primarySet.Set != RegexCharClass.AnyClass; bool needLoop = !canUseIndexOf || setsToUse > 1; Label checkSpanLengthLabel = default; @@ -6098,10 +6104,19 @@ private void LoadIndexOfAnyValues(ReadOnlySpan chars) int index = list.Count; list.Add(IndexOfAnyValues.Create(chars)); - // this._indexOfAnyValues[index] + // Logically do _indexOfAnyValues[index], but avoid the bounds check on accessing the array, + // and cast to the known derived sealed type to enable devirtualization. + + // DerivedIndexOfAnyValues d = Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(this._indexOfAnyValues), index); + // ... = d; Ldthisfld(s_indexOfAnyValuesArrayField); - Ldc(index); - _ilg!.Emit(OpCodes.Ldelem_Ref); + Call(s_memoryMarshalGetArrayDataReferenceIndexOfAnyValues); + Ldc(index * IntPtr.Size); + Add(); + _ilg!.Emit(OpCodes.Ldind_Ref); + LocalBuilder ioavLocal = _ilg!.DeclareLocal(list[index].GetType()); + Stloc(ioavLocal); + Ldloc(ioavLocal); } } }