diff --git a/src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj b/src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj
index 5efd5553d36853..da7a4b5e51642a 100644
--- a/src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj
+++ b/src/libraries/System.Data.Odbc/src/System.Data.Odbc.csproj
@@ -134,6 +134,9 @@ System.Data.Odbc.OdbcTransaction
Link="Common\DisableRuntimeMarshalling.cs" />
+
+
+
-
+
();
- for (int i = 0; i <= 0x7f; i++)
+ for (int i = 0; i < 128; i++)
{
if (!RegexCharClass.CharInClass((char)i, set))
{
@@ -466,6 +466,7 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary "IndexOfAnyDigit",
@@ -496,6 +497,18 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary null,
};
+
+ // If this set is just from a few Unicode categories, derive a name from the categories.
+ if (helperName is null)
+ {
+ Span categories = stackalloc UnicodeCategory[5]; // arbitrary limit to keep names from being too unwieldy
+ if (RegexCharClass.TryGetOnlyCategories(set, categories, out int numCategories, out bool negatedCategory))
+ {
+ helperName = $"IndexOfAny{(negatedCategory ? "Except" : "")}{string.Concat(categories.Slice(0, numCategories).ToArray().Select(c => c.ToString()))}";
+ }
+ }
+
+ // As a final fallback, manufacture a name unique to the full set description.
if (helperName is null)
{
using (SHA256 sha = SHA256.Create())
@@ -522,7 +535,7 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary 1;
FinishEmitBlock loopBlock = default;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
index ab0dd5554313a5..71a3004af4940f 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -1327,23 +1327,21 @@ private static bool CharInClassInternal(char ch, string set, int start, int setL
return false;
}
- return CharInCategory(ch, set, start, setLength, categoryLength);
+ return CharInCategory(ch, set.AsSpan(SetStartIndex + start + setLength, categoryLength));
}
- private static bool CharInCategory(char ch, string set, int start, int setLength, int categoryLength)
+ private static bool CharInCategory(char ch, ReadOnlySpan categorySetSegment)
{
UnicodeCategory chcategory = char.GetUnicodeCategory(ch);
- int i = start + SetStartIndex + setLength;
- int end = i + categoryLength;
- while (i < end)
+ for (int i = 0; i < categorySetSegment.Length; i++)
{
- int curcat = (short)set[i];
+ int curcat = (short)categorySetSegment[i];
if (curcat == 0)
{
// zero is our marker for a group of categories - treated as a unit
- if (CharInCategoryGroup(chcategory, set, ref i))
+ if (CharInCategoryGroup(chcategory, categorySetSegment, ref i))
{
return true;
}
@@ -1379,8 +1377,6 @@ private static bool CharInCategory(char ch, string set, int start, int setLength
return true;
}
}
-
- i++;
}
return false;
@@ -1390,7 +1386,7 @@ private static bool CharInCategory(char ch, string set, int start, int setLength
/// This is used for categories which are composed of other categories - L, N, Z, W...
/// These groups need special treatment when they are negated
///
- private static bool CharInCategoryGroup(UnicodeCategory chcategory, string category, ref int i)
+ private static bool CharInCategoryGroup(UnicodeCategory chcategory, ReadOnlySpan category, ref int i)
{
int pos = i + 1;
int curcat = (short)category[pos];
@@ -1717,6 +1713,52 @@ _subtractor is null &&
}
}
}
+
+ // If the class now has a range that includes everything, and if it doesn't have subtraction,
+ // we can remove all of its categories, as they're duplicative (the set already includes everything).
+ if (!_negate &&
+ _subtractor is null &&
+ _categories?.Length > 0 &&
+ rangelist.Count == 1 && rangelist[0].First == 0 && rangelist[0].Last == LastChar)
+ {
+ _categories.Clear();
+ }
+
+ // If there's only a single character omitted from ranges, if there's no subtractor, and if there are categories,
+ // see if that character is in the categories. If it is, then we can replace whole thing with a complete "any" range.
+ // If it's not, then we can remove the categories, as they're only duplicating the rest of the range, turning the set
+ // into a "not one". This primarily helps in the case of a synthesized set from analysis that ends up combining '.' with
+ // categories, as we want to reduce that set down to either [^\n] or [\0-\uFFFF]. (This can be extrapolated to any number
+ // of missing characters; in fact, categories in general are superfluous and the entire set can be represented as ranges.
+ // But categories serve as a space optimization, and we strike a balance between testing many characters and the time/complexity
+ // it takes to do so. Thus, we limit this to the common case of a single missing character.)
+ if (!_negate &&
+ _subtractor is null &&
+ _categories?.Length > 0 &&
+ rangelist.Count == 2 && rangelist[0].First == 0 && rangelist[0].Last + 2 == rangelist[1].First && rangelist[1].Last == LastChar)
+ {
+ var vsb = new ValueStringBuilder(stackalloc char[256]);
+ foreach (ReadOnlyMemory chunk in _categories!.GetChunks())
+ {
+ vsb.Append(chunk.Span);
+ }
+
+ if (CharInCategory((char)(rangelist[0].Last + 1), vsb.AsSpan()))
+ {
+ rangelist.RemoveAt(1);
+ rangelist[0] = ('\0', LastChar);
+ }
+ else
+ {
+ _negate = true;
+ rangelist.RemoveAt(1);
+ char notOne = (char)(rangelist[0].Last + 1);
+ rangelist[0] = (notOne, notOne);
+ }
+ _categories.Clear();
+
+ vsb.Dispose();
+ }
}
}
@@ -1792,12 +1834,20 @@ public static string DescribeSet(string set)
void RenderRanges()
{
- for (; index < SetStartIndex + set[SetLengthIndex]; index += 2)
+ int rangesEnd = SetStartIndex + set[SetLengthIndex];
+ while (index < rangesEnd)
{
ch1 = set[index];
- ch2 = index + 1 < set.Length ?
- (char)(set[index + 1] - 1) :
- LastChar;
+ if (index + 1 < rangesEnd)
+ {
+ ch2 = (char)(set[index + 1] - 1);
+ index += 2;
+ }
+ else
+ {
+ ch2 = LastChar;
+ index++;
+ }
desc.Append(DescribeChar(ch1));
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
index d44a7e63b5b723..f9be28acceeeb2 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -8,6 +8,7 @@
using System.Globalization;
using System.Reflection;
using System.Reflection.Emit;
+using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading;
@@ -97,6 +98,7 @@ internal abstract class RegexCompiler
private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) })!;
private static readonly MethodInfo s_arrayResize = typeof(Array).GetMethod("Resize")!.MakeGenericMethod(typeof(int));
private static readonly MethodInfo s_mathMinIntInt = typeof(Math).GetMethod("Min", new Type[] { typeof(int), typeof(int) })!;
+ private static readonly MethodInfo s_memoryMarshalGetArrayDataReferenceIndexOfAnyValues = typeof(MemoryMarshal).GetMethod("GetArrayDataReference", new Type[] { Type.MakeGenericMethodParameter(0).MakeArrayType() })!.MakeGenericMethod(typeof(IndexOfAnyValues))!;
// Note:
// Single-range helpers like IsAsciiLetterLower, IsAsciiLetterUpper, IsAsciiDigit, and IsBetween aren't used here, as the IL generated for those
// single-range checks is as cheap as the method call, and there's no readability issue as with the source generator.
@@ -834,9 +836,13 @@ void EmitFixedSet_LeftToRight()
// Use IndexOf{Any} to accelerate the skip loop via vectorization to match the first prefix.
// But we avoid using it for the relatively common case of the starting set being '.', aka anything other than
- // a newline, as it's very rare to have long, uninterrupted sequences of newlines.
+ // a newline, as it's very rare to have long, uninterrupted sequences of newlines. And we avoid using it
+ // for the case of the starting set being anything (e.g. '.' with SingleLine), as in that case it'll always match
+ // the first char.
int setIndex = 0;
- bool canUseIndexOf = primarySet.Set != RegexCharClass.NotNewLineClass;
+ bool canUseIndexOf =
+ primarySet.Set != RegexCharClass.NotNewLineClass &&
+ primarySet.Set != RegexCharClass.AnyClass;
bool needLoop = !canUseIndexOf || setsToUse > 1;
Label checkSpanLengthLabel = default;
@@ -6098,10 +6104,19 @@ private void LoadIndexOfAnyValues(ReadOnlySpan chars)
int index = list.Count;
list.Add(IndexOfAnyValues.Create(chars));
- // this._indexOfAnyValues[index]
+ // Logically do _indexOfAnyValues[index], but avoid the bounds check on accessing the array,
+ // and cast to the known derived sealed type to enable devirtualization.
+
+ // DerivedIndexOfAnyValues d = Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(this._indexOfAnyValues), index);
+ // ... = d;
Ldthisfld(s_indexOfAnyValuesArrayField);
- Ldc(index);
- _ilg!.Emit(OpCodes.Ldelem_Ref);
+ Call(s_memoryMarshalGetArrayDataReferenceIndexOfAnyValues);
+ Ldc(index * IntPtr.Size);
+ Add();
+ _ilg!.Emit(OpCodes.Ldind_Ref);
+ LocalBuilder ioavLocal = _ilg!.DeclareLocal(list[index].GetType());
+ Stloc(ioavLocal);
+ Ldloc(ioavLocal);
}
}
}