From 808cb796f06149707cac7c92ec864d8a310ee34a Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Thu, 5 Dec 2024 20:32:56 -0800 Subject: [PATCH 1/4] Normalization APIs using the spans --- .../System/Globalization/Normalization.Icu.cs | 124 +++++++++++-- .../System/Globalization/Normalization.Nls.cs | 169 ++++++++++++++---- .../src/System/Globalization/Normalization.cs | 79 ++++++++ .../System/StringNormalizationExtensions.cs | 57 ++++++ .../System.Runtime/ref/System.Runtime.cs | 3 + .../Normalization/NormalizationAll.cs | 44 +++++ .../Normalization/StringNormalizationTests.cs | 49 +++++ 7 files changed, 481 insertions(+), 44 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs index 6ef9df95aa79d4..e657d61289ce29 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs @@ -14,6 +14,7 @@ private static unsafe bool IcuIsNormalized(string strInput, NormalizationForm no { Debug.Assert(!GlobalizationMode.Invariant); Debug.Assert(!GlobalizationMode.UseNls); + Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); ValidateArguments(strInput, normalizationForm); @@ -40,10 +41,43 @@ private static unsafe bool IcuIsNormalized(string strInput, NormalizationForm no return ret == 1; } + private static unsafe bool IcuIsNormalized(ReadOnlySpan source, NormalizationForm normalizationForm) + { + Debug.Assert(!GlobalizationMode.Invariant); + Debug.Assert(!GlobalizationMode.UseNls); + Debug.Assert(!source.IsEmpty); + Debug.Assert(normalizationForm is NormalizationForm.FormC or NormalizationForm.FormD or NormalizationForm.FormKC or NormalizationForm.FormKD); + + ValidateArguments(source, normalizationForm, nameof(source)); + + int ret; + fixed (char* pInput = source) + { +#if TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS + if (GlobalizationMode.Hybrid) + { + ret = Interop.Globalization.IsNormalizedNative(normalizationForm, pInput, source.Length); + } + else +#endif + { + ret = Interop.Globalization.IsNormalized(normalizationForm, pInput, source.Length); + } + } + + if (ret == -1) + { + throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(source)); + } + + return ret == 1; + } + private static unsafe string IcuNormalize(string strInput, NormalizationForm normalizationForm) { Debug.Assert(!GlobalizationMode.Invariant); Debug.Assert(!GlobalizationMode.UseNls); + Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); ValidateArguments(strInput, normalizationForm); @@ -114,25 +148,95 @@ private static unsafe string IcuNormalize(string strInput, NormalizationForm nor } } - private static void ValidateArguments(string strInput, NormalizationForm normalizationForm) + private static unsafe bool IcuTryNormalize(ReadOnlySpan source, Span destination, out int charsWritten, NormalizationForm normalizationForm = NormalizationForm.FormC) { - Debug.Assert(strInput != null); + Debug.Assert(!GlobalizationMode.Invariant); + Debug.Assert(!GlobalizationMode.UseNls); + Debug.Assert(!source.IsEmpty); + Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); - if ((OperatingSystem.IsBrowser() || OperatingSystem.IsWasi())&& (normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD)) + if (destination.IsEmpty) { - // Browser's ICU doesn't contain data needed for FormKC and FormKD - throw new PlatformNotSupportedException(); + charsWritten = 0; + return false; + } + + ValidateArguments(source, normalizationForm, nameof(source)); + + int realLen; + fixed (char* pInput = source) + fixed (char* pDest = destination) + { +#if TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS + if (GlobalizationMode.Hybrid) + { + realLen = Interop.Globalization.NormalizeStringNative(normalizationForm, pInput, source.Length, pDest, destination.Length); + } + else +#endif + { + realLen = Interop.Globalization.NormalizeString(normalizationForm, pInput, source.Length, pDest, destination.Length); + } + } + + if (realLen == -1) + { + throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(source)); } - if (normalizationForm != NormalizationForm.FormC && normalizationForm != NormalizationForm.FormD && - normalizationForm != NormalizationForm.FormKC && normalizationForm != NormalizationForm.FormKD) + if (realLen <= destination.Length) { - throw new ArgumentException(SR.Argument_InvalidNormalizationForm, nameof(normalizationForm)); + charsWritten = realLen; + return true; + } + + charsWritten = 0; + return false; + } + + private static unsafe int IcuGetNormalizedLength(ReadOnlySpan source, NormalizationForm normalizationForm) + { + Debug.Assert(!GlobalizationMode.Invariant); + Debug.Assert(!GlobalizationMode.UseNls); + Debug.Assert(!source.IsEmpty); + Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); + + ValidateArguments(source, normalizationForm, nameof(source)); + + int realLen; + fixed (char* pInput = source) + { +#if TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS + if (GlobalizationMode.Hybrid) + { + realLen = Interop.Globalization.NormalizeStringNative(normalizationForm, pInput, source.Length, null, 0); + } + else +#endif + { + realLen = Interop.Globalization.NormalizeString(normalizationForm, pInput, source.Length, null, 0); + } + } + + if (realLen < 0) + { + throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(source)); + } + + return realLen; + } + + private static void ValidateArguments(ReadOnlySpan strInput, NormalizationForm normalizationForm, string paramName = "strInput") + { + if ((OperatingSystem.IsBrowser() || OperatingSystem.IsWasi()) && (normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD)) + { + // Browser's ICU doesn't contain data needed for FormKC and FormKD + throw new PlatformNotSupportedException(); } if (HasInvalidUnicodeSequence(strInput)) { - throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput)); + throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, paramName); } } @@ -143,7 +247,7 @@ private static void ValidateArguments(string strInput, NormalizationForm normali /// We walk the string ourselves looking for these bad sequences so we can continue to throw /// ArgumentException in these cases. /// - private static bool HasInvalidUnicodeSequence(string s) + private static bool HasInvalidUnicodeSequence(ReadOnlySpan s) { for (int i = 0; i < s.Length; i++) { diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Nls.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Nls.cs index 3abea572529896..3350ee00c21d66 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Nls.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Nls.cs @@ -3,6 +3,7 @@ using System.Buffers; using System.Diagnostics; +using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Text; @@ -10,45 +11,44 @@ namespace System.Globalization { internal static partial class Normalization { - private static unsafe bool NlsIsNormalized(string strInput, NormalizationForm normalizationForm) + private static unsafe bool NlsIsNormalized(ReadOnlySpan source, NormalizationForm normalizationForm) { Debug.Assert(!GlobalizationMode.Invariant); Debug.Assert(GlobalizationMode.UseNls); - Debug.Assert(strInput != null); + Debug.Assert(!source.IsEmpty); + Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); + + Interop.BOOL result; + fixed (char* pInput = source) + { + result = Interop.Normaliz.IsNormalizedString(normalizationForm, pInput, source.Length); + } // The only way to know if IsNormalizedString failed is through checking the Win32 last error // IsNormalizedString pinvoke has SetLastError attribute property which will set the last error // to 0 (ERROR_SUCCESS) before executing the calls. + CheckLastErrorAndThrowIfFailed(nameof(source)); + + return result != Interop.BOOL.FALSE; + } + + private static unsafe bool NlsIsNormalized(string strInput, NormalizationForm normalizationForm) + { + Debug.Assert(!GlobalizationMode.Invariant); + Debug.Assert(GlobalizationMode.UseNls); + Debug.Assert(strInput != null); + Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); + Interop.BOOL result; fixed (char* pInput = strInput) { result = Interop.Normaliz.IsNormalizedString(normalizationForm, pInput, strInput.Length); } - int lastError = Marshal.GetLastPInvokeError(); - switch (lastError) - { - case Interop.Errors.ERROR_SUCCESS: - break; - - case Interop.Errors.ERROR_INVALID_PARAMETER: - case Interop.Errors.ERROR_NO_UNICODE_TRANSLATION: - if (normalizationForm != NormalizationForm.FormC && - normalizationForm != NormalizationForm.FormD && - normalizationForm != NormalizationForm.FormKC && - normalizationForm != NormalizationForm.FormKD) - { - throw new ArgumentException(SR.Argument_InvalidNormalizationForm, nameof(normalizationForm)); - } - - throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput)); - - case Interop.Errors.ERROR_NOT_ENOUGH_MEMORY: - throw new OutOfMemoryException(); - - default: - throw new InvalidOperationException(SR.Format(SR.UnknownError_Num, lastError)); - } + // The only way to know if IsNormalizedString failed is through checking the Win32 last error + // IsNormalizedString pinvoke has SetLastError attribute property which will set the last error + // to 0 (ERROR_SUCCESS) before executing the calls. + CheckLastErrorAndThrowIfFailed(nameof(strInput)); return result != Interop.BOOL.FALSE; } @@ -58,6 +58,7 @@ private static unsafe string NlsNormalize(string strInput, NormalizationForm nor Debug.Assert(!GlobalizationMode.Invariant); Debug.Assert(GlobalizationMode.UseNls); Debug.Assert(strInput != null); + Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); if (strInput.Length == 0) { @@ -111,14 +112,6 @@ private static unsafe string NlsNormalize(string strInput, NormalizationForm nor case Interop.Errors.ERROR_INVALID_PARAMETER: case Interop.Errors.ERROR_NO_UNICODE_TRANSLATION: - if (normalizationForm != NormalizationForm.FormC && - normalizationForm != NormalizationForm.FormD && - normalizationForm != NormalizationForm.FormKC && - normalizationForm != NormalizationForm.FormKD) - { - throw new ArgumentException(SR.Argument_InvalidNormalizationForm, nameof(normalizationForm)); - } - // Illegal code point or order found. Ie: FFFE or D800 D800, etc. throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput)); @@ -139,5 +132,113 @@ private static unsafe string NlsNormalize(string strInput, NormalizationForm nor } } } + + private static unsafe bool NlsTryNormalize(ReadOnlySpan source, Span destination, out int charsWritten, NormalizationForm normalizationForm = NormalizationForm.FormC) + { + Debug.Assert(!GlobalizationMode.Invariant); + Debug.Assert(GlobalizationMode.UseNls); + Debug.Assert(!source.IsEmpty); + Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); + + if (destination.IsEmpty) + { + charsWritten = 0; + return false; + } + + // we depend on Win32 last error when calling NormalizeString + // NormalizeString pinvoke has SetLastError attribute property which will set the last error + // to 0 (ERROR_SUCCESS) before executing the calls. + + int realLength; + fixed (char* pInput = source) + fixed (char* pDest = destination) + { + realLength = Interop.Normaliz.NormalizeString(normalizationForm, pInput, source.Length, pDest, destination.Length); + } + + int lastError = Marshal.GetLastPInvokeError(); + switch (lastError) + { + case Interop.Errors.ERROR_SUCCESS: + charsWritten = realLength; + return true; + + // Do appropriate stuff for the individual errors: + case Interop.Errors.ERROR_INSUFFICIENT_BUFFER: + charsWritten = 0; + return false; + + case Interop.Errors.ERROR_INVALID_PARAMETER: + case Interop.Errors.ERROR_NO_UNICODE_TRANSLATION: + // Illegal code point or order found. Ie: FFFE or D800 D800, etc. + throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(source)); + + case Interop.Errors.ERROR_NOT_ENOUGH_MEMORY: + throw new OutOfMemoryException(); + + default: + // We shouldn't get here... + throw new InvalidOperationException(SR.Format(SR.UnknownError_Num, lastError)); + } + } + + private static unsafe int NlsGetNormalizedLength(ReadOnlySpan source, NormalizationForm normalizationForm) + { + Debug.Assert(!GlobalizationMode.Invariant); + Debug.Assert(GlobalizationMode.UseNls); + Debug.Assert(!source.IsEmpty); + Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); + + // we depend on Win32 last error when calling NormalizeString + // NormalizeString pinvoke has SetLastError attribute property which will set the last error + // to 0 (ERROR_SUCCESS) before executing the calls. + + int realLength; + fixed (char* pInput = source) + { + realLength = Interop.Normaliz.NormalizeString(normalizationForm, pInput, source.Length, null, 0); + } + + int lastError = Marshal.GetLastPInvokeError(); + switch (lastError) + { + case Interop.Errors.ERROR_SUCCESS: + return realLength; + + case Interop.Errors.ERROR_INVALID_PARAMETER: + case Interop.Errors.ERROR_NO_UNICODE_TRANSLATION: + // Illegal code point or order found. Ie: FFFE or D800 D800, etc. + throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(source)); + + case Interop.Errors.ERROR_NOT_ENOUGH_MEMORY: + throw new OutOfMemoryException(); + + default: + // We shouldn't get here... + throw new InvalidOperationException(SR.Format(SR.UnknownError_Num, lastError)); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void CheckLastErrorAndThrowIfFailed(string inputName) + { + int lastError = Marshal.GetLastPInvokeError(); + switch (lastError) + { + case Interop.Errors.ERROR_SUCCESS: + break; + + case Interop.Errors.ERROR_INVALID_PARAMETER: + case Interop.Errors.ERROR_NO_UNICODE_TRANSLATION: + throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, inputName); + + case Interop.Errors.ERROR_NOT_ENOUGH_MEMORY: + throw new OutOfMemoryException(); + + default: + throw new InvalidOperationException(SR.Format(SR.UnknownError_Num, lastError)); + } + } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.cs index d120302a8aa8e7..52765221f7ca1b 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Diagnostics; +using System.Runtime.CompilerServices; using System.Text; namespace System.Globalization @@ -10,6 +11,8 @@ internal static partial class Normalization { internal static bool IsNormalized(string strInput, NormalizationForm normalizationForm) { + CheckNormalizationForm(normalizationForm); + if (GlobalizationMode.Invariant) { // In Invariant mode we assume all characters are normalized. @@ -22,8 +25,26 @@ internal static bool IsNormalized(string strInput, NormalizationForm normalizati IcuIsNormalized(strInput, normalizationForm); } + internal static bool IsNormalized(ReadOnlySpan source, NormalizationForm normalizationForm = NormalizationForm.FormC) + { + CheckNormalizationForm(normalizationForm); + + // In Invariant mode we assume all characters are normalized. + if (GlobalizationMode.Invariant || source.IsEmpty || Ascii.IsValid(source)) + { + // This is because we don't support any linguistic operation on the strings + return true; + } + + return GlobalizationMode.UseNls ? + NlsIsNormalized(source, normalizationForm) : + IcuIsNormalized(source, normalizationForm); + } + internal static string Normalize(string strInput, NormalizationForm normalizationForm) { + CheckNormalizationForm(normalizationForm); + if (GlobalizationMode.Invariant) { // In Invariant mode we assume all characters are normalized. @@ -35,5 +56,63 @@ internal static string Normalize(string strInput, NormalizationForm normalizatio NlsNormalize(strInput, normalizationForm) : IcuNormalize(strInput, normalizationForm); } + + internal static bool TryNormalize(ReadOnlySpan source, Span destination, out int charsWritten, NormalizationForm normalizationForm = NormalizationForm.FormC) + { + CheckNormalizationForm(normalizationForm); + + if (GlobalizationMode.Invariant || source.IsEmpty) + { + // In Invariant mode we assume all characters are normalized. + // This is because we don't support any linguistic operation on the strings + charsWritten = 0; + return true; + } + + if (Ascii.IsValid(source)) + { + if (destination.Length < source.Length) + { + charsWritten = 0; + return false; + } + + source.CopyTo(destination); + charsWritten = source.Length; + return true; + } + + return GlobalizationMode.UseNls ? + NlsTryNormalize(source, destination, out charsWritten, normalizationForm) : + IcuTryNormalize(source, destination, out charsWritten, normalizationForm); + } + + internal static int GetNormalizedLength(this ReadOnlySpan source, NormalizationForm normalizationForm = NormalizationForm.FormC) + { + CheckNormalizationForm(normalizationForm); + + if (GlobalizationMode.Invariant || source.IsEmpty || Ascii.IsValid(source)) + { + // In Invariant mode we assume all characters are normalized. + // This is because we don't support any linguistic operation on the strings + return source.Length; + } + + return GlobalizationMode.UseNls ? + NlsGetNormalizedLength(source, normalizationForm) : + IcuGetNormalizedLength(source, normalizationForm); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void CheckNormalizationForm(NormalizationForm normalizationForm) + { + if (normalizationForm != NormalizationForm.FormC && + normalizationForm != NormalizationForm.FormD && + normalizationForm != NormalizationForm.FormKC && + normalizationForm != NormalizationForm.FormKD) + { + throw new ArgumentException(SR.Argument_InvalidNormalizationForm, nameof(normalizationForm)); + } + } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/StringNormalizationExtensions.cs b/src/libraries/System.Private.CoreLib/src/System/StringNormalizationExtensions.cs index 712752626fe549..3adb8f4e5044b2 100644 --- a/src/libraries/System.Private.CoreLib/src/System/StringNormalizationExtensions.cs +++ b/src/libraries/System.Private.CoreLib/src/System/StringNormalizationExtensions.cs @@ -5,13 +5,27 @@ namespace System { + /// + /// Extensions for string normalization. + /// public static partial class StringNormalizationExtensions { + /// + /// Determines whether the specified string is in a normalized . + /// + /// The string to check. + /// if the specified string is in a normalized form; otherwise, . public static bool IsNormalized(this string strInput) { return IsNormalized(strInput, NormalizationForm.FormC); } + /// + /// Determines whether the specified string is in a normalized form. + /// + /// The string to check. + /// The normalization form to use. + /// if the specified string is in a normalized form; otherwise, . public static bool IsNormalized(this string strInput, NormalizationForm normalizationForm) { ArgumentNullException.ThrowIfNull(strInput); @@ -19,17 +33,60 @@ public static bool IsNormalized(this string strInput, NormalizationForm normaliz return strInput.IsNormalized(normalizationForm); } + /// + /// Determines whether the specified span of characters is in a normalized form. + /// + /// The span of characters to check. + /// The normalization form to use. + /// if the specified span of characters is in a normalized form; otherwise, . + /// The specified character span contains an invalid code point or the normalization form is invalid. + public static bool IsNormalized(this ReadOnlySpan source, NormalizationForm normalizationForm = NormalizationForm.FormC) => + System.Globalization.Normalization.IsNormalized(source, normalizationForm); + + /// + /// Normalizes the specified string to the . + /// + /// The string to normalize. + /// The normalized string in . public static string Normalize(this string strInput) { // Default to Form C return Normalize(strInput, NormalizationForm.FormC); } + /// + /// Normalizes the specified string to the specified normalization form. + /// + /// The string to normalize. + /// The normalization form to use. + /// The normalized string in the specified normalization form. public static string Normalize(this string strInput, NormalizationForm normalizationForm) { ArgumentNullException.ThrowIfNull(strInput); return strInput.Normalize(normalizationForm); } + + /// + /// Normalizes the specified span of characters to the specified normalization form. + /// + /// The span of characters to normalize. + /// The buffer to write the normalized characters to. + /// When this method returns, contains the number of characters written to . + /// The normalization form to use. + /// if the specified span of characters was successfully normalized; otherwise, . + /// The specified character span contains an invalid code point or the normalization form is invalid. + public static bool TryNormalize(this ReadOnlySpan source, Span destination, out int charsWritten, NormalizationForm normalizationForm = NormalizationForm.FormC) => + System.Globalization.Normalization.TryNormalize(source, destination, out charsWritten, normalizationForm); + + /// + /// Gets the estimated length of the normalized form of the specified string in the . +/// + /// The character span to get the estimated length of the normalized form. + /// The normalization form to use. + /// The estimated length of the normalized form of the specified string. + /// The specified character span contains an invalid code point or the normalization form is invalid. + public static int GetNormalizedLength(this ReadOnlySpan source, NormalizationForm normalizationForm = NormalizationForm.FormC) => + System.Globalization.Normalization.GetNormalizedLength(source, normalizationForm); } } diff --git a/src/libraries/System.Runtime/ref/System.Runtime.cs b/src/libraries/System.Runtime/ref/System.Runtime.cs index 0d79bc4c54e23b..da44791d9607cf 100644 --- a/src/libraries/System.Runtime/ref/System.Runtime.cs +++ b/src/libraries/System.Runtime/ref/System.Runtime.cs @@ -5819,8 +5819,11 @@ public static partial class StringNormalizationExtensions { public static bool IsNormalized(this string strInput) { throw null; } public static bool IsNormalized(this string strInput, System.Text.NormalizationForm normalizationForm) { throw null; } + public static bool IsNormalized(this ReadOnlySpan source, System.Text.NormalizationForm normalizationForm = System.Text.NormalizationForm.FormC) { throw null; } public static string Normalize(this string strInput) { throw null; } + public static bool TryNormalize(this ReadOnlySpan source, Span destination, out int charsWritten, System.Text.NormalizationForm normalizationForm = System.Text.NormalizationForm.FormC) { throw null; } public static string Normalize(this string strInput, System.Text.NormalizationForm normalizationForm) { throw null; } + public static int GetNormalizedLength(this ReadOnlySpan source, System.Text.NormalizationForm normalizationForm = System.Text.NormalizationForm.FormC) { throw null; } } [System.FlagsAttribute] public enum StringSplitOptions diff --git a/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/NormalizationAll.cs b/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/NormalizationAll.cs index 703988c1cffb39..bedd07baf4637a 100644 --- a/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/NormalizationAll.cs +++ b/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/NormalizationAll.cs @@ -93,6 +93,18 @@ private static void VerifyConformanceInvariant(NormalizationForm normForm, strin string normalized4 = c4.Normalize(normForm); string normalized5 = c5.Normalize(normForm); + Span normalizedSpan1 = new char[normalized1.Length]; + Span normalizedSpan2 = new char[normalized2.Length]; + Span normalizedSpan3 = new char[normalized3.Length]; + Span normalizedSpan4 = new char[normalized4.Length]; + Span normalizedSpan5 = new char[normalized5.Length]; + + Assert.True(c1.AsSpan().TryNormalize(normalizedSpan1, out int charsWritten1, normForm), $"'{c1}' is not normalized with form {normForm}"); + Assert.True(c2.AsSpan().TryNormalize(normalizedSpan2, out int charsWritten2, normForm), $"'{c2}' is not normalized with form {normForm}"); + Assert.True(c3.AsSpan().TryNormalize(normalizedSpan3, out int charsWritten3, normForm), $"'{c3}' is not normalized with form {normForm}"); + Assert.True(c4.AsSpan().TryNormalize(normalizedSpan4, out int charsWritten4, normForm), $"'{c4}' is not normalized with form {normForm}"); + Assert.True(c5.AsSpan().TryNormalize(normalizedSpan5, out int charsWritten5, normForm), $"'{c5}' is not normalized with form {normForm}"); + switch (normForm) { case NormalizationForm.FormC: @@ -101,15 +113,24 @@ private static void VerifyConformanceInvariant(NormalizationForm normForm, strin AssertEqualsForm(c2, normalized2); AssertEqualsForm(c2, normalized3); + AssertEqualsForm(c2, normalizedSpan1.Slice(0, charsWritten1).ToString()); + AssertEqualsForm(c2, normalizedSpan2.Slice(0, charsWritten2).ToString()); + AssertEqualsForm(c2, normalizedSpan3.Slice(0, charsWritten3).ToString()); + // c4 == NFC(c4) == NFC(c5) AssertEqualsForm(c4, normalized4); AssertEqualsForm(c4, normalized5); + AssertEqualsForm(c4, normalizedSpan4.Slice(0, charsWritten4).ToString()); + AssertEqualsForm(c4, normalizedSpan5.Slice(0, charsWritten5).ToString()); + // c2 is normalized to Form C Assert.True(c2.IsNormalized(normForm), $"'{c2}' is marked as not normalized with form {normForm}"); + Assert.True(c2.AsSpan().IsNormalized(normForm), $"'{c2}' span is marked as not normalized with form {normForm}"); // c4 is normalized to Form C Assert.True(c4.IsNormalized(normForm), $"'{c4}' is marked as not normalized with form {normForm}"); + Assert.True(c4.AsSpan().IsNormalized(normForm), $"'{c4}' span is marked as not normalized with form {normForm}"); break; case NormalizationForm.FormD: @@ -118,15 +139,24 @@ private static void VerifyConformanceInvariant(NormalizationForm normForm, strin AssertEqualsForm(c3, normalized2); AssertEqualsForm(c3, normalized3); + AssertEqualsForm(c3, normalizedSpan1.Slice(0, charsWritten1).ToString()); + AssertEqualsForm(c3, normalizedSpan2.Slice(0, charsWritten2).ToString()); + AssertEqualsForm(c3, normalizedSpan3.Slice(0, charsWritten3).ToString()); + // c5 == NFD(c4) == NFD(c5) AssertEqualsForm(c5, normalized4); AssertEqualsForm(c5, normalized5); + AssertEqualsForm(c5, normalizedSpan4.Slice(0, charsWritten4).ToString()); + AssertEqualsForm(c5, normalizedSpan5.Slice(0, charsWritten5).ToString()); + // c3 is normalized to Form D Assert.True(c3.IsNormalized(normForm), $"'{c3}' is marked as not normalized with form {normForm}"); + Assert.True(c3.AsSpan().IsNormalized(normForm), $"'{c3}' span is marked as not normalized with form {normForm}"); // c5 is normalized to Form D Assert.True(c5.IsNormalized(normForm), $"'{c5}' is marked as not normalized with form {normForm}"); + Assert.True(c5.AsSpan().IsNormalized(normForm), $"'{c5}' span is marked as not normalized with form {normForm}"); break; case NormalizationForm.FormKC: @@ -138,8 +168,15 @@ private static void VerifyConformanceInvariant(NormalizationForm normForm, strin AssertEqualsForm(c4, normalized4); AssertEqualsForm(c4, normalized5); + AssertEqualsForm(c4, normalizedSpan1.Slice(0, charsWritten1).ToString()); + AssertEqualsForm(c4, normalizedSpan2.Slice(0, charsWritten2).ToString()); + AssertEqualsForm(c4, normalizedSpan3.Slice(0, charsWritten3).ToString()); + AssertEqualsForm(c4, normalizedSpan4.Slice(0, charsWritten4).ToString()); + AssertEqualsForm(c4, normalizedSpan5.Slice(0, charsWritten5).ToString()); + // c4 is normalized to Form KC Assert.True(c4.IsNormalized(normForm), $"'{c4}' is marked as not normalized with form {normForm}"); + Assert.True(c4.AsSpan().IsNormalized(normForm), $"'{c4}' span is marked as not normalized with form {normForm}"); break; case NormalizationForm.FormKD: @@ -151,8 +188,15 @@ private static void VerifyConformanceInvariant(NormalizationForm normForm, strin AssertEqualsForm(c5, normalized4); AssertEqualsForm(c5, normalized5); + AssertEqualsForm(c5, normalizedSpan1.Slice(0, charsWritten1).ToString()); + AssertEqualsForm(c5, normalizedSpan2.Slice(0, charsWritten2).ToString()); + AssertEqualsForm(c5, normalizedSpan3.Slice(0, charsWritten3).ToString()); + AssertEqualsForm(c5, normalizedSpan4.Slice(0, charsWritten4).ToString()); + AssertEqualsForm(c5, normalizedSpan5.Slice(0, charsWritten5).ToString()); + // c5 is normalized to Form KD Assert.True(c5.IsNormalized(normForm), $"'{c5}' is marked as not normalized with form {normForm}"); + Assert.True(c5.AsSpan().IsNormalized(normForm), $"'{c5}' span is marked as not normalized with form {normForm}"); break; } } diff --git a/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/StringNormalizationTests.cs b/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/StringNormalizationTests.cs index 1b70a79b6ae6de..165c5d02ea0c96 100644 --- a/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/StringNormalizationTests.cs +++ b/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/StringNormalizationTests.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. using System; +using System.Linq; using System.Text; using Xunit; using System.Collections.Generic; @@ -21,16 +22,23 @@ public void IsNormalized(string value, NormalizationForm normalizationForm, bool if (normalizationForm == NormalizationForm.FormC) { Assert.Equal(expected, value.IsNormalized()); + Assert.Equal(expected, value.AsSpan().IsNormalized()); } Assert.Equal(expected, value.IsNormalized(normalizationForm)); + Assert.Equal(expected, value.AsSpan().IsNormalized(normalizationForm)); } [Fact] public void IsNormalized_Invalid() { Assert.Throws(() => "\uFB01".IsNormalized((NormalizationForm)10)); + Assert.Throws(() => "\uFB01".AsSpan().IsNormalized((NormalizationForm)10)); + AssertExtensions.Throws("strInput", () => "\uFFFE".IsNormalized()); // Invalid codepoint + AssertExtensions.Throws("source", () => "\uFFFE".AsSpan().IsNormalized()); // Invalid codepoint + AssertExtensions.Throws("strInput", () => "\uD800\uD800".IsNormalized()); // Invalid surrogate pair + AssertExtensions.Throws("source", () => "\uD800\uD800".AsSpan().IsNormalized()); // Invalid surrogate pair } [Fact] @@ -63,20 +71,61 @@ public static IEnumerable NormalizeTestData() [MemberData(nameof(NormalizeTestData))] public void Normalize(string value, NormalizationForm normalizationForm, string expected) { + Span destination = new char[expected.Length + 1]; // NLS sometimes need extra character in the buffer mostly if need to insert the null terminator + int charsWritten; + if (normalizationForm == NormalizationForm.FormC) { Assert.Equal(expected, value.Normalize()); + + Assert.True(value.AsSpan().TryNormalize(destination, out charsWritten)); + Assert.Equal(expected, destination.Slice(0, charsWritten).ToString()); + + if (PlatformDetection.IsNlsGlobalization) + { + // NLS return estimated normalized length that is enough to hold the result but doesn't return the exact length + Assert.True(expected.Length <= value.GetNormalizedLength(), $"Expected: {expected.Length}, Actual: {value.GetNormalizedLength()}"); + } + else + { + // ICU returns the exact normalized length + Assert.Equal(expected.Length, value.AsSpan().GetNormalizedLength()); + } } + Assert.Equal(expected, value.Normalize(normalizationForm)); + + if (expected.Length > 0) + { + Assert.False(value.AsSpan().TryNormalize(destination.Slice(0, expected.Length - 1), out charsWritten, normalizationForm), $"Trying to normalize '{value}' to a buffer of length {expected.Length - 1} succeeded!"); + } + + Assert.True(value.AsSpan().TryNormalize(destination, out charsWritten, normalizationForm), $"Failed to normalize '{value}' to a buffer of length {destination.Length}"); + Assert.Equal(expected, destination.Slice(0, charsWritten).ToString()); + if (PlatformDetection.IsNlsGlobalization) + { + // NLS return estimated normalized length that is enough to hold the result but doesn't return the exact length + Assert.True(expected.Length <= value.AsSpan().GetNormalizedLength(normalizationForm), $"Expected: {expected.Length}, Actual: {value.AsSpan().GetNormalizedLength(normalizationForm)}"); + } + else + { + // ICU returns the exact normalized length + Assert.Equal(expected.Length, value.AsSpan().GetNormalizedLength(normalizationForm)); + } } [Fact] public void Normalize_Invalid() { + char[] destination = new char[100]; Assert.Throws(() => "\uFB01".Normalize((NormalizationForm)7)); + Assert.Throws(() => "\uFB01".AsSpan().TryNormalize(destination.AsSpan(), out int charsWritten, (NormalizationForm)7)); AssertExtensions.Throws("strInput", () => "\uFFFE".Normalize()); // Invalid codepoint + AssertExtensions.Throws("source", () => "\uFFFE".AsSpan().TryNormalize(destination.AsSpan(), out int charsWritten)); // Invalid codepoint + AssertExtensions.Throws("strInput", () => "\uD800\uD800".Normalize()); // Invalid surrogate pair + AssertExtensions.Throws("source", () => "\uD800\uD800".AsSpan().TryNormalize(destination, out int charsWritten)); // Invalid surrogate pair } [Fact] From d48e358e67b97d16a7615f56039c236d98535171 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Fri, 6 Dec 2024 15:49:11 -0800 Subject: [PATCH 2/4] Address the feedback --- .../src/Resources/Strings.resx | 57 ++++++++++--------- .../System/Globalization/Normalization.Icu.cs | 33 +---------- .../System/Globalization/Normalization.Nls.cs | 21 ------- .../src/System/Globalization/Normalization.cs | 36 ++++-------- .../Normalization/StringNormalizationTests.cs | 4 +- 5 files changed, 43 insertions(+), 108 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/Resources/Strings.resx b/src/libraries/System.Private.CoreLib/src/Resources/Strings.resx index e7dadba40a5395..3e91b3ecb8340f 100644 --- a/src/libraries/System.Private.CoreLib/src/Resources/Strings.resx +++ b/src/libraries/System.Private.CoreLib/src/Resources/Strings.resx @@ -1,17 +1,17 @@  - @@ -1315,6 +1315,9 @@ Invalid or unsupported normalization form. + + `NormalizationForm.FormKC` and `NormalizationForm.FormKD` are not supported in browser environments or WebAssembly. + An undefined NumberStyles value is being used. diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs index e657d61289ce29..30701156cf2da0 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs @@ -10,37 +10,6 @@ namespace System.Globalization { internal static partial class Normalization { - private static unsafe bool IcuIsNormalized(string strInput, NormalizationForm normalizationForm) - { - Debug.Assert(!GlobalizationMode.Invariant); - Debug.Assert(!GlobalizationMode.UseNls); - Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); - - ValidateArguments(strInput, normalizationForm); - - int ret; - fixed (char* pInput = strInput) - { -#if TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS - if (GlobalizationMode.Hybrid) - { - ret = Interop.Globalization.IsNormalizedNative(normalizationForm, pInput, strInput.Length); - } - else -#endif - { - ret = Interop.Globalization.IsNormalized(normalizationForm, pInput, strInput.Length); - } - } - - if (ret == -1) - { - throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput)); - } - - return ret == 1; - } - private static unsafe bool IcuIsNormalized(ReadOnlySpan source, NormalizationForm normalizationForm) { Debug.Assert(!GlobalizationMode.Invariant); @@ -231,7 +200,7 @@ private static void ValidateArguments(ReadOnlySpan strInput, Normalization if ((OperatingSystem.IsBrowser() || OperatingSystem.IsWasi()) && (normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD)) { // Browser's ICU doesn't contain data needed for FormKC and FormKD - throw new PlatformNotSupportedException(); + throw new PlatformNotSupportedException(SR.Argument_UnsupportedNormalizationFormInBrowser); } if (HasInvalidUnicodeSequence(strInput)) diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Nls.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Nls.cs index 3350ee00c21d66..2e63cd5daa5b81 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Nls.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Nls.cs @@ -32,27 +32,6 @@ private static unsafe bool NlsIsNormalized(ReadOnlySpan source, Normalizat return result != Interop.BOOL.FALSE; } - private static unsafe bool NlsIsNormalized(string strInput, NormalizationForm normalizationForm) - { - Debug.Assert(!GlobalizationMode.Invariant); - Debug.Assert(GlobalizationMode.UseNls); - Debug.Assert(strInput != null); - Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD); - - Interop.BOOL result; - fixed (char* pInput = strInput) - { - result = Interop.Normaliz.IsNormalizedString(normalizationForm, pInput, strInput.Length); - } - - // The only way to know if IsNormalizedString failed is through checking the Win32 last error - // IsNormalizedString pinvoke has SetLastError attribute property which will set the last error - // to 0 (ERROR_SUCCESS) before executing the calls. - CheckLastErrorAndThrowIfFailed(nameof(strInput)); - - return result != Interop.BOOL.FALSE; - } - private static unsafe string NlsNormalize(string strInput, NormalizationForm normalizationForm) { Debug.Assert(!GlobalizationMode.Invariant); diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.cs index 52765221f7ca1b..647e097601cd71 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.cs @@ -9,22 +9,6 @@ namespace System.Globalization { internal static partial class Normalization { - internal static bool IsNormalized(string strInput, NormalizationForm normalizationForm) - { - CheckNormalizationForm(normalizationForm); - - if (GlobalizationMode.Invariant) - { - // In Invariant mode we assume all characters are normalized. - // This is because we don't support any linguistic operation on the strings - return true; - } - - return GlobalizationMode.UseNls ? - NlsIsNormalized(strInput, normalizationForm) : - IcuIsNormalized(strInput, normalizationForm); - } - internal static bool IsNormalized(ReadOnlySpan source, NormalizationForm normalizationForm = NormalizationForm.FormC) { CheckNormalizationForm(normalizationForm); @@ -61,25 +45,25 @@ internal static bool TryNormalize(ReadOnlySpan source, Span destinat { CheckNormalizationForm(normalizationForm); - if (GlobalizationMode.Invariant || source.IsEmpty) + if (source.IsEmpty) { - // In Invariant mode we assume all characters are normalized. - // This is because we don't support any linguistic operation on the strings charsWritten = 0; return true; } - if (Ascii.IsValid(source)) + if (GlobalizationMode.Invariant || Ascii.IsValid(source)) { - if (destination.Length < source.Length) + // In Invariant mode we assume all characters are normalized. + // This is because we don't support any linguistic operation on the strings + + if (source.TryCopyTo(destination)) { - charsWritten = 0; - return false; + charsWritten = source.Length; + return true; } - source.CopyTo(destination); - charsWritten = source.Length; - return true; + charsWritten = 0; + return false; } return GlobalizationMode.UseNls ? diff --git a/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/StringNormalizationTests.cs b/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/StringNormalizationTests.cs index 165c5d02ea0c96..bba0ddb088d462 100644 --- a/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/StringNormalizationTests.cs +++ b/src/libraries/System.Runtime/tests/System.Globalization.Extensions.Tests/Normalization/StringNormalizationTests.cs @@ -34,10 +34,10 @@ public void IsNormalized_Invalid() Assert.Throws(() => "\uFB01".IsNormalized((NormalizationForm)10)); Assert.Throws(() => "\uFB01".AsSpan().IsNormalized((NormalizationForm)10)); - AssertExtensions.Throws("strInput", () => "\uFFFE".IsNormalized()); // Invalid codepoint + AssertExtensions.Throws("source", () => "\uFFFE".IsNormalized()); // Invalid codepoint AssertExtensions.Throws("source", () => "\uFFFE".AsSpan().IsNormalized()); // Invalid codepoint - AssertExtensions.Throws("strInput", () => "\uD800\uD800".IsNormalized()); // Invalid surrogate pair + AssertExtensions.Throws("source", () => "\uD800\uD800".IsNormalized()); // Invalid surrogate pair AssertExtensions.Throws("source", () => "\uD800\uD800".AsSpan().IsNormalized()); // Invalid surrogate pair } From c6147e382e93286fbb3ddda790ad93feeeb71b87 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed <10833894+tarekgh@users.noreply.github.com> Date: Sun, 8 Dec 2024 12:48:24 -0800 Subject: [PATCH 3/4] Update src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Günther Foidl --- .../src/System/Globalization/Normalization.Icu.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs index 30701156cf2da0..076629fa32b841 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs @@ -148,7 +148,7 @@ private static unsafe bool IcuTryNormalize(ReadOnlySpan source, Span } } - if (realLen == -1) + if (realLen < 0) { throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(source)); } From cc8cace203ff3d64c6f52a4c5a1db4ab7e938630 Mon Sep 17 00:00:00 2001 From: Eric StJohn Date: Mon, 9 Dec 2024 15:31:25 -0800 Subject: [PATCH 4/4] Fix comment indent --- .../src/System/StringNormalizationExtensions.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/StringNormalizationExtensions.cs b/src/libraries/System.Private.CoreLib/src/System/StringNormalizationExtensions.cs index 3adb8f4e5044b2..7a94a853581aa2 100644 --- a/src/libraries/System.Private.CoreLib/src/System/StringNormalizationExtensions.cs +++ b/src/libraries/System.Private.CoreLib/src/System/StringNormalizationExtensions.cs @@ -81,7 +81,7 @@ public static bool TryNormalize(this ReadOnlySpan source, Span desti /// /// Gets the estimated length of the normalized form of the specified string in the . -/// + /// /// The character span to get the estimated length of the normalized form. /// The normalization form to use. /// The estimated length of the normalized form of the specified string.