diff --git a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs index 0ca72c150f..3bf6647d70 100644 --- a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs @@ -176,7 +176,7 @@ private sealed class OutPipelineColumn : Scalar { public readonly Scalar Input; - public OutPipelineColumn(Scalar input, TextNormalizingEstimator.CaseNormalizationMode textCase, bool keepDiacritics, bool keepPunctuations, bool keepNumbers) + public OutPipelineColumn(Scalar input, TextNormalizingEstimator.CaseMode textCase, bool keepDiacritics, bool keepPunctuations, bool keepNumbers) : base(new Reconciler(textCase, keepDiacritics, keepPunctuations, keepNumbers), input) { Input = input; @@ -185,12 +185,12 @@ public OutPipelineColumn(Scalar input, TextNormalizingEstimator.CaseNorm private sealed class Reconciler : EstimatorReconciler, IEquatable { - private readonly TextNormalizingEstimator.CaseNormalizationMode _textCase; + private readonly TextNormalizingEstimator.CaseMode _textCase; private readonly bool _keepDiacritics; private readonly bool _keepPunctuations; private readonly bool _keepNumbers; - public Reconciler(TextNormalizingEstimator.CaseNormalizationMode textCase, bool keepDiacritics, bool keepPunctuations, bool keepNumbers) + public Reconciler(TextNormalizingEstimator.CaseMode textCase, bool keepDiacritics, bool keepPunctuations, bool keepNumbers) { _textCase = textCase; _keepDiacritics = keepDiacritics; @@ -227,15 +227,15 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Normalizes input text by changing case, removing diacritical marks, punctuation marks and/or numbers. /// /// The column to apply to. - /// Casing text using the rules of the invariant culture. + /// Casing text using the rules of the invariant culture. /// Whether to keep diacritical marks or remove them. /// Whether to keep punctuation marks or remove them. /// Whether to keep numbers or remove them. public static Scalar NormalizeText(this Scalar input, - TextNormalizingEstimator.CaseNormalizationMode textCase = TextNormalizingEstimator.CaseNormalizationMode.Lower, + TextNormalizingEstimator.CaseMode caseMode = TextNormalizingEstimator.CaseMode.Lower, bool keepDiacritics = false, bool keepPunctuations = true, - bool keepNumbers = true) => new OutPipelineColumn(input, textCase, keepDiacritics, keepPunctuations, keepNumbers); + bool keepNumbers = true) => new OutPipelineColumn(input, caseMode, keepDiacritics, keepPunctuations, keepNumbers); } /// diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index e1b6597e57..bcfa9801e3 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -83,19 +83,19 @@ public static TokenizingByCharactersEstimator TokenizeCharacters(this Transforms /// The text-related transform's catalog. /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. - /// Casing text using the rules of the invariant culture. + /// Casing text using the rules of the invariant culture. /// Whether to keep diacritical marks or remove them. /// Whether to keep punctuation marks or remove them. /// Whether to keep numbers or remove them. public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null, - TextNormalizingEstimator.CaseNormalizationMode textCase = TextNormalizeDefaults.TextCase, + TextNormalizingEstimator.CaseMode caseMode = TextNormalizeDefaults.Mode, bool keepDiacritics = TextNormalizeDefaults.KeepDiacritics, bool keepPunctuations = TextNormalizeDefaults.KeepPunctuations, bool keepNumbers = TextNormalizeDefaults.KeepNumbers) => new TextNormalizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnName, textCase, keepDiacritics, keepPunctuations, keepNumbers); + outputColumnName, inputColumnName, caseMode, keepDiacritics, keepPunctuations, keepNumbers); /// /// The text-related transform's catalog. diff --git a/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs b/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs index 7e30b62165..72c3555b90 100644 --- a/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs +++ b/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs @@ -24,7 +24,7 @@ namespace Microsoft.ML.Transforms.Text { - using CaseNormalizationMode = TextNormalizingEstimator.CaseNormalizationMode; + using CaseMode = TextNormalizingEstimator.CaseMode; // A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are counts // of (word or character) ngrams in a given text. It offers ngram hashing (finding the ngram token string name to feature // integer index mapping through hashing) as an option. @@ -100,7 +100,7 @@ internal sealed class Arguments : TransformInputBase public bool UsePredefinedStopWordRemover = false; [Argument(ArgumentType.AtMostOnce, HelpText = "Casing text using the rules of the invariant culture.", ShortName = "case", SortOrder = 5)] - public CaseNormalizationMode TextCase = TextNormalizingEstimator.Defaults.TextCase; + public CaseMode TextCase = TextNormalizingEstimator.Defaults.Mode; [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to keep diacritical marks or remove them.", ShortName = "diac", SortOrder = 6)] public bool KeepDiacritics = TextNormalizingEstimator.Defaults.KeepDiacritics; @@ -142,7 +142,7 @@ public sealed class Options /// /// Casing used for the text. /// - public CaseNormalizationMode TextCase { get; set; } = CaseNormalizationMode.Lower; + public CaseMode TextCase { get; set; } = CaseMode.Lower; /// /// Whether to keep diacritical marks or remove them. /// @@ -203,7 +203,7 @@ private sealed class TransformApplierParams public readonly NormFunction VectorNormalizer; public readonly Language Language; public readonly bool UsePredefinedStopWordRemover; - public readonly CaseNormalizationMode TextCase; + public readonly CaseMode TextCase; public readonly bool KeepDiacritics; public readonly bool KeepPunctuations; public readonly bool KeepNumbers; @@ -241,7 +241,7 @@ public bool NeedsNormalizeTransform get { return - TextCase != CaseNormalizationMode.None || + TextCase != CaseMode.None || !KeepDiacritics || !KeepPunctuations || !KeepNumbers; @@ -275,7 +275,7 @@ public TransformApplierParams(TextFeaturizingEstimator parent) { var host = parent._host; host.Check(Enum.IsDefined(typeof(Language), parent.OptionalSettings.TextLanguage)); - host.Check(Enum.IsDefined(typeof(CaseNormalizationMode), parent.OptionalSettings.TextCase)); + host.Check(Enum.IsDefined(typeof(CaseMode), parent.OptionalSettings.TextCase)); WordExtractorFactory = parent._wordFeatureExtractor?.CreateComponent(host, parent._dictionary); CharExtractorFactory = parent._charFeatureExtractor?.CreateComponent(host, parent._dictionary); VectorNormalizer = parent.OptionalSettings.VectorNormalizer; diff --git a/src/Microsoft.ML.Transforms/Text/TextNormalizing.cs b/src/Microsoft.ML.Transforms/Text/TextNormalizing.cs index e9c28e04af..c943f3451f 100644 --- a/src/Microsoft.ML.Transforms/Text/TextNormalizing.cs +++ b/src/Microsoft.ML.Transforms/Text/TextNormalizing.cs @@ -58,7 +58,7 @@ internal sealed class Options public Column[] Columns; [Argument(ArgumentType.AtMostOnce, HelpText = "Casing text using the rules of the invariant culture.", ShortName = "case", SortOrder = 1)] - public TextNormalizingEstimator.CaseNormalizationMode TextCase = TextNormalizingEstimator.Defaults.TextCase; + public TextNormalizingEstimator.CaseMode TextCase = TextNormalizingEstimator.Defaults.Mode; [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to keep diacritical marks or remove them.", ShortName = "diac", SortOrder = 1)] @@ -92,22 +92,22 @@ private static VersionInfo GetVersionInfo() /// /// The names of the output and input column pairs on which the transformation is applied. /// - public IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly(); + internal IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly(); - private readonly TextNormalizingEstimator.CaseNormalizationMode _textCase; + private readonly TextNormalizingEstimator.CaseMode _caseMode; private readonly bool _keepDiacritics; private readonly bool _keepPunctuations; private readonly bool _keepNumbers; internal TextNormalizingTransformer(IHostEnvironment env, - TextNormalizingEstimator.CaseNormalizationMode textCase = TextNormalizingEstimator.Defaults.TextCase, + TextNormalizingEstimator.CaseMode caseMode = TextNormalizingEstimator.Defaults.Mode, bool keepDiacritics = TextNormalizingEstimator.Defaults.KeepDiacritics, bool keepPunctuations = TextNormalizingEstimator.Defaults.KeepPunctuations, bool keepNumbers = TextNormalizingEstimator.Defaults.KeepNumbers, params (string outputColumnName, string inputColumnName)[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(RegistrationName), columns) { - _textCase = textCase; + _caseMode = caseMode; _keepDiacritics = keepDiacritics; _keepPunctuations = keepPunctuations; _keepNumbers = keepNumbers; @@ -135,7 +135,7 @@ private protected override void SaveModel(ModelSaveContext ctx) // bool: whether to keep numbers SaveColumns(ctx); - ctx.Writer.Write((byte)_textCase); + ctx.Writer.Write((byte)_caseMode); ctx.Writer.WriteBoolByte(_keepDiacritics); ctx.Writer.WriteBoolByte(_keepPunctuations); ctx.Writer.WriteBoolByte(_keepNumbers); @@ -161,8 +161,8 @@ private TextNormalizingTransformer(IHost host, ModelLoadContext ctx) // bool: whether to keep diacritics // bool: whether to keep punctuations // bool: whether to keep numbers - _textCase = (TextNormalizingEstimator.CaseNormalizationMode)ctx.Reader.ReadByte(); - host.CheckDecode(Enum.IsDefined(typeof(TextNormalizingEstimator.CaseNormalizationMode), _textCase)); + _caseMode = (TextNormalizingEstimator.CaseMode)ctx.Reader.ReadByte(); + host.CheckDecode(Enum.IsDefined(typeof(TextNormalizingEstimator.CaseMode), _caseMode)); _keepDiacritics = ctx.Reader.ReadBoolByte(); _keepPunctuations = ctx.Reader.ReadBoolByte(); @@ -232,34 +232,34 @@ protected override DataViewSchema.DetachedColumn[] GetOutputColumnsCore() // List of pairs of (letters combined with diacritics, the letters without diacritics) from Office NL team. private static readonly string[] _combinedDiacriticsPairs = { - // Latin letters combined with diacritics: - "ÀA", "ÁA", "ÂA", "ÃA", "ÄA", "ÅA", "ÇC", "ÈE", "ÉE", "ÊE", "ËE", "ÌI", "ÍI", "ÎI", "ÏI", "ÑN", - "ÒO", "ÓO", "ÔO", "ÕO", "ÖO", "ÙU", "ÚU", "ÛU", "ÜU", "ÝY", "àa", "áa", "âa", "ãa", "äa", "åa", - "çc", "èe", "ée", "êe", "ëe", "ìi", "íi", "îi", "ïi", "ñn", "òo", "óo", "ôo", "õo", "öo", "ùu", - "úu", "ûu", "üu", "ýy", "ÿy", "ĀA", "āa", "ĂA", "ăa", "ĄA", "ąa", "ĆC", "ćc", "ĈC", "ĉc", "ĊC", - "ċc", "ČC", "čc", "ĎD", "ďd", "ĒE", "ēe", "ĔE", "ĕe", "ĖE", "ėe", "ĘE", "ęe", "ĚE", "ěe", "ĜG", - "ĝg", "ĞG", "ğg", "ĠG", "ġg", "ĢG", "ģg", "ĤH", "ĥh", "ĨI", "ĩi", "ĪI", "īi", "ĬI", "ĭi", "ĮI", - "įi", "İI", "ĴJ", "ĵj", "ĶK", "ķk", "ĹL", "ĺl", "ĻL", "ļl", "ĽL", "ľl", "ŃN", "ńn", "ŅN", "ņn", - "ŇN", "ňn", "ŌO", "ōo", "ŎO", "ŏo", "ŐO", "őo", "ŔR", "ŕr", "ŖR", "ŗr", "ŘR", "řr", "ŚS", "śs", - "ŜS", "ŝs", "ŞS", "şs", "ŠS", "šs", "ŢT", "ţt", "ŤT", "ťt", "ŨU", "ũu", "ŪU", "ūu", "ŬU", "ŭu", - "ŮU", "ůu", "ŰU", "űu", "ŲU", "ųu", "ŴW", "ŵw", "ŶY", "ŷy", "ŸY", "ŹZ", "źz", "ŻZ", "żz", "ŽZ", - "žz", "ƠO", "ơo", "ƯU", "ưu", "ǍA", "ǎa", "ǏI", "ǐi", "ǑO", "ǒo", "ǓU", "ǔu", "ǕU", "ǖu", "ǗU", - "ǘu", "ǙU", "ǚu", "ǛU", "ǜu", "ǞA", "ǟa", "ǠA", "ǡa", "ǢÆ", "ǣæ", "ǦG", "ǧg", "ǨK", "ǩk", "ǪO", - "ǫo", "ǬO", "ǭo", "ǮƷ", "ǯʒ", "ǰj", "ǴG", "ǵg", "ǸN", "ǹn", "ǺA", "ǻa", "ǼÆ", "ǽæ", "ǾØ", "ǿø", - "ȀA", "ȁa", "ȂA", "ȃa", "ȄE", "ȅe", "ȆE", "ȇe", "ȈI", "ȉi", "ȊI", "ȋi", "ȌO", "ȍo", "ȎO", "ȏo", - "ȐR", "ȑr", "ȒR", "ȓr", "ȔU", "ȕu", "ȖU", "ȗu", "ȘS", "șs", "ȚT", "țt", "ȞH", "ȟh", "ȦA", "ȧa", - "ȨE", "ȩe", "ȪO", "ȫo", "ȬO", "ȭo", "ȮO", "ȯo", "ȰO", "ȱo", "ȲY", "ȳy", - - // Greek letters combined with diacritics: - "ΆΑ", "ΈΕ", "ΉΗ", "ΊΙ", "ΌΟ", "ΎΥ", "ΏΩ", "ΐι", "ΪΙ", "ΫΥ", "άα", "έε", "ήη", "ίι", "ΰυ", "ϊι", - "ϋυ", "όο", "ύυ", "ώω", "ϓϒ", "ϔϒ", - - // Cyrillic letters combined with diacritics: - "ЀЕ", "ЁЕ", "ЃГ", "ЇІ", "ЌК", "ЍИ", "ЎУ", "ЙИ", "йи", "ѐе", "ёе", "ѓг", "їі", "ќк", "ѝи", "ўу", - "ѶѴ", "ѷѵ", "ӁЖ", "ӂж", "ӐА", "ӑа", "ӒА", "ӓа", "ӖЕ", "ӗе", "ӚӘ", "ӛә", "ӜЖ", "ӝж", "ӞЗ", "ӟз", - "ӢИ", "ӣи", "ӤИ", "ӥи", "ӦО", "ӧо", "ӪӨ", "ӫө", "ӬЭ", "ӭэ", "ӮУ", "ӯу", "ӰУ", "ӱу", "ӲУ", "ӳу", - "ӴЧ", "ӵч", "ӸЫ", "ӹы" - }; + // Latin letters combined with diacritics: + "ÀA", "ÁA", "ÂA", "ÃA", "ÄA", "ÅA", "ÇC", "ÈE", "ÉE", "ÊE", "ËE", "ÌI", "ÍI", "ÎI", "ÏI", "ÑN", + "ÒO", "ÓO", "ÔO", "ÕO", "ÖO", "ÙU", "ÚU", "ÛU", "ÜU", "ÝY", "àa", "áa", "âa", "ãa", "äa", "åa", + "çc", "èe", "ée", "êe", "ëe", "ìi", "íi", "îi", "ïi", "ñn", "òo", "óo", "ôo", "õo", "öo", "ùu", + "úu", "ûu", "üu", "ýy", "ÿy", "ĀA", "āa", "ĂA", "ăa", "ĄA", "ąa", "ĆC", "ćc", "ĈC", "ĉc", "ĊC", + "ċc", "ČC", "čc", "ĎD", "ďd", "ĒE", "ēe", "ĔE", "ĕe", "ĖE", "ėe", "ĘE", "ęe", "ĚE", "ěe", "ĜG", + "ĝg", "ĞG", "ğg", "ĠG", "ġg", "ĢG", "ģg", "ĤH", "ĥh", "ĨI", "ĩi", "ĪI", "īi", "ĬI", "ĭi", "ĮI", + "įi", "İI", "ĴJ", "ĵj", "ĶK", "ķk", "ĹL", "ĺl", "ĻL", "ļl", "ĽL", "ľl", "ŃN", "ńn", "ŅN", "ņn", + "ŇN", "ňn", "ŌO", "ōo", "ŎO", "ŏo", "ŐO", "őo", "ŔR", "ŕr", "ŖR", "ŗr", "ŘR", "řr", "ŚS", "śs", + "ŜS", "ŝs", "ŞS", "şs", "ŠS", "šs", "ŢT", "ţt", "ŤT", "ťt", "ŨU", "ũu", "ŪU", "ūu", "ŬU", "ŭu", + "ŮU", "ůu", "ŰU", "űu", "ŲU", "ųu", "ŴW", "ŵw", "ŶY", "ŷy", "ŸY", "ŹZ", "źz", "ŻZ", "żz", "ŽZ", + "žz", "ƠO", "ơo", "ƯU", "ưu", "ǍA", "ǎa", "ǏI", "ǐi", "ǑO", "ǒo", "ǓU", "ǔu", "ǕU", "ǖu", "ǗU", + "ǘu", "ǙU", "ǚu", "ǛU", "ǜu", "ǞA", "ǟa", "ǠA", "ǡa", "ǢÆ", "ǣæ", "ǦG", "ǧg", "ǨK", "ǩk", "ǪO", + "ǫo", "ǬO", "ǭo", "ǮƷ", "ǯʒ", "ǰj", "ǴG", "ǵg", "ǸN", "ǹn", "ǺA", "ǻa", "ǼÆ", "ǽæ", "ǾØ", "ǿø", + "ȀA", "ȁa", "ȂA", "ȃa", "ȄE", "ȅe", "ȆE", "ȇe", "ȈI", "ȉi", "ȊI", "ȋi", "ȌO", "ȍo", "ȎO", "ȏo", + "ȐR", "ȑr", "ȒR", "ȓr", "ȔU", "ȕu", "ȖU", "ȗu", "ȘS", "șs", "ȚT", "țt", "ȞH", "ȟh", "ȦA", "ȧa", + "ȨE", "ȩe", "ȪO", "ȫo", "ȬO", "ȭo", "ȮO", "ȯo", "ȰO", "ȱo", "ȲY", "ȳy", + + // Greek letters combined with diacritics: + "ΆΑ", "ΈΕ", "ΉΗ", "ΊΙ", "ΌΟ", "ΎΥ", "ΏΩ", "ΐι", "ΪΙ", "ΫΥ", "άα", "έε", "ήη", "ίι", "ΰυ", "ϊι", + "ϋυ", "όο", "ύυ", "ώω", "ϓϒ", "ϔϒ", + + // Cyrillic letters combined with diacritics: + "ЀЕ", "ЁЕ", "ЃГ", "ЇІ", "ЌК", "ЍИ", "ЎУ", "ЙИ", "йи", "ѐе", "ёе", "ѓг", "їі", "ќк", "ѝи", "ўу", + "ѶѴ", "ѷѵ", "ӁЖ", "ӂж", "ӐА", "ӑа", "ӒА", "ӓа", "ӖЕ", "ӗе", "ӚӘ", "ӛә", "ӜЖ", "ӝж", "ӞЗ", "ӟз", + "ӢИ", "ӣи", "ӤИ", "ӥи", "ӦО", "ӧо", "ӪӨ", "ӫө", "ӬЭ", "ӭэ", "ӮУ", "ӯу", "ӰУ", "ӱу", "ӲУ", "ӳу", + "ӴЧ", "ӵч", "ӸЫ", "ӹы" + }; private static Dictionary CombinedDiacriticsMap { @@ -381,9 +381,9 @@ private void NormalizeSrc(in ReadOnlyMemory src, ref ReadOnlyMemory ch = CombinedDiacriticsMap[ch]; } - if (_parent._textCase == TextNormalizingEstimator.CaseNormalizationMode.Lower) + if (_parent._caseMode == TextNormalizingEstimator.CaseMode.Lower) ch = CharUtils.ToLowerInvariant(ch); - else if (_parent._textCase == TextNormalizingEstimator.CaseNormalizationMode.Upper) + else if (_parent._caseMode == TextNormalizingEstimator.CaseMode.Upper) ch = CharUtils.ToUpperInvariant(ch); if (ch != src.Span[i]) @@ -437,16 +437,25 @@ public sealed class TextNormalizingEstimator : TrivialEstimator /// Case normalization mode of text. This enumeration is serialized. /// - public enum CaseNormalizationMode + public enum CaseMode { + /// + /// Make the output characters lowercased. + /// Lower = 0, + /// + /// Make the output characters uppercased. + /// Upper = 1, + /// + /// Do not change the case of output characters. + /// None = 2 } internal static class Defaults { - public const CaseNormalizationMode TextCase = CaseNormalizationMode.Lower; + public const CaseMode Mode = CaseMode.Lower; public const bool KeepDiacritics = false; public const bool KeepPunctuations = true; public const bool KeepNumbers = true; @@ -464,18 +473,18 @@ internal static class Defaults /// The environment. /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. - /// Casing text using the rules of the invariant culture. + /// Casing text using the rules of the invariant culture. /// Whether to keep diacritical marks or remove them. /// Whether to keep punctuation marks or remove them. /// Whether to keep numbers or remove them. internal TextNormalizingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, - CaseNormalizationMode textCase = Defaults.TextCase, + CaseMode caseMode = Defaults.Mode, bool keepDiacritics = Defaults.KeepDiacritics, bool keepPunctuations = Defaults.KeepPunctuations, bool keepNumbers = Defaults.KeepNumbers) - : this(env, textCase, keepDiacritics, keepPunctuations, keepNumbers, (outputColumnName, inputColumnName ?? outputColumnName)) + : this(env, caseMode, keepDiacritics, keepPunctuations, keepNumbers, (outputColumnName, inputColumnName ?? outputColumnName)) { } @@ -484,19 +493,19 @@ internal TextNormalizingEstimator(IHostEnvironment env, /// and outputs new text as output columns. /// /// The environment. - /// Casing text using the rules of the invariant culture. + /// Casing text using the rules of the invariant culture. /// Whether to keep diacritical marks or remove them. /// Whether to keep punctuation marks or remove them. /// Whether to keep numbers or remove them. /// Pairs of columns to run the text normalization on. internal TextNormalizingEstimator(IHostEnvironment env, - CaseNormalizationMode textCase = Defaults.TextCase, + CaseMode caseMode = Defaults.Mode, bool keepDiacritics = Defaults.KeepDiacritics, bool keepPunctuations = Defaults.KeepPunctuations, bool keepNumbers = Defaults.KeepNumbers, params (string outputColumnName, string inputColumnName)[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(TextNormalizingEstimator)), - new TextNormalizingTransformer(env, textCase, keepDiacritics, keepPunctuations, keepNumbers, columns)) + new TextNormalizingTransformer(env, caseMode, keepDiacritics, keepPunctuations, keepNumbers, columns)) { } diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs index 34b9311235..f18a707999 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs @@ -825,7 +825,7 @@ public void TextNormalizeStatic() .Append(r => ( r.label, norm: r.text.NormalizeText(), - norm_Upper: r.text.NormalizeText(textCase: TextNormalizingEstimator.CaseNormalizationMode.Upper), + norm_Upper: r.text.NormalizeText(caseMode: TextNormalizingEstimator.CaseMode.Upper), norm_KeepDiacritics: r.text.NormalizeText(keepDiacritics: true), norm_NoPuctuations: r.text.NormalizeText(keepPunctuations: false), norm_NoNumbers: r.text.NormalizeText(keepNumbers: false))); diff --git a/test/Microsoft.ML.Tests/Transformers/TextNormalizer.cs b/test/Microsoft.ML.Tests/Transformers/TextNormalizer.cs index 2fa94c1cc9..bc685e5b58 100644 --- a/test/Microsoft.ML.Tests/Transformers/TextNormalizer.cs +++ b/test/Microsoft.ML.Tests/Transformers/TextNormalizer.cs @@ -56,7 +56,7 @@ public void TextNormalizerWorkout() dataView = reader.Load(dataSource).AsDynamic; var pipeVariations = new TextNormalizingEstimator(ML, columns: new[] { ("NormText", "text") }).Append( - new TextNormalizingEstimator(ML, textCase: TextNormalizingEstimator.CaseNormalizationMode.Upper, columns: new[] { ("UpperText", "text") })).Append( + new TextNormalizingEstimator(ML, caseMode: TextNormalizingEstimator.CaseMode.Upper, columns: new[] { ("UpperText", "text") })).Append( new TextNormalizingEstimator(ML, keepDiacritics: true, columns: new[] { ("WithDiacriticsText", "text") })).Append( new TextNormalizingEstimator(ML, keepNumbers: false, columns: new[] { ("NoNumberText", "text") })).Append( new TextNormalizingEstimator(ML, keepPunctuations: false, columns: new[] { ("NoPuncText", "text") }));