diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 3e9a2b5769..031dabe2c4 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -782,7 +782,7 @@ var pipeline = // NLP pipeline 4: word embeddings. .Append(mlContext.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage")) .Append(mlContext.Transforms.Text.ExtractWordEmbeddings("Embeddings", "TokenizedMessage", - WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter25D)); + WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)); // Let's train our pipeline, and then apply it to the same data. // Note that even on a small dataset of 70KB the pipeline above can take up to a minute to completely train. diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs index 9da2f086e6..63428fcdf9 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs @@ -61,8 +61,8 @@ public static void Example() // Let's apply pretrained word embedding model GloVeTwitter25D. // 25D means each word mapped into 25 dimensional space, basically each word represented by 25 float values. - var gloveWordEmbedding = ml.Transforms.Text.ExtractWordEmbeddings("GloveEmbeddings", "CleanWords", - WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter25D); + var gloveWordEmbedding = ml.Transforms.Text.ApplyWordEmbedding("GloveEmbeddings", "CleanWords", + WordEmbeddingEstimator.PretrainedModelKind.GloVeTwitter25D); // We also have option to apply custom word embedding models. // Let's first create one. @@ -81,7 +81,7 @@ public static void Example() file.WriteLine("best" + " " + string.Join(" ", 0f, 0f, 20f)); } // Now let's add custom embedding on top of same words. - var pipeline = gloveWordEmbedding.Append(ml.Transforms.Text.ExtractWordEmbeddings("CustomEmbeddings", @".\custommodel.txt", "CleanWords")); + var pipeline = gloveWordEmbedding.Append(ml.Transforms.Text.ApplyWordEmbedding("CustomEmbeddings", @".\custommodel.txt", "CleanWords")); // And do all required transformations. var embeddingDataview = pipeline.Fit(wordsDataview).Transform(wordsDataview); diff --git a/src/Microsoft.ML.StaticPipe/WordEmbeddingsStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/WordEmbeddingsStaticExtensions.cs index d310dcd777..455699fe20 100644 --- a/src/Microsoft.ML.StaticPipe/WordEmbeddingsStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/WordEmbeddingsStaticExtensions.cs @@ -14,7 +14,7 @@ public static class WordEmbeddingsStaticExtensions /// Vector of tokenized text. /// The pretrained word embedding model. /// - public static Vector WordEmbeddings(this VarVector input, WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe) + public static Vector WordEmbeddings(this VarVector input, WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding) { Contracts.CheckValue(input, nameof(input)); return new OutColumn(input, modelKind); @@ -33,7 +33,7 @@ private sealed class OutColumn : Vector { public PipelineColumn Input { get; } - public OutColumn(VarVector input, WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe) + public OutColumn(VarVector input, WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding) : base(new Reconciler(modelKind), input) { Input = input; @@ -48,10 +48,10 @@ public OutColumn(VarVector input, string customModelFile = null) private sealed class Reconciler : EstimatorReconciler { - private readonly WordEmbeddingsExtractingEstimator.PretrainedModelKind? _modelKind; + private readonly WordEmbeddingEstimator.PretrainedModelKind? _modelKind; private readonly string _customLookupTable; - public Reconciler(WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe) + public Reconciler(WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding) { _modelKind = modelKind; _customLookupTable = null; @@ -71,18 +71,18 @@ public override IEstimator Reconcile(IHostEnvironment env, { Contracts.Assert(toOutput.Length == 1); - var cols = new WordEmbeddingsExtractingEstimator.ColumnOptions[toOutput.Length]; + var cols = new WordEmbeddingEstimator.ColumnOptions[toOutput.Length]; for (int i = 0; i < toOutput.Length; ++i) { var outCol = (OutColumn)toOutput[i]; - cols[i] = new WordEmbeddingsExtractingEstimator.ColumnOptions(outputNames[outCol], inputNames[outCol.Input]); + cols[i] = new WordEmbeddingEstimator.ColumnOptions(outputNames[outCol], inputNames[outCol.Input]); } bool customLookup = !string.IsNullOrWhiteSpace(_customLookupTable); if (customLookup) - return new WordEmbeddingsExtractingEstimator(env, _customLookupTable, cols); + return new WordEmbeddingEstimator(env, _customLookupTable, cols); else - return new WordEmbeddingsExtractingEstimator(env, _modelKind.Value, cols); + return new WordEmbeddingEstimator(env, _modelKind.Value, cols); } } } diff --git a/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs b/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs index caabe64172..f7344e3440 100644 --- a/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs +++ b/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs @@ -132,16 +132,16 @@ public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, Laten } [TlcModule.EntryPoint(Name = "Transforms.WordEmbeddings", - Desc = WordEmbeddingsExtractingTransformer.Summary, - UserName = WordEmbeddingsExtractingTransformer.UserName, - ShortName = WordEmbeddingsExtractingTransformer.ShortName)] - public static CommonOutputs.TransformOutput WordEmbeddings(IHostEnvironment env, WordEmbeddingsExtractingTransformer.Options input) + Desc = WordEmbeddingTransformer.Summary, + UserName = WordEmbeddingTransformer.UserName, + ShortName = WordEmbeddingTransformer.ShortName)] + public static CommonOutputs.TransformOutput WordEmbeddings(IHostEnvironment env, WordEmbeddingTransformer.Options input) { Contracts.CheckValue(env, nameof(env)); env.CheckValue(input, nameof(input)); var h = EntryPointUtils.CheckArgsAndCreateHost(env, "WordEmbeddings", input); - var view = WordEmbeddingsExtractingTransformer.Create(h, input, input.Data); + var view = WordEmbeddingTransformer.Create(h, input, input.Data); return new CommonOutputs.TransformOutput() { Model = new TransformModelImpl(h, view, input.Data), diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 92647a4ab0..8492f1f64a 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -101,7 +101,7 @@ public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.Text /// The text-related transform's catalog. /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. - /// The embeddings to use. + /// The embeddings to use. /// /// /// /// /// - public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this TransformsCatalog.TextTransforms catalog, + public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null, - WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe) - => new WordEmbeddingsExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, modelKind); + WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding) + => new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, modelKind); /// /// The text-related transform's catalog. @@ -127,16 +127,16 @@ public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this Trans /// ]]> /// /// - public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this TransformsCatalog.TextTransforms catalog, + public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string customModelFile, string inputColumnName = null) - => new WordEmbeddingsExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), + => new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, customModelFile, inputColumnName ?? outputColumnName); /// /// The text-related transform's catalog. - /// The embeddings to use. + /// The embeddings to use. /// The array columns, and per-column configurations to extract embeedings from. /// /// @@ -145,10 +145,10 @@ public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this Trans /// ]]> /// /// - public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this TransformsCatalog.TextTransforms catalog, - WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe, - params WordEmbeddingsExtractingEstimator.ColumnOptions[] columns) - => new WordEmbeddingsExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), modelKind, columns); + public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.TextTransforms catalog, + WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding, + params WordEmbeddingEstimator.ColumnOptions[] columns) + => new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), modelKind, columns); /// /// Tokenizes incoming text in , using as separators, diff --git a/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs b/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs index b288b07dde..8153814e05 100644 --- a/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs +++ b/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs @@ -19,22 +19,22 @@ using Microsoft.ML.Runtime; using Microsoft.ML.Transforms.Text; -[assembly: LoadableClass(WordEmbeddingsExtractingTransformer.Summary, typeof(IDataTransform), typeof(WordEmbeddingsExtractingTransformer), typeof(WordEmbeddingsExtractingTransformer.Options), - typeof(SignatureDataTransform), WordEmbeddingsExtractingTransformer.UserName, "WordEmbeddingsTransform", WordEmbeddingsExtractingTransformer.ShortName, DocName = "transform/WordEmbeddingsTransform.md")] +[assembly: LoadableClass(WordEmbeddingTransformer.Summary, typeof(IDataTransform), typeof(WordEmbeddingTransformer), typeof(WordEmbeddingTransformer.Options), + typeof(SignatureDataTransform), WordEmbeddingTransformer.UserName, "WordEmbeddingsTransform", WordEmbeddingTransformer.ShortName, DocName = "transform/WordEmbeddingsTransform.md")] -[assembly: LoadableClass(WordEmbeddingsExtractingTransformer.Summary, typeof(IDataTransform), typeof(WordEmbeddingsExtractingTransformer), null, typeof(SignatureLoadDataTransform), - WordEmbeddingsExtractingTransformer.UserName, WordEmbeddingsExtractingTransformer.LoaderSignature)] +[assembly: LoadableClass(WordEmbeddingTransformer.Summary, typeof(IDataTransform), typeof(WordEmbeddingTransformer), null, typeof(SignatureLoadDataTransform), + WordEmbeddingTransformer.UserName, WordEmbeddingTransformer.LoaderSignature)] -[assembly: LoadableClass(typeof(WordEmbeddingsExtractingTransformer), null, typeof(SignatureLoadModel), - WordEmbeddingsExtractingTransformer.UserName, WordEmbeddingsExtractingTransformer.LoaderSignature)] +[assembly: LoadableClass(typeof(WordEmbeddingTransformer), null, typeof(SignatureLoadModel), + WordEmbeddingTransformer.UserName, WordEmbeddingTransformer.LoaderSignature)] -[assembly: LoadableClass(typeof(IRowMapper), typeof(WordEmbeddingsExtractingTransformer), null, typeof(SignatureLoadRowMapper), - WordEmbeddingsExtractingTransformer.UserName, WordEmbeddingsExtractingTransformer.LoaderSignature)] +[assembly: LoadableClass(typeof(IRowMapper), typeof(WordEmbeddingTransformer), null, typeof(SignatureLoadRowMapper), + WordEmbeddingTransformer.UserName, WordEmbeddingTransformer.LoaderSignature)] namespace Microsoft.ML.Transforms.Text { /// - public sealed class WordEmbeddingsExtractingTransformer : OneToOneTransformerBase + public sealed class WordEmbeddingTransformer : OneToOneTransformerBase { internal sealed class Column : OneToOneColumn { @@ -61,7 +61,7 @@ internal sealed class Options : TransformInputBase public Column[] Columns; [Argument(ArgumentType.AtMostOnce, HelpText = "Pre-trained model used to create the vocabulary", ShortName = "model", SortOrder = 1)] - public WordEmbeddingsExtractingEstimator.PretrainedModelKind? ModelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe; + public WordEmbeddingEstimator.PretrainedModelKind? ModelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding; [Argument(ArgumentType.AtMostOnce, IsInputFileName = true, HelpText = "Filename for custom word embedding model", ShortName = "dataFile", SortOrder = 2)] @@ -82,10 +82,10 @@ internal static VersionInfo GetVersionInfo() verReadableCur: 0x00010001, verWeCanReadBack: 0x00010001, loaderSignature: LoaderSignature, - loaderAssemblyName: typeof(WordEmbeddingsExtractingTransformer).Assembly.FullName); + loaderAssemblyName: typeof(WordEmbeddingTransformer).Assembly.FullName); } - private readonly WordEmbeddingsExtractingEstimator.PretrainedModelKind? _modelKind; + private readonly WordEmbeddingEstimator.PretrainedModelKind? _modelKind; private readonly string _modelFileNameWithPath; private static object _embeddingsLock = new object(); private readonly bool _customLookup; @@ -96,7 +96,7 @@ internal static VersionInfo GetVersionInfo() /// /// The names of the output and input column pairs on which the transformation is applied. /// - public IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly(); + private IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly(); private sealed class Model { @@ -155,53 +155,53 @@ public List GetWordLabels() private const int Timeout = 10 * 60 * 1000; /// - /// Instantiates using the pretrained word embedding model specified by . + /// Instantiates using the pretrained word embedding model specified by . /// /// Host Environment. /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. /// The pretrained word embedding model. - internal WordEmbeddingsExtractingTransformer(IHostEnvironment env, string outputColumnName, string inputColumnName = null, - WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe) - : this(env, modelKind, new WordEmbeddingsExtractingEstimator.ColumnOptions(outputColumnName, inputColumnName ?? outputColumnName)) + internal WordEmbeddingTransformer(IHostEnvironment env, string outputColumnName, string inputColumnName = null, + WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding) + : this(env, modelKind, new WordEmbeddingEstimator.ColumnOptions(outputColumnName, inputColumnName ?? outputColumnName)) { } /// - /// Instantiates using the custom word embedding model by loading it from the file specified by the . + /// Instantiates using the custom word embedding model by loading it from the file specified by the . /// /// Host Environment. /// Name of the column resulting from the transformation of . /// Filename for custom word embedding model. /// Name of the column to transform. If set to , the value of the will be used as source. - internal WordEmbeddingsExtractingTransformer(IHostEnvironment env, string outputColumnName, string customModelFile, string inputColumnName = null) - : this(env, customModelFile, new WordEmbeddingsExtractingEstimator.ColumnOptions(outputColumnName, inputColumnName ?? outputColumnName)) + internal WordEmbeddingTransformer(IHostEnvironment env, string outputColumnName, string customModelFile, string inputColumnName = null) + : this(env, customModelFile, new WordEmbeddingEstimator.ColumnOptions(outputColumnName, inputColumnName ?? outputColumnName)) { } /// - /// Instantiates using the pretrained word embedding model specified by . + /// Instantiates using the pretrained word embedding model specified by . /// /// Host Environment. /// The pretrained word embedding model. /// Input/Output columns. - internal WordEmbeddingsExtractingTransformer(IHostEnvironment env, WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind, params WordEmbeddingsExtractingEstimator.ColumnOptions[] columns) + internal WordEmbeddingTransformer(IHostEnvironment env, WordEmbeddingEstimator.PretrainedModelKind modelKind, params WordEmbeddingEstimator.ColumnOptions[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(RegistrationName), GetColumnPairs(columns)) { - env.CheckUserArg(Enum.IsDefined(typeof(WordEmbeddingsExtractingEstimator.PretrainedModelKind), modelKind), nameof(modelKind)); + env.CheckUserArg(Enum.IsDefined(typeof(WordEmbeddingEstimator.PretrainedModelKind), modelKind), nameof(modelKind)); _modelKind = modelKind; - _modelFileNameWithPath = EnsureModelFile(env, out _linesToSkip, (WordEmbeddingsExtractingEstimator.PretrainedModelKind)_modelKind); + _modelFileNameWithPath = EnsureModelFile(env, out _linesToSkip, (WordEmbeddingEstimator.PretrainedModelKind)_modelKind); _currentVocab = GetVocabularyDictionary(env); } /// - /// Instantiates using the custom word embedding model by loading it from the file specified by the . + /// Instantiates using the custom word embedding model by loading it from the file specified by the . /// /// Host Environment. /// Filename for custom word embedding model. /// Input/Output columns. - internal WordEmbeddingsExtractingTransformer(IHostEnvironment env, string customModelFile, params WordEmbeddingsExtractingEstimator.ColumnOptions[] columns) + internal WordEmbeddingTransformer(IHostEnvironment env, string customModelFile, params WordEmbeddingEstimator.ColumnOptions[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(RegistrationName), GetColumnPairs(columns)) { env.CheckValue(customModelFile, nameof(customModelFile)); @@ -213,7 +213,7 @@ internal WordEmbeddingsExtractingTransformer(IHostEnvironment env, string custom _currentVocab = GetVocabularyDictionary(env); } - private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(WordEmbeddingsExtractingEstimator.ColumnOptions[] columns) + private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(WordEmbeddingEstimator.ColumnOptions[] columns) { Contracts.CheckValue(columns, nameof(columns)); return columns.Select(x => (x.Name, x.InputColumnName)).ToArray(); @@ -227,28 +227,28 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa env.CheckValue(input, nameof(input)); if (options.ModelKind == null) - options.ModelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe; - env.CheckUserArg(!options.ModelKind.HasValue || Enum.IsDefined(typeof(WordEmbeddingsExtractingEstimator.PretrainedModelKind), options.ModelKind), nameof(options.ModelKind)); + options.ModelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding; + env.CheckUserArg(!options.ModelKind.HasValue || Enum.IsDefined(typeof(WordEmbeddingEstimator.PretrainedModelKind), options.ModelKind), nameof(options.ModelKind)); env.CheckValue(options.Columns, nameof(options.Columns)); - var cols = new WordEmbeddingsExtractingEstimator.ColumnOptions[options.Columns.Length]; + var cols = new WordEmbeddingEstimator.ColumnOptions[options.Columns.Length]; for (int i = 0; i < cols.Length; i++) { var item = options.Columns[i]; - cols[i] = new WordEmbeddingsExtractingEstimator.ColumnOptions( + cols[i] = new WordEmbeddingEstimator.ColumnOptions( item.Name, item.Source ?? item.Name); } bool customLookup = !string.IsNullOrWhiteSpace(options.CustomLookupTable); if (customLookup) - return new WordEmbeddingsExtractingTransformer(env, options.CustomLookupTable, cols).MakeDataTransform(input); + return new WordEmbeddingTransformer(env, options.CustomLookupTable, cols).MakeDataTransform(input); else - return new WordEmbeddingsExtractingTransformer(env, options.ModelKind.Value, cols).MakeDataTransform(input); + return new WordEmbeddingTransformer(env, options.ModelKind.Value, cols).MakeDataTransform(input); } - private WordEmbeddingsExtractingTransformer(IHost host, ModelLoadContext ctx) + private WordEmbeddingTransformer(IHost host, ModelLoadContext ctx) : base(host, ctx) { Host.AssertValue(ctx); @@ -261,21 +261,21 @@ private WordEmbeddingsExtractingTransformer(IHost host, ModelLoadContext ctx) } else { - _modelKind = (WordEmbeddingsExtractingEstimator.PretrainedModelKind)ctx.Reader.ReadUInt32(); - _modelFileNameWithPath = EnsureModelFile(Host, out _linesToSkip, (WordEmbeddingsExtractingEstimator.PretrainedModelKind)_modelKind); + _modelKind = (WordEmbeddingEstimator.PretrainedModelKind)ctx.Reader.ReadUInt32(); + _modelFileNameWithPath = EnsureModelFile(Host, out _linesToSkip, (WordEmbeddingEstimator.PretrainedModelKind)_modelKind); } Host.CheckNonWhiteSpace(_modelFileNameWithPath, nameof(_modelFileNameWithPath)); _currentVocab = GetVocabularyDictionary(host); } - internal static WordEmbeddingsExtractingTransformer Create(IHostEnvironment env, ModelLoadContext ctx) + internal static WordEmbeddingTransformer Create(IHostEnvironment env, ModelLoadContext ctx) { Contracts.CheckValue(env, nameof(env)); IHost h = env.Register(RegistrationName); h.CheckValue(ctx, nameof(ctx)); ctx.CheckAtModel(GetVersionInfo()); - return new WordEmbeddingsExtractingTransformer(h, ctx); + return new WordEmbeddingTransformer(h, ctx); } // Factory method for SignatureLoadDataTransform. @@ -311,10 +311,10 @@ private protected override void CheckInputColumn(DataViewSchema inputSchema, int private sealed class Mapper : OneToOneMapperBase, ISaveAsOnnx { - private readonly WordEmbeddingsExtractingTransformer _parent; + private readonly WordEmbeddingTransformer _parent; private readonly VectorType _outputType; - public Mapper(WordEmbeddingsExtractingTransformer parent, DataViewSchema inputSchema) + public Mapper(WordEmbeddingTransformer parent, DataViewSchema inputSchema) : base(parent.Host.Register(nameof(Mapper)), parent, inputSchema) { Host.CheckValue(inputSchema, nameof(inputSchema)); @@ -603,24 +603,24 @@ private ValueGetter> GetGetterVec(DataViewRow input, int iinfo) } } - private static Dictionary _modelsMetaData = new Dictionary() + private static Dictionary _modelsMetaData = new Dictionary() { - { WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVe50D, "glove.6B.50d.txt" }, - { WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVe100D, "glove.6B.100d.txt" }, - { WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVe200D, "glove.6B.200d.txt" }, - { WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVe300D, "glove.6B.300d.txt" }, - { WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter25D, "glove.twitter.27B.25d.txt" }, - { WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter50D, "glove.twitter.27B.50d.txt" }, - { WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter100D, "glove.twitter.27B.100d.txt" }, - { WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter200D, "glove.twitter.27B.200d.txt" }, - { WordEmbeddingsExtractingEstimator.PretrainedModelKind.FastTextWikipedia300D, "wiki.en.vec" }, - { WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe, "sentiment.emd" } + { WordEmbeddingEstimator.PretrainedModelKind.GloVe50D, "glove.6B.50d.txt" }, + { WordEmbeddingEstimator.PretrainedModelKind.GloVe100D, "glove.6B.100d.txt" }, + { WordEmbeddingEstimator.PretrainedModelKind.GloVe200D, "glove.6B.200d.txt" }, + { WordEmbeddingEstimator.PretrainedModelKind.GloVe300D, "glove.6B.300d.txt" }, + { WordEmbeddingEstimator.PretrainedModelKind.GloVeTwitter25D, "glove.twitter.27B.25d.txt" }, + { WordEmbeddingEstimator.PretrainedModelKind.GloVeTwitter50D, "glove.twitter.27B.50d.txt" }, + { WordEmbeddingEstimator.PretrainedModelKind.GloVeTwitter100D, "glove.twitter.27B.100d.txt" }, + { WordEmbeddingEstimator.PretrainedModelKind.GloVeTwitter200D, "glove.twitter.27B.200d.txt" }, + { WordEmbeddingEstimator.PretrainedModelKind.FastTextWikipedia300D, "wiki.en.vec" }, + { WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding, "sentiment.emd" } }; - private static Dictionary _linesToSkipInModels = new Dictionary() - { { WordEmbeddingsExtractingEstimator.PretrainedModelKind.FastTextWikipedia300D, 1 } }; + private static Dictionary _linesToSkipInModels = new Dictionary() + { { WordEmbeddingEstimator.PretrainedModelKind.FastTextWikipedia300D, 1 } }; - private string EnsureModelFile(IHostEnvironment env, out int linesToSkip, WordEmbeddingsExtractingEstimator.PretrainedModelKind kind) + private string EnsureModelFile(IHostEnvironment env, out int linesToSkip, WordEmbeddingEstimator.PretrainedModelKind kind) { linesToSkip = 0; if (_modelsMetaData.ContainsKey(kind)) @@ -630,7 +630,7 @@ private string EnsureModelFile(IHostEnvironment env, out int linesToSkip, WordEm linesToSkip = _linesToSkipInModels[kind]; using (var ch = Host.Start("Ensuring resources")) { - string dir = kind == WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe ? Path.Combine("Text", "Sswe") : "WordVectors"; + string dir = kind == WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding ? Path.Combine("Text", "Sswe") : "WordVectors"; var url = $"{dir}/{modelFileName}"; var ensureModel = ResourceManagerUtils.Instance.EnsureResource(Host, ch, url, modelFileName, dir, Timeout); ensureModel.Wait(); @@ -729,7 +729,7 @@ private static ParallelOptions GetParallelOptions(IHostEnvironment hostEnvironme } /// - public sealed class WordEmbeddingsExtractingEstimator : IEstimator + public sealed class WordEmbeddingEstimator : IEstimator { private readonly IHost _host; private readonly ColumnOptions[] _columns; @@ -746,8 +746,8 @@ public sealed class WordEmbeddingsExtractingEstimator : IEstimatorName of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. /// The embeddings to use. - internal WordEmbeddingsExtractingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, - PretrainedModelKind modelKind = PretrainedModelKind.Sswe) + internal WordEmbeddingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, + PretrainedModelKind modelKind = PretrainedModelKind.SentimentSpecificWordEmbedding) : this(env, modelKind, new ColumnOptions(outputColumnName, inputColumnName ?? outputColumnName)) { } @@ -762,7 +762,7 @@ internal WordEmbeddingsExtractingEstimator(IHostEnvironment env, string outputCo /// Name of the column resulting from the transformation of . /// The path of the pre-trained embeedings model to use. /// Name of the column to transform. - internal WordEmbeddingsExtractingEstimator(IHostEnvironment env, string outputColumnName, string customModelFile, string inputColumnName = null) + internal WordEmbeddingEstimator(IHostEnvironment env, string outputColumnName, string customModelFile, string inputColumnName = null) : this(env, customModelFile, new ColumnOptions(outputColumnName, inputColumnName ?? outputColumnName)) { } @@ -776,21 +776,21 @@ internal WordEmbeddingsExtractingEstimator(IHostEnvironment env, string outputCo /// The local instance of /// The embeddings to use. /// The array columns, and per-column configurations to extract embeedings from. - internal WordEmbeddingsExtractingEstimator(IHostEnvironment env, - PretrainedModelKind modelKind = PretrainedModelKind.Sswe, + internal WordEmbeddingEstimator(IHostEnvironment env, + PretrainedModelKind modelKind = PretrainedModelKind.SentimentSpecificWordEmbedding, params ColumnOptions[] columns) { Contracts.CheckValue(env, nameof(env)); - _host = env.Register(nameof(WordEmbeddingsExtractingEstimator)); + _host = env.Register(nameof(WordEmbeddingEstimator)); _modelKind = modelKind; _customLookupTable = null; _columns = columns; } - internal WordEmbeddingsExtractingEstimator(IHostEnvironment env, string customModelFile, params ColumnOptions[] columns) + internal WordEmbeddingEstimator(IHostEnvironment env, string customModelFile, params ColumnOptions[] columns) { Contracts.CheckValue(env, nameof(env)); - _host = env.Register(nameof(WordEmbeddingsExtractingEstimator)); + _host = env.Register(nameof(WordEmbeddingEstimator)); _modelKind = null; _customLookupTable = customModelFile; _columns = columns; @@ -829,7 +829,7 @@ public enum PretrainedModelKind FastTextWikipedia300D = 8, [TGUI(Label = "Sentiment-Specific Word Embedding")] - Sswe = 9 + SentimentSpecificWordEmbedding = 9 } /// /// Information for each column pair. @@ -882,16 +882,16 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) } /// - /// Trains and returns a . + /// Trains and returns a . /// - public WordEmbeddingsExtractingTransformer Fit(IDataView input) + public WordEmbeddingTransformer Fit(IDataView input) { bool customLookup = !string.IsNullOrWhiteSpace(_customLookupTable); - WordEmbeddingsExtractingTransformer transformer; + WordEmbeddingTransformer transformer; if (customLookup) - transformer = new WordEmbeddingsExtractingTransformer(_host, _customLookupTable, _columns); + transformer = new WordEmbeddingTransformer(_host, _customLookupTable, _columns); else - transformer = new WordEmbeddingsExtractingTransformer(_host, _modelKind.Value, _columns); + transformer = new WordEmbeddingTransformer(_host, _modelKind.Value, _columns); // Validate input schema. transformer.GetOutputSchema(input.Schema); return transformer; diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index 27113bc17d..3bc0eba707 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -132,5 +132,5 @@ Transforms.TrainTestDatasetSplitter Split the dataset into train and test sets M Transforms.TreeLeafFeaturizer Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. This can handle key labels by training a regression model towards their optionally permuted indices. Microsoft.ML.Data.TreeFeaturize Featurizer Microsoft.ML.Data.TreeEnsembleFeaturizerTransform+ArgumentsForEntryPoint Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.TwoHeterogeneousModelCombiner Combines a TransformModel and a PredictorModel into a single PredictorModel. Microsoft.ML.EntryPoints.ModelOperations CombineTwoModels Microsoft.ML.EntryPoints.ModelOperations+SimplePredictorModelInput Microsoft.ML.EntryPoints.ModelOperations+PredictorModelOutput Transforms.VectorToImage Converts vector array into image type. Microsoft.ML.ImageAnalytics.ImageAnalyticsEntryPoints VectorToImage Microsoft.ML.ImageAnalytics.VectorToImageConvertingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput -Transforms.WordEmbeddings Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model Microsoft.ML.Transforms.Text.TextAnalytics WordEmbeddings Microsoft.ML.Transforms.Text.WordEmbeddingsExtractingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput +Transforms.WordEmbeddings Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model Microsoft.ML.Transforms.Text.TextAnalytics WordEmbeddings Microsoft.ML.Transforms.Text.WordEmbeddingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.WordTokenizer The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed. Microsoft.ML.Transforms.Text.TextAnalytics DelimitedTokenizeTransform Microsoft.ML.Transforms.Text.WordTokenizingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index 3f190a7257..84be4dcb20 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -23348,7 +23348,7 @@ "GloVeTwitter100D", "GloVeTwitter200D", "FastTextWikipedia300D", - "Sswe" + "SentimentSpecificWordEmbedding" ] }, "Desc": "Pre-trained model used to create the vocabulary", @@ -23358,7 +23358,7 @@ "Required": false, "SortOrder": 1.0, "IsNullable": true, - "Default": "Sswe" + "Default": "SentimentSpecificWordEmbedding" }, { "Name": "Data", diff --git a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs index d43fa2a110..890e3bea46 100644 --- a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs +++ b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs @@ -103,8 +103,8 @@ public void TrainSentiment() UseWordExtractor = false, }, "SentimentText").Fit(loader).Transform(loader); - var trans = mlContext.Transforms.Text.ExtractWordEmbeddings("Features", "WordEmbeddings_TransformedText", - WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe).Fit(text).Transform(text); + var trans = mlContext.Transforms.Text.ApplyWordEmbedding("Features", "WordEmbeddings_TransformedText", + WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding).Fit(text).Transform(text); // Train var trainer = mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(); diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index 4077d932bb..513c4ebbff 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -3603,11 +3603,11 @@ public void EntryPointWordEmbeddings() }, InputFile = inputFile, }).Data; - var embedding = Transforms.Text.TextAnalytics.WordEmbeddings(Env, new WordEmbeddingsExtractingTransformer.Options() + var embedding = Transforms.Text.TextAnalytics.WordEmbeddings(Env, new WordEmbeddingTransformer.Options() { Data = dataView, - Columns = new[] { new WordEmbeddingsExtractingTransformer.Column { Name = "Features", Source = "Text" } }, - ModelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe + Columns = new[] { new WordEmbeddingTransformer.Column { Name = "Features", Source = "Text" } }, + ModelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding }); var result = embedding.OutputData; using (var cursor = result.GetRowCursorForAllColumns()) diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 6c0b9ced47..d9473608ea 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -463,7 +463,7 @@ public void WordEmbeddingsTest() var embedNetworkPath = GetDataPath(@"shortsentiment.emd"); var data = mlContext.Data.LoadFromTextFile(dataPath, separatorChar: '\t', hasHeader: false); - var pipeline = mlContext.Transforms.Text.ExtractWordEmbeddings("Embed", embedNetworkPath, "Tokens"); + var pipeline = mlContext.Transforms.Text.ApplyWordEmbedding("Embed", embedNetworkPath, "Tokens"); var model = pipeline.Fit(data); var transformedData = model.Transform(data); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs index 8e7e97145c..eb6ec8c5a2 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs @@ -472,7 +472,7 @@ private void TextFeaturizationOn(string dataPath) // NLP pipeline 4: word embeddings. // PretrainedModelKind.Sswe is used here for performance of the test. In a real // scenario, it is best to use a different model for more accuracy. - Embeddings: r.Message.NormalizeText().TokenizeText().WordEmbeddings(WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe) + Embeddings: r.Message.NormalizeText().TokenizeText().WordEmbeddings(WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding) )); // Let's train our pipeline, and then apply it to the same data. diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index 77c6145849..2ec90892be 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -313,8 +313,8 @@ private void TextFeaturizationOn(string dataPath) // PretrainedModelKind.Sswe is used here for performance of the test. In a real // scenario, it is best to use a different model for more accuracy. .Append(mlContext.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage")) - .Append(mlContext.Transforms.Text.ExtractWordEmbeddings("Embeddings", "TokenizedMessage", - WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)); + .Append(mlContext.Transforms.Text.ApplyWordEmbedding("Embeddings", "TokenizedMessage", + WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)); // Let's train our pipeline, and then apply it to the same data. // Note that even on a small dataset of 70KB the pipeline above can take up to a minute to completely train. diff --git a/test/Microsoft.ML.Tests/Transformers/WordEmbeddingsTests.cs b/test/Microsoft.ML.Tests/Transformers/WordEmbeddingsTests.cs index 631857f9fa..95843dc330 100644 --- a/test/Microsoft.ML.Tests/Transformers/WordEmbeddingsTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/WordEmbeddingsTests.cs @@ -39,7 +39,7 @@ public void TestWordEmbeddings() .Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words")); var words = est.Fit(data).Transform(data); - var pipe = ML.Transforms.Text.ExtractWordEmbeddings("WordEmbeddings", "CleanWords", modelKind: WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe); + var pipe = ML.Transforms.Text.ApplyWordEmbedding("WordEmbeddings", "CleanWords", modelKind: WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding); TestEstimatorCore(pipe, words, invalidInput: data); @@ -82,7 +82,7 @@ public void TestCustomWordEmbeddings() file.WriteLine("you" + " " + string.Join(" ", -1f, -2f, -4f, -6f, -1f)); file.WriteLine("dude" + " " + string.Join(" ", 100f, 0f, 0f, 0f, 0f)); } - var pipe = ML.Transforms.Text.ExtractWordEmbeddings("WordEmbeddings", pathToCustomModel, "CleanWords"); + var pipe = ML.Transforms.Text.ApplyWordEmbedding("WordEmbeddings", pathToCustomModel, "CleanWords"); TestEstimatorCore(pipe, words, invalidInput: data);