dotnet · Ivanidzo4ka · Jul 31, 2018 · Jun 14, 2018 · Jul 17, 2018 · Jul 17, 2018
diff --git a/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs b/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs
@@ -137,5 +137,25 @@ public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, LdaTr
                 OutputData = view
             };
         }
+
+        [TlcModule.EntryPoint(Name = "Transforms.WordEmbeddings", // same name the C# API serializes and the entry-point baselines list
+            Desc = WordEmbeddingsTransform.Summary,
+            UserName = WordEmbeddingsTransform.UserName,
+            ShortName = WordEmbeddingsTransform.ShortName,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""WordEmbeddings""]/*' />",
+                                 @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name=""WordEmbeddings""]/*' />" })]
+        public static CommonOutputs.TransformOutput WordEmbeddings(IHostEnvironment env, WordEmbeddingsTransform.Arguments input) // entry point: builds a WordEmbeddingsTransform over input.Data
+        {
+            Contracts.CheckValue(env, nameof(env)); // env is validated first because the next check is invoked on env itself
+            env.CheckValue(input, nameof(input));
+
+            var h = EntryPointUtils.CheckArgsAndCreateHost(env, "WordEmbeddings", input); // h is handed to both the transform and the model below
+            var view = new WordEmbeddingsTransform(h, input, input.Data); // this same instance is returned as OutputData
+            return new CommonOutputs.TransformOutput()
+            {
+                Model = new TransformModel(h, view, input.Data), // model is built from the transform plus the original input.Data
+                OutputData = view
+            };
+        }
     }
 }
diff --git a/src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs b/src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs
diff --git a/src/Microsoft.ML.Transforms/Text/doc.xml b/src/Microsoft.ML.Transforms/Text/doc.xml
@@ -179,7 +179,49 @@
     <example name="LightLDA">
       <example>
         <code language="csharp">
-          pipeline.Add(new LightLda(("InTextCol" , "OutTextCol")));
+          pipeline.Add(new LightLda((&quot;InTextCol&quot; , &quot;OutTextCol&quot;)));
+        </code>
+      </example>
+    </example>
+
+    <member name="WordEmbeddings">
+      <summary>
+        Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model.
+      </summary>
+      <remarks>
+        WordEmbeddings wraps different embedding models, such as GloVe. Users can specify which embedding to use.
+        The available options are various versions of <a href="https://nlp.stanford.edu/projects/glove/">GloVe Models</a>, <a href="https://en.wikipedia.org/wiki/FastText">fastText</a>, and <a href="http://anthology.aclweb.org/P/P14/P14-1146.pdf">SSWE</a>.
+        <para>
+          Note: As WordEmbeddings requires a column with a text vector, e.g. &lt;'this', 'is', 'good'&gt;, users need to create an input column by
+          using output_tokens=True for TextTransform to convert a column with sentences like "This is good" into &lt;'this', 'is', 'good'&gt;.
+          The suffix '_TransformedText' is added to the original column name to create the output token column. For instance, if the input column is 'body',
+          the output tokens column is named 'body_TransformedText'.
+        </para>
+        <para>
+          License attributes for pretrained models:
+          <list type="bullet">
+            <item>
+              <description>
+                &quot;fastText Wikipedia 300D&quot; by Facebook, Inc. is licensed under <a href="https://creativecommons.org/licenses/by-sa/3.0/">CC-BY-SA 3.0</a> based on:
+                P. Bojanowski*, E. Grave*, A. Joulin, T. Mikolov, <a href="https://arxiv.org/abs/1607.04606">Enriching Word Vectors with Subword Information</a>
+                @article{bojanowski2016enriching, title={Enriching Word Vectors with Subword Information}, author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas}, journal={arXiv preprint arXiv:1607.04606}, year={2016}}
+                More information can be found <a href="https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md">here</a>.
+              </description>
+            </item>
+            <item>
+              <description>
+                GloVe models by Stanford University (Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. <a href="https://nlp.stanford.edu/pubs/glove.pdf">GloVe: Global Vectors for Word Representation</a>) are licensed under <a href="https://opendatacommons.org/licenses/pddl/1.0/">PDDL</a>.
+                More information can be found <a href="https://nlp.stanford.edu/projects/glove/">here</a>. Repository can be found <a href="https://github.com/stanfordnlp/GloVe">here</a>.
+              </description>
+            </item>
+          </list>
+        </para>
+      </remarks>
+    </member>
+    <example name="WordEmbeddings">
+      <example>
+        <code language="csharp">
+          pipeline.Add(new WordEmbeddings((&quot;InVectorTextCol&quot; , &quot;OutTextCol&quot;)));
         </code>
       </example>
     </example>

diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs
@@ -1522,6 +1522,18 @@ public void Add(Microsoft.ML.Transforms.TwoHeterogeneousModelCombiner input, Mic
                 _jsonNodes.Add(Serialize("Transforms.TwoHeterogeneousModelCombiner", input, output));
             }
 
+            public Microsoft.ML.Transforms.WordEmbeddings.Output Add(Microsoft.ML.Transforms.WordEmbeddings input) // convenience overload: creates the Output and returns it
+            {
+                var output = new Microsoft.ML.Transforms.WordEmbeddings.Output();
+                Add(input, output); // delegates to the (input, output) overload
+                return output;
+            }
+
+            public void Add(Microsoft.ML.Transforms.WordEmbeddings input, Microsoft.ML.Transforms.WordEmbeddings.Output output) // overload for a caller-supplied Output
+            {
+                _jsonNodes.Add(Serialize("Transforms.WordEmbeddings", input, output)); // node name must stay equal to the entry point's Name
+            }
+
             public Microsoft.ML.Transforms.WordTokenizer.Output Add(Microsoft.ML.Transforms.WordTokenizer input)
             {
                 var output = new Microsoft.ML.Transforms.WordTokenizer.Output();
@@ -15431,6 +15443,148 @@ public sealed class Output
         }
     }
 
+    namespace Transforms
+    {
+        public enum WordEmbeddingsTransformPretrainedModelKind // members appear in the same order as the ModelKind values in core_manifest.json
+        {
+            GloVe50D = 0,
+            GloVe100D = 1,
+            GloVe200D = 2,
+            GloVe300D = 3,
+            GloVeTwitter25D = 4,
+            GloVeTwitter50D = 5,
+            GloVeTwitter100D = 6,
+            GloVeTwitter200D = 7,
+            FastTextWikipedia300D = 8,
+            Sswe = 9 // used as the default for WordEmbeddings.ModelKind
+        }
+
+
+        public sealed partial class WordEmbeddingsTransformColumn : OneToOneColumn<WordEmbeddingsTransformColumn>, IOneToOneColumn // one new-column/source-column pair; instances are created through OneToOneColumn<T>.Create in WordEmbeddings.AddColumn
+        {
+            /// <summary>
+            /// Name of the new column
+            /// </summary>
+            public string Name { get; set; }
+
+            /// <summary>
+            /// Name of the source column
+            /// </summary>
+            public string Source { get; set; }
+
+        }
+
+        /// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name="WordEmbeddings"]/*' />
+        public sealed partial class WordEmbeddings : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
+        {
+
+            public WordEmbeddings() // leaves Column unset; columns can be added later through AddColumn
+            {
+            }
+
+            public WordEmbeddings(params string[] inputColumns) // each name is registered through the single-argument AddColumn
+            {
+                if (inputColumns != null)
+                {
+                    foreach (string input in inputColumns)
+                    {
+                        AddColumn(input);
+                    }
+                }
+            }
+
+            public WordEmbeddings(params (string inputColumn, string outputColumn)[] inputOutputColumns) // tuples are ordered (input, output)
+            {
+                if (inputOutputColumns != null)
+                {
+                    foreach (var inputOutput in inputOutputColumns)
+                    {
+                        AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); // note the swap: AddColumn takes the output name first
+                    }
+                }
+            }
+
+            public void AddColumn(string inputColumn) // appends a column created from the input name alone
+            {
+                var list = Column == null ? new List<Microsoft.ML.Transforms.WordEmbeddingsTransformColumn>() : new List<Microsoft.ML.Transforms.WordEmbeddingsTransformColumn>(Column); // start from a copy of the current columns, if any
+                list.Add(OneToOneColumn<Microsoft.ML.Transforms.WordEmbeddingsTransformColumn>.Create(inputColumn));
+                Column = list.ToArray(); // Column is replaced by a fresh array on every call
+            }
+
+            public void AddColumn(string outputColumn, string inputColumn) // appends a column with explicit output and input names
+            {
+                var list = Column == null ? new List<Microsoft.ML.Transforms.WordEmbeddingsTransformColumn>() : new List<Microsoft.ML.Transforms.WordEmbeddingsTransformColumn>(Column); // start from a copy of the current columns, if any
+                list.Add(OneToOneColumn<Microsoft.ML.Transforms.WordEmbeddingsTransformColumn>.Create(outputColumn, inputColumn));
+                Column = list.ToArray(); // Column is replaced by a fresh array on every call
+            }
+
+
+            /// <summary>
+            /// New column definition(s) (optional form: name:src)
+            /// </summary>
+            public WordEmbeddingsTransformColumn[] Column { get; set; }
+
+            /// <summary>
+            /// Pre-trained model used to create the vocabulary
+            /// </summary>
+            public WordEmbeddingsTransformPretrainedModelKind? ModelKind { get; set; } = WordEmbeddingsTransformPretrainedModelKind.Sswe; // nullable, but initialized to Sswe
+
+            /// <summary>
+            /// Filename for custom word embedding model
+            /// </summary>
+            public string CustomLookupTable { get; set; }
+
+            /// <summary>
+            /// Input dataset
+            /// </summary>
+            public Var<Microsoft.ML.Runtime.Data.IDataView> Data { get; set; } = new Var<Microsoft.ML.Runtime.Data.IDataView>();
+
+
+            public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput
+            {
+                /// <summary>
+                /// Transformed dataset
+                /// </summary>
+                public Var<Microsoft.ML.Runtime.Data.IDataView> OutputData { get; set; } = new Var<Microsoft.ML.Runtime.Data.IDataView>();
+
+                /// <summary>
+                /// Transform model
+                /// </summary>
+                public Var<Microsoft.ML.Runtime.EntryPoints.ITransformModel> Model { get; set; } = new Var<Microsoft.ML.Runtime.EntryPoints.ITransformModel>();
+
+            }
+            public Var<IDataView> GetInputData() => Data;
+
+            public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) // adds this transform to the experiment and returns a step exposing its outputs
+            {
+                if (previousStep != null) // with no previous step, Data keeps whatever value it already holds
+                {
+                    if (!(previousStep is ILearningPipelineDataStep dataStep))
+                    {
+                        throw new InvalidOperationException($"{ nameof(WordEmbeddings)} only supports an { nameof(ILearningPipelineDataStep)} as an input.");
+                    }
+
+                    Data = dataStep.Data; // chain this transform onto the previous step's data
+                }
+                Output output = experiment.Add(this);
+                return new WordEmbeddingsPipelineStep(output);
+            }
+
+            private class WordEmbeddingsPipelineStep : ILearningPipelineDataStep // carries the Output's data and model variables to the next step
+            {
+                public WordEmbeddingsPipelineStep(Output output)
+                {
+                    Data = output.OutputData;
+                    Model = output.Model;
+                }
+
+                public Var<IDataView> Data { get; }
+                public Var<ITransformModel> Model { get; }
+            }
+        }
+    }
+
     namespace Transforms
     {
 

diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
@@ -123,4 +123,5 @@ Transforms.TextToKeyConverter	Converts input values (words, numbers, etc.) to in
 Transforms.TrainTestDatasetSplitter	Split the dataset into train and test sets	Microsoft.ML.Runtime.EntryPoints.TrainTestSplit	Split	Microsoft.ML.Runtime.EntryPoints.TrainTestSplit+Input	Microsoft.ML.Runtime.EntryPoints.TrainTestSplit+Output
 Transforms.TreeLeafFeaturizer	Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. This can handle key labels by training a regression model towards their optionally permuted indices.	Microsoft.ML.Runtime.Data.TreeFeaturize	Featurizer	Microsoft.ML.Runtime.Data.TreeEnsembleFeaturizerTransform+ArgumentsForEntryPoint	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
 Transforms.TwoHeterogeneousModelCombiner	Combines a TransformModel and a PredictorModel into a single PredictorModel.	Microsoft.ML.Runtime.EntryPoints.ModelOperations	CombineTwoModels	Microsoft.ML.Runtime.EntryPoints.ModelOperations+SimplePredictorModelInput	Microsoft.ML.Runtime.EntryPoints.ModelOperations+PredictorModelOutput
+Transforms.WordEmbeddings	Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model	Microsoft.ML.Runtime.Transforms.TextAnalytics	WordEmbeddings	Microsoft.ML.Runtime.Data.WordEmbeddingsTransform+Arguments	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
 Transforms.WordTokenizer	The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed.	Microsoft.ML.Runtime.Transforms.TextAnalytics	DelimitedTokenizeTransform	Microsoft.ML.Runtime.Data.DelimitedTokenizeTransform+Arguments	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
@@ -21450,6 +21450,120 @@
         }
       ]
     },
+    {
+      "Name": "Transforms.WordEmbeddings",
+      "Desc": "Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model",
+      "FriendlyName": "Word Embeddings Transform",
+      "ShortName": "WordEmbeddings",
+      "Inputs": [
+        {
+          "Name": "Column",
+          "Type": {
+            "Kind": "Array",
+            "ItemType": {
+              "Kind": "Struct",
+              "Fields": [
+                {
+                  "Name": "Name",
+                  "Type": "String",
+                  "Desc": "Name of the new column",
+                  "Aliases": [
+                    "name"
+                  ],
+                  "Required": false,
+                  "SortOrder": 150.0,
+                  "IsNullable": false,
+                  "Default": null
+                },
+                {
+                  "Name": "Source",
+                  "Type": "String",
+                  "Desc": "Name of the source column",
+                  "Aliases": [
+                    "src"
+                  ],
+                  "Required": false,
+                  "SortOrder": 150.0,
+                  "IsNullable": false,
+                  "Default": null
+                }
+              ]
+            }
+          },
+          "Desc": "New column definition(s) (optional form: name:src)",
+          "Aliases": [
+            "col"
+          ],
+          "Required": true,
+          "SortOrder": 0.0,
+          "IsNullable": false
+        },
+        {
+          "Name": "ModelKind",
+          "Type": {
+            "Kind": "Enum",
+            "Values": [
+              "GloVe50D",
+              "GloVe100D",
+              "GloVe200D",
+              "GloVe300D",
+              "GloVeTwitter25D",
+              "GloVeTwitter50D",
+              "GloVeTwitter100D",
+              "GloVeTwitter200D",
+              "FastTextWikipedia300D",
+              "Sswe"
+            ]
+          },
+          "Desc": "Pre-trained model used to create the vocabulary",
+          "Aliases": [
+            "model"
+          ],
+          "Required": false,
+          "SortOrder": 1.0,
+          "IsNullable": true,
+          "Default": "Sswe"
+        },
+        {
+          "Name": "Data",
+          "Type": "DataView",
+          "Desc": "Input dataset",
+          "Required": true,
+          "SortOrder": 1.0,
+          "IsNullable": false
+        },
+        {
+          "Name": "CustomLookupTable",
+          "Type": "String",
+          "Desc": "Filename for custom word embedding model",
+          "Aliases": [
+            "dataFile"
+          ],
+          "Required": false,
+          "SortOrder": 2.0,
+          "IsNullable": false,
+          "Default": null
+        }
+      ],
+      "Outputs": [
+        {
+          "Name": "OutputData",
+          "Type": "DataView",
+          "Desc": "Transformed dataset"
+        },
+        {
+          "Name": "Model",
+          "Type": "TransformModel",
+          "Desc": "Transform model"
+        }
+      ],
+      "InputKind": [
+        "ITransformInput"
+      ],
+      "OutputKind": [
+        "ITransformOutput"
+      ]
+    },
     {
       "Name": "Transforms.WordTokenizer",
       "Desc": "The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed.",