Skip to content

word embedding transform #545

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 29 commits into from
Jul 31, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -137,5 +137,25 @@ public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, LdaTr
OutputData = view
};
}

/// <summary>
/// Entry point published as "Transforms.WordEmbeddings". Builds a
/// <see cref="WordEmbeddingsTransform"/> over <c>input.Data</c> and returns it both as the
/// output data and wrapped in a <see cref="TransformModel"/>.
/// </summary>
/// <param name="env">Host environment; validated for null before anything else.</param>
/// <param name="input">Transform arguments, including the input data; validated for null.</param>
/// <returns>A transform output whose <c>OutputData</c> is the new transform and whose
/// <c>Model</c> wraps that transform together with the original input data.</returns>
[TlcModule.EntryPoint(Name = "Transforms.WordEmbeddings",
    Desc = WordEmbeddingsTransform.Summary,
    UserName = WordEmbeddingsTransform.UserName,
    ShortName = WordEmbeddingsTransform.ShortName,
    XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""WordEmbeddings""]/*' />",
        @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name=""WordEmbeddings""]/*' />" })]
public static CommonOutputs.TransformOutput WordEmbeddings(IHostEnvironment env, WordEmbeddingsTransform.Arguments input)
{
    // env is checked through the static helper first because the check on input is made through env itself.
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(input, nameof(input));

    var host = EntryPointUtils.CheckArgsAndCreateHost(env, "WordEmbeddings", input);
    var transform = new WordEmbeddingsTransform(host, input, input.Data);

    // The model is built before the output object is populated, as in the initializer order it replaces.
    var model = new TransformModel(host, transform, input.Data);
    return new CommonOutputs.TransformOutput
    {
        Model = model,
        OutputData = transform
    };
}
}
}
444 changes: 444 additions & 0 deletions src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs

Large diffs are not rendered by default.

44 changes: 43 additions & 1 deletion src/Microsoft.ML.Transforms/Text/doc.xml
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,49 @@
<example name="LightLDA">
<example>
<code language="csharp">
pipeline.Add(new LightLda(("InTextCol" , "OutTextCol")));
pipeline.Add(new LightLda((&quot;InTextCol&quot; , &quot;OutTextCol&quot;)));
</code>
</example>
</example>

<member name="WordEmbeddings">
<summary>
Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model.
</summary>
<remarks>
WordEmbeddings wraps different embedding models, such as GloVe. Users can specify which embedding to use.
The available options are various versions of <a href="https://nlp.stanford.edu/projects/glove/">GloVe Models</a>, <a href="https://en.wikipedia.org/wiki/FastText">fastText</a>, and <a href="http://anthology.aclweb.org/P/P14/P14-1146.pdf">SSWE</a>.
<para>
Note: WordEmbeddings requires a column containing a vector of text tokens, e.g. &lt;&apos;this&apos;, &apos;is&apos;, &apos;good&apos;&gt;, so users need to create an input column by
using output_tokens=True for TextTransform to convert a column with sentences like "This is good" into &lt;&apos;this&apos;, &apos;is&apos;, &apos;good&apos;&gt;.
The suffix &apos;_TransformedText&apos; is added to the original column name to create the output token column. For instance, if the input column is &apos;body&apos;,
the output tokens column is named &apos;body_TransformedText&apos;.
</para>
<para>
License attributions for pretrained models:
<list type="bullet">
<item>
<description>
&quot;fastText Wikipedia 300D&quot; by Facebook, Inc. is licensed under <a href="https://creativecommons.org/licenses/by-sa/3.0/">CC-BY-SA 3.0</a> based on:
P. Bojanowski*, E. Grave*, A. Joulin, T. Mikolov, <a href="https://arxiv.org/abs/1607.04606">Enriching Word Vectors with Subword Information</a>
@article{bojanowski2016enriching,
  title={Enriching Word Vectors with Subword Information},
  author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
  journal={arXiv preprint arXiv:1607.04606},
  year={2016}
}
More information can be found <a href="https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md">here</a>.
</description>
</item>
<item>
<description>
GloVe models by Stanford University (Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. <a href="https://nlp.stanford.edu/pubs/glove.pdf">GloVe: Global Vectors for Word Representation</a>) are licensed under <a href="https://opendatacommons.org/licenses/pddl/1.0/">PDDL</a>.
More information can be found <a href="https://nlp.stanford.edu/projects/glove/">here</a>. Repository can be found <a href="https://github.com/stanfordnlp/GloVe">here</a>.
</description>
</item>
</list>
</para>
</remarks>
</member>
<example name="WordEmbeddings">
<example>
<code language="csharp">
pipeline.Add(new WordEmbeddings((&quot;InVectorTextCol&quot; , &quot;OutTextCol&quot;)));
</code>
</example>
</example>
Expand Down
154 changes: 154 additions & 0 deletions src/Microsoft.ML/CSharpApi.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1522,6 +1522,18 @@ public void Add(Microsoft.ML.Transforms.TwoHeterogeneousModelCombiner input, Mic
_jsonNodes.Add(Serialize("Transforms.TwoHeterogeneousModelCombiner", input, output));
}

/// <summary>
/// Creates a fresh <see cref="Microsoft.ML.Transforms.WordEmbeddings.Output"/>, registers the
/// node through the two-argument <c>Add</c> overload, and hands the output back to the caller.
/// </summary>
/// <param name="input">The WordEmbeddings node description to register.</param>
/// <returns>The newly created output object that was registered together with <paramref name="input"/>.</returns>
public Microsoft.ML.Transforms.WordEmbeddings.Output Add(Microsoft.ML.Transforms.WordEmbeddings input)
{
    var result = new Microsoft.ML.Transforms.WordEmbeddings.Output();
    Add(input, result);
    return result;
}

/// <summary>
/// Serializes the given input/output pair under the entry-point name
/// "Transforms.WordEmbeddings" and appends the result to <c>_jsonNodes</c>.
/// </summary>
/// <param name="input">The WordEmbeddings node description.</param>
/// <param name="output">The output object associated with the node.</param>
public void Add(Microsoft.ML.Transforms.WordEmbeddings input, Microsoft.ML.Transforms.WordEmbeddings.Output output)
    => _jsonNodes.Add(Serialize("Transforms.WordEmbeddings", input, output));

public Microsoft.ML.Transforms.WordTokenizer.Output Add(Microsoft.ML.Transforms.WordTokenizer input)
{
var output = new Microsoft.ML.Transforms.WordTokenizer.Output();
Expand Down Expand Up @@ -15431,6 +15443,148 @@ public sealed class Output
}
}

namespace Transforms
{
/// <summary>
/// Selects which pre-trained embedding model the WordEmbeddings transform uses.
/// The member names match the values listed for the "ModelKind" input in the entry-point manifest.
/// </summary>
/// <remarks>
/// NOTE(review): the numeric suffixes (25D, 50D, ...) presumably denote the embedding
/// dimensionality of each model; confirm against the transform implementation.
/// </remarks>
public enum WordEmbeddingsTransformPretrainedModelKind
{
GloVe50D = 0,
GloVe100D = 1,
GloVe200D = 2,
GloVe300D = 3,
GloVeTwitter25D = 4,
GloVeTwitter50D = 5,
GloVeTwitter100D = 6,
GloVeTwitter200D = 7,
FastTextWikipedia300D = 8,
// Used as the default value of WordEmbeddings.ModelKind.
Sswe = 9
}


/// <summary>
/// One column definition for the WordEmbeddings transform: pairs the name of the new
/// (output) column with the name of the source (input) column.
/// </summary>
public sealed partial class WordEmbeddingsTransformColumn : OneToOneColumn<WordEmbeddingsTransformColumn>, IOneToOneColumn
{
/// <summary>
/// Name of the new column
/// </summary>
public string Name { get; set; }

/// <summary>
/// Name of the source column
/// </summary>
public string Source { get; set; }

}

/// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
/// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name="WordEmbeddings"]/*' />
public sealed partial class WordEmbeddings : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
{

    /// <summary>
    /// Creates an instance with no column definitions.
    /// </summary>
    public WordEmbeddings()
    {
    }

    /// <summary>
    /// Creates an instance and registers one column definition per supplied name
    /// through <see cref="AddColumn(string)"/>.
    /// </summary>
    /// <param name="inputColumns">Column names to register; a null array registers nothing.</param>
    public WordEmbeddings(params string[] inputColumns)
    {
        if (inputColumns == null)
        {
            return;
        }

        foreach (string columnName in inputColumns)
        {
            AddColumn(columnName);
        }
    }

    /// <summary>
    /// Creates an instance and registers one column definition per (input, output) pair
    /// through <see cref="AddColumn(string, string)"/>.
    /// </summary>
    /// <param name="inputOutputColumns">Pairs of input and output column names; a null array registers nothing.</param>
    public WordEmbeddings(params (string inputColumn, string outputColumn)[] inputOutputColumns)
    {
        if (inputOutputColumns == null)
        {
            return;
        }

        foreach (var (source, destination) in inputOutputColumns)
        {
            // The tuple is (input, output) but AddColumn expects (output, input).
            AddColumn(destination, source);
        }
    }

    /// <summary>
    /// Appends a column definition created from a single column name to <see cref="Column"/>.
    /// </summary>
    /// <param name="inputColumn">Name passed to <c>OneToOneColumn.Create</c>.</param>
    public void AddColumn(string inputColumn)
    {
        // Column is an array, so it is rebuilt from a temporary list on every addition.
        var columns = new List<WordEmbeddingsTransformColumn>();
        if (Column != null)
        {
            columns.AddRange(Column);
        }

        columns.Add(OneToOneColumn<WordEmbeddingsTransformColumn>.Create(inputColumn));
        Column = columns.ToArray();
    }

    /// <summary>
    /// Appends a column definition created from an output/input name pair to <see cref="Column"/>.
    /// </summary>
    /// <param name="outputColumn">Name of the new column.</param>
    /// <param name="inputColumn">Name of the source column.</param>
    public void AddColumn(string outputColumn, string inputColumn)
    {
        // Column is an array, so it is rebuilt from a temporary list on every addition.
        var columns = new List<WordEmbeddingsTransformColumn>();
        if (Column != null)
        {
            columns.AddRange(Column);
        }

        columns.Add(OneToOneColumn<WordEmbeddingsTransformColumn>.Create(outputColumn, inputColumn));
        Column = columns.ToArray();
    }


    /// <summary>
    /// New column definition(s) (optional form: name:src)
    /// </summary>
    public WordEmbeddingsTransformColumn[] Column { get; set; }

    /// <summary>
    /// Pre-trained model used to create the vocabulary
    /// </summary>
    public WordEmbeddingsTransformPretrainedModelKind? ModelKind { get; set; } = WordEmbeddingsTransformPretrainedModelKind.Sswe;

    /// <summary>
    /// Filename for custom word embedding model
    /// </summary>
    public string CustomLookupTable { get; set; }

    /// <summary>
    /// Input dataset
    /// </summary>
    public Var<Microsoft.ML.Runtime.Data.IDataView> Data { get; set; } = new Var<Microsoft.ML.Runtime.Data.IDataView>();


    /// <summary>
    /// Output variables of the WordEmbeddings node: the transformed data and the transform model.
    /// </summary>
    public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput
    {
        /// <summary>
        /// Transformed dataset
        /// </summary>
        public Var<Microsoft.ML.Runtime.Data.IDataView> OutputData { get; set; } = new Var<Microsoft.ML.Runtime.Data.IDataView>();

        /// <summary>
        /// Transform model
        /// </summary>
        public Var<Microsoft.ML.Runtime.EntryPoints.ITransformModel> Model { get; set; } = new Var<Microsoft.ML.Runtime.EntryPoints.ITransformModel>();

    }

    /// <summary>
    /// Returns the variable currently stored in <see cref="Data"/>.
    /// </summary>
    public Var<IDataView> GetInputData()
    {
        return Data;
    }

    /// <summary>
    /// Wires this item into a pipeline: takes its input data from the previous step (when there
    /// is one), adds itself to the experiment, and returns a step exposing the resulting output.
    /// </summary>
    /// <param name="previousStep">The preceding pipeline step, or null when there is none.</param>
    /// <param name="experiment">The experiment this item is added to.</param>
    /// <returns>A data step carrying the output data and model variables of this node.</returns>
    /// <exception cref="InvalidOperationException">
    /// <paramref name="previousStep"/> is non-null but is not an <see cref="ILearningPipelineDataStep"/>.
    /// </exception>
    public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment)
    {
        if (previousStep is ILearningPipelineDataStep dataStep)
        {
            Data = dataStep.Data;
        }
        else if (previousStep != null)
        {
            throw new InvalidOperationException($"{ nameof(WordEmbeddings)} only supports an { nameof(ILearningPipelineDataStep)} as an input.");
        }

        // A null previousStep falls through both branches and leaves Data as it was.
        return new WordEmbeddingsPipelineStep(experiment.Add(this));
    }

    /// <summary>
    /// Pipeline step that exposes the output data and model variables of a WordEmbeddings node.
    /// </summary>
    private class WordEmbeddingsPipelineStep : ILearningPipelineDataStep
    {
        public WordEmbeddingsPipelineStep(Output output)
        {
            Data = output.OutputData;
            Model = output.Model;
        }

        /// <summary>Variable taken from <c>Output.OutputData</c>.</summary>
        public Var<IDataView> Data { get; }

        /// <summary>Variable taken from <c>Output.Model</c>.</summary>
        public Var<ITransformModel> Model { get; }
    }
}
}

namespace Transforms
{

Expand Down
1 change: 1 addition & 0 deletions test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -123,4 +123,5 @@ Transforms.TextToKeyConverter Converts input values (words, numbers, etc.) to in
Transforms.TrainTestDatasetSplitter Split the dataset into train and test sets Microsoft.ML.Runtime.EntryPoints.TrainTestSplit Split Microsoft.ML.Runtime.EntryPoints.TrainTestSplit+Input Microsoft.ML.Runtime.EntryPoints.TrainTestSplit+Output
Transforms.TreeLeafFeaturizer Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. This can handle key labels by training a regression model towards their optionally permuted indices. Microsoft.ML.Runtime.Data.TreeFeaturize Featurizer Microsoft.ML.Runtime.Data.TreeEnsembleFeaturizerTransform+ArgumentsForEntryPoint Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
Transforms.TwoHeterogeneousModelCombiner Combines a TransformModel and a PredictorModel into a single PredictorModel. Microsoft.ML.Runtime.EntryPoints.ModelOperations CombineTwoModels Microsoft.ML.Runtime.EntryPoints.ModelOperations+SimplePredictorModelInput Microsoft.ML.Runtime.EntryPoints.ModelOperations+PredictorModelOutput
Transforms.WordEmbeddings Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model Microsoft.ML.Runtime.Transforms.TextAnalytics WordEmbeddings Microsoft.ML.Runtime.Data.WordEmbeddingsTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
Transforms.WordTokenizer The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed. Microsoft.ML.Runtime.Transforms.TextAnalytics DelimitedTokenizeTransform Microsoft.ML.Runtime.Data.DelimitedTokenizeTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
114 changes: 114 additions & 0 deletions test/BaselineOutput/Common/EntryPoints/core_manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -21450,6 +21450,120 @@
}
]
},
{
"Name": "Transforms.WordEmbeddings",
"Desc": "Word Embeddings transform is a text featurizer which converts vectors of text tokens into sentence vectors using a pre-trained model",
"FriendlyName": "Word Embeddings Transform",
"ShortName": "WordEmbeddings",
"Inputs": [
{
"Name": "Column",
"Type": {
"Kind": "Array",
"ItemType": {
"Kind": "Struct",
"Fields": [
{
"Name": "Name",
"Type": "String",
"Desc": "Name of the new column",
"Aliases": [
"name"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
"Default": null
},
{
"Name": "Source",
"Type": "String",
"Desc": "Name of the source column",
"Aliases": [
"src"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
"Default": null
}
]
}
},
"Desc": "New column definition(s) (optional form: name:src)",
"Aliases": [
"col"
],
"Required": true,
"SortOrder": 0.0,
"IsNullable": false
},
{
"Name": "ModelKind",
"Type": {
"Kind": "Enum",
"Values": [
"GloVe50D",
"GloVe100D",
"GloVe200D",
"GloVe300D",
"GloVeTwitter25D",
"GloVeTwitter50D",
"GloVeTwitter100D",
"GloVeTwitter200D",
"FastTextWikipedia300D",
"Sswe"
]
},
"Desc": "Pre-trained model used to create the vocabulary",
"Aliases": [
"model"
],
"Required": false,
"SortOrder": 1.0,
"IsNullable": true,
"Default": "Sswe"
},
{
"Name": "Data",
"Type": "DataView",
"Desc": "Input dataset",
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
},
{
"Name": "CustomLookupTable",
"Type": "String",
"Desc": "Filename for custom word embedding model",
"Aliases": [
"dataFile"
],
"Required": false,
"SortOrder": 2.0,
"IsNullable": false,
"Default": null
}
],
"Outputs": [
{
"Name": "OutputData",
"Type": "DataView",
"Desc": "Transformed dataset"
},
{
"Name": "Model",
"Type": "TransformModel",
"Desc": "Transform model"
}
],
"InputKind": [
"ITransformInput"
],
"OutputKind": [
"ITransformOutput"
]
},
{
"Name": "Transforms.WordTokenizer",
"Desc": "The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed.",
Expand Down
Loading