Skip to content

Commit 7508b68

Browse files
committed
Scrub word embedding
1 parent 8bcc03c commit 7508b68

File tree

9 files changed

+20
-20
lines changed

9 files changed

+20
-20
lines changed

docs/code/MlNetCookBook.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -782,7 +782,7 @@ var pipeline =
782782
// NLP pipeline 4: word embeddings.
783783
.Append(mlContext.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage"))
784784
.Append(mlContext.Transforms.Text.ExtractWordEmbeddings("Embeddings", "TokenizedMessage",
785-
WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter25D));
785+
WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding));
786786

787787
// Let's train our pipeline, and then apply it to the same data.
788788
// Note that even on a small dataset of 70KB the pipeline above can take up to a minute to completely train.

src/Microsoft.ML.StaticPipe/WordEmbeddingsStaticExtensions.cs

+3-3
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ public static class WordEmbeddingsStaticExtensions
1414
/// <param name="input">Vector of tokenized text.</param>
1515
/// <param name="modelKind">The pretrained word embedding model.</param>
1616
/// <returns></returns>
17-
public static Vector<float> WordEmbeddings(this VarVector<string> input, WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
17+
public static Vector<float> WordEmbeddings(this VarVector<string> input, WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
1818
{
1919
Contracts.CheckValue(input, nameof(input));
2020
return new OutColumn(input, modelKind);
@@ -33,7 +33,7 @@ private sealed class OutColumn : Vector<float>
3333
{
3434
public PipelineColumn Input { get; }
3535

36-
public OutColumn(VarVector<string> input, WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
36+
public OutColumn(VarVector<string> input, WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
3737
: base(new Reconciler(modelKind), input)
3838
{
3939
Input = input;
@@ -51,7 +51,7 @@ private sealed class Reconciler : EstimatorReconciler
5151
private readonly WordEmbeddingsExtractingEstimator.PretrainedModelKind? _modelKind;
5252
private readonly string _customLookupTable;
5353

54-
public Reconciler(WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
54+
public Reconciler(WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
5555
{
5656
_modelKind = modelKind;
5757
_customLookupTable = null;

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.Text
112112
public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this TransformsCatalog.TextTransforms catalog,
113113
string outputColumnName,
114114
string inputColumnName = null,
115-
WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
115+
WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
116116
=> new WordEmbeddingsExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, modelKind);
117117

118118
/// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
@@ -146,7 +146,7 @@ public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this Trans
146146
/// </format>
147147
/// </example>
148148
public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this TransformsCatalog.TextTransforms catalog,
149-
WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe,
149+
WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding,
150150
params WordEmbeddingsExtractingEstimator.ColumnOptions[] columns)
151151
=> new WordEmbeddingsExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), modelKind, columns);
152152

src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs

+9-9
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ internal sealed class Options : TransformInputBase
6161
public Column[] Columns;
6262

6363
[Argument(ArgumentType.AtMostOnce, HelpText = "Pre-trained model used to create the vocabulary", ShortName = "model", SortOrder = 1)]
64-
public WordEmbeddingsExtractingEstimator.PretrainedModelKind? ModelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe;
64+
public WordEmbeddingsExtractingEstimator.PretrainedModelKind? ModelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding;
6565

6666
[Argument(ArgumentType.AtMostOnce, IsInputFileName = true, HelpText = "Filename for custom word embedding model",
6767
ShortName = "dataFile", SortOrder = 2)]
@@ -96,7 +96,7 @@ internal static VersionInfo GetVersionInfo()
9696
/// <summary>
9797
/// The names of the output and input column pairs on which the transformation is applied.
9898
/// </summary>
99-
public IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly();
99+
private IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly();
100100

101101
private sealed class Model
102102
{
@@ -162,7 +162,7 @@ public List<string> GetWordLabels()
162162
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
163163
/// <param name="modelKind">The pretrained word embedding model.</param>
164164
internal WordEmbeddingsExtractingTransformer(IHostEnvironment env, string outputColumnName, string inputColumnName = null,
165-
WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
165+
WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
166166
: this(env, modelKind, new WordEmbeddingsExtractingEstimator.ColumnOptions(outputColumnName, inputColumnName ?? outputColumnName))
167167
{
168168
}
@@ -227,7 +227,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa
227227
env.CheckValue(input, nameof(input));
228228

229229
if (options.ModelKind == null)
230-
options.ModelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe;
230+
options.ModelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding;
231231
env.CheckUserArg(!options.ModelKind.HasValue || Enum.IsDefined(typeof(WordEmbeddingsExtractingEstimator.PretrainedModelKind), options.ModelKind), nameof(options.ModelKind));
232232

233233
env.CheckValue(options.Columns, nameof(options.Columns));
@@ -614,7 +614,7 @@ private ValueGetter<VBuffer<float>> GetGetterVec(DataViewRow input, int iinfo)
614614
{ WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter100D, "glove.twitter.27B.100d.txt" },
615615
{ WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter200D, "glove.twitter.27B.200d.txt" },
616616
{ WordEmbeddingsExtractingEstimator.PretrainedModelKind.FastTextWikipedia300D, "wiki.en.vec" },
617-
{ WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe, "sentiment.emd" }
617+
{ WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding, "sentiment.emd" }
618618
};
619619

620620
private static Dictionary<WordEmbeddingsExtractingEstimator.PretrainedModelKind, int> _linesToSkipInModels = new Dictionary<WordEmbeddingsExtractingEstimator.PretrainedModelKind, int>()
@@ -630,7 +630,7 @@ private string EnsureModelFile(IHostEnvironment env, out int linesToSkip, WordEm
630630
linesToSkip = _linesToSkipInModels[kind];
631631
using (var ch = Host.Start("Ensuring resources"))
632632
{
633-
string dir = kind == WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe ? Path.Combine("Text", "Sswe") : "WordVectors";
633+
string dir = kind == WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding ? Path.Combine("Text", "Sswe") : "WordVectors";
634634
var url = $"{dir}/{modelFileName}";
635635
var ensureModel = ResourceManagerUtils.Instance.EnsureResource(Host, ch, url, modelFileName, dir, Timeout);
636636
ensureModel.Wait();
@@ -747,7 +747,7 @@ public sealed class WordEmbeddingsExtractingEstimator : IEstimator<WordEmbedding
747747
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
748748
/// <param name="modelKind">The embeddings <see cref="PretrainedModelKind"/> to use. </param>
749749
internal WordEmbeddingsExtractingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null,
750-
PretrainedModelKind modelKind = PretrainedModelKind.Sswe)
750+
PretrainedModelKind modelKind = PretrainedModelKind.SentimentSpecificWordEmbedding)
751751
: this(env, modelKind, new ColumnOptions(outputColumnName, inputColumnName ?? outputColumnName))
752752
{
753753
}
@@ -777,7 +777,7 @@ internal WordEmbeddingsExtractingEstimator(IHostEnvironment env, string outputCo
777777
/// <param name="modelKind">The embeddings <see cref="PretrainedModelKind"/> to use. </param>
778778
/// <param name="columns">The array of columns, and per-column configurations, to extract embeddings from.</param>
779779
internal WordEmbeddingsExtractingEstimator(IHostEnvironment env,
780-
PretrainedModelKind modelKind = PretrainedModelKind.Sswe,
780+
PretrainedModelKind modelKind = PretrainedModelKind.SentimentSpecificWordEmbedding,
781781
params ColumnOptions[] columns)
782782
{
783783
Contracts.CheckValue(env, nameof(env));
@@ -829,7 +829,7 @@ public enum PretrainedModelKind
829829
FastTextWikipedia300D = 8,
830830

831831
[TGUI(Label = "Sentiment-Specific Word Embedding")]
832-
Sswe = 9
832+
SentimentSpecificWordEmbedding = 9
833833
}
834834
/// <summary>
835835
/// Information for each column pair.

test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ public void TrainSentiment()
104104
}, "SentimentText").Fit(loader).Transform(loader);
105105

106106
var trans = mlContext.Transforms.Text.ExtractWordEmbeddings("Features", "WordEmbeddings_TransformedText",
107-
WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe).Fit(text).Transform(text);
107+
WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding).Fit(text).Transform(text);
108108

109109
// Train
110110
var trainer = mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent();

test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -3607,7 +3607,7 @@ public void EntryPointWordEmbeddings()
36073607
{
36083608
Data = dataView,
36093609
Columns = new[] { new WordEmbeddingsExtractingTransformer.Column { Name = "Features", Source = "Text" } },
3610-
ModelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe
3610+
ModelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding
36113611
});
36123612
var result = embedding.OutputData;
36133613
using (var cursor = result.GetRowCursorForAllColumns())

test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,7 @@ private void TextFeaturizationOn(string dataPath)
472472
// NLP pipeline 4: word embeddings.
473473
// PretrainedModelKind.SentimentSpecificWordEmbedding is used here for performance of the test. In a real
474474
// scenario, it is best to use a different model for more accuracy.
475-
Embeddings: r.Message.NormalizeText().TokenizeText().WordEmbeddings(WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
475+
Embeddings: r.Message.NormalizeText().TokenizeText().WordEmbeddings(WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
476476
));
477477

478478
// Let's train our pipeline, and then apply it to the same data.

test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ private void TextFeaturizationOn(string dataPath)
314314
// scenario, it is best to use a different model for more accuracy.
315315
.Append(mlContext.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage"))
316316
.Append(mlContext.Transforms.Text.ExtractWordEmbeddings("Embeddings", "TokenizedMessage",
317-
WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe));
317+
WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding));
318318

319319
// Let's train our pipeline, and then apply it to the same data.
320320
// Note that even on a small dataset of 70KB the pipeline above can take up to a minute to completely train.

test/Microsoft.ML.Tests/Transformers/WordEmbeddingsTests.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ public void TestWordEmbeddings()
3939
.Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
4040
var words = est.Fit(data).Transform(data);
4141

42-
var pipe = ML.Transforms.Text.ExtractWordEmbeddings("WordEmbeddings", "CleanWords", modelKind: WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe);
42+
var pipe = ML.Transforms.Text.ExtractWordEmbeddings("WordEmbeddings", "CleanWords", modelKind: WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding);
4343

4444
TestEstimatorCore(pipe, words, invalidInput: data);
4545

0 commit comments

Comments
 (0)