Creation of components through MLContext and cleanup (text related transforms) #2393

Merged
14 commits merged on Feb 6, 2019
4 changes: 2 additions & 2 deletions docs/code/MlNetCookBook.md
@@ -782,7 +782,7 @@ var pipeline =
// NLP pipeline 4: word embeddings.
.Append(mlContext.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage"))
.Append(mlContext.Transforms.Text.ExtractWordEmbeddings("Embeddings", "TokenizedMessage",
-WordEmbeddingsExtractingTransformer.PretrainedModelKind.GloVeTwitter25D));
+WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter25D));

// Let's train our pipeline, and then apply it to the same data.
// Note that even on a small dataset of 70KB the pipeline above can take up to a minute to completely train.
@@ -1020,4 +1020,4 @@ newContext.CompositionContainer = new CompositionContainer(new TypeCatalog(typeo
ITransformer loadedModel;
using (var fs = File.OpenRead(modelPath))
loadedModel = newContext.Model.Load(fs);
-```
+```
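
A minimal sketch of the updated cookbook pipeline, assembled entirely through MLContext. The `mlContext` instance and the `data` IDataView with its "NormalizedMessage" column are assumed from earlier cookbook steps; only the catalog calls and the renamed `WordEmbeddingsExtractingEstimator.PretrainedModelKind` enum come from this diff.

```csharp
// Sketch, not the full cookbook sample: `data` is an IDataView that already
// contains a text column named "NormalizedMessage" (assumed, produced by earlier steps).
var pipeline =
    // NLP pipeline 4: word embeddings.
    mlContext.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage")
    // The pretrained-model enum now lives on the estimator rather than the transformer.
    .Append(mlContext.Transforms.Text.ExtractWordEmbeddings("Embeddings", "TokenizedMessage",
        WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter25D));

// Train the pipeline and apply it to the same data; fitting loads the GloVe table,
// so even a small dataset can take a while.
var model = pipeline.Fit(data);
var embeddings = model.Transform(data);
```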
@@ -31,15 +31,15 @@ public static void KeyToValueValueToKey()
// making use of default settings.
string defaultColumnName = "DefaultKeys";
// REVIEW create through the catalog extension
-var default_pipeline = new WordTokenizingEstimator(ml, "Review")
+var default_pipeline = ml.Transforms.Text.TokenizeWords("Review")
.Append(ml.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review"));

// Another pipeline that customizes the advanced settings of the ValueToKeyMappingEstimator.
// We can change maxNumKeys to limit how many keys get generated out of the set of words,
// and control the order in which they get sorted by changing sort from the default Occurrence (order in which they get encountered)
// to value/alphabetically.
string customizedColumnName = "CustomizedKeys";
-var customized_pipeline = new WordTokenizingEstimator(ml, "Review")
+var customized_pipeline = ml.Transforms.Text.TokenizeWords("Review")
.Append(ml.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maxNumKeys: 10, sort: ValueToKeyMappingEstimator.SortOrder.Value));

// The transformed data.
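
For readers skimming this sample diff, a hedged sketch of the customized mapping in use; `ml` is the MLContext from the sample and `trainData` is an assumed IDataView with a text "Review" column.

```csharp
// Tokenize "Review" in place, then map each token to a key. MapValueToKey learns a
// dictionary during Fit and outputs a key-typed column of indices into that dictionary.
var customized_pipeline = ml.Transforms.Text.TokenizeWords("Review")
    .Append(ml.Transforms.Conversion.MapValueToKey("CustomizedKeys", "Review",
        maxNumKeys: 10, sort: ValueToKeyMappingEstimator.SortOrder.Value));

// maxNumKeys: 10 caps the learned dictionary at 10 keys, sorted by value (alphabetically)
// rather than by first occurrence; tokens that don't make it into the dictionary map to the missing key.
var transformedData = customized_pipeline.Fit(trainData).Transform(trainData);
```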
@@ -62,7 +62,7 @@ public static void ExtractEmbeddings()
// Let's apply pretrained word embedding model GloVeTwitter25D.
// 25D means each word is mapped into a 25-dimensional space, i.e. each word is represented by 25 float values.
var gloveWordEmbedding = ml.Transforms.Text.ExtractWordEmbeddings("GloveEmbeddings", "CleanWords",
-WordEmbeddingsExtractingTransformer.PretrainedModelKind.GloVeTwitter25D);
+WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter25D);

// We also have the option to apply custom word embedding models.
// Let's first create one.
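
As a companion to the sample above, a rough sketch of what the GloVe extraction produces once fitted; `ml` and an IDataView `data` holding the tokenized "CleanWords" column are assumed from the surrounding sample.

```csharp
// Same call as in the sample, with the pretrained-model enum now on the estimator.
var gloveWordEmbedding = ml.Transforms.Text.ExtractWordEmbeddings("GloveEmbeddings", "CleanWords",
    WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter25D);

// Fit locates the pretrained GloVe table; Transform produces one fixed-size float vector per row,
// the per-dimension minimum, average and maximum over the word vectors (3 * 25 = 75 values here).
var embedded = gloveWordEmbedding.Fit(data).Transform(data);
```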
4 changes: 4 additions & 0 deletions src/Microsoft.ML.Data/DataLoadSave/TransformWrapper.cs
@@ -180,6 +180,10 @@ protected TrivialWrapperEstimator(IHost host, TransformWrapper transformer)
{
}

+/// <summary>
+/// Returns the <see cref="SchemaShape"/> of the schema which will be produced by the transformer.
+/// Used for schema propagation and verification in a pipeline.
+/// </summary>
public override SchemaShape GetOutputSchema(SchemaShape inputSchema)
{
Host.CheckValue(inputSchema, nameof(inputSchema));
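
The new doc comment describes what GetOutputSchema is for: callers can validate an entire chain before fitting. A sketch of that usage, under the assumption that `SchemaShape.Create` and the catalog calls shown elsewhere in this PR behave as in this version of the API; the column names are placeholders.

```csharp
// Sketch: check what a pipeline will produce without fitting it.
// `mlContext` and an IDataView `data` with a text column "Text" are assumptions.
var pipeline = mlContext.Transforms.Text.TokenizeWords("Tokens", "Text")
    .Append(mlContext.Transforms.Conversion.MapValueToKey("Keys", "Tokens"));

// Describe the input as a SchemaShape and propagate it through every estimator in the chain.
// Each estimator's GetOutputSchema (like the one documented above) verifies that the columns it
// needs are present and adds the columns it will produce, so a misconfigured column name fails
// here instead of deep inside Fit.
SchemaShape inputShape = SchemaShape.Create(data.Schema);
SchemaShape outputShape = pipeline.GetOutputSchema(inputShape);
```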
4 changes: 2 additions & 2 deletions src/Microsoft.ML.StaticPipe/LdaStaticExtensions.cs
@@ -101,13 +101,13 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
IReadOnlyDictionary<PipelineColumn, string> outputNames,
IReadOnlyCollection<string> usedNames)
{
-var infos = new LatentDirichletAllocationTransformer.ColumnInfo[toOutput.Length];
+var infos = new LatentDirichletAllocationEstimator.ColumnInfo[toOutput.Length];
Action<LatentDirichletAllocationTransformer> onFit = null;
for (int i = 0; i < toOutput.Length; ++i)
{
var tcol = (ILdaCol)toOutput[i];

-infos[i] = new LatentDirichletAllocationTransformer.ColumnInfo(outputNames[toOutput[i]],
+infos[i] = new LatentDirichletAllocationEstimator.ColumnInfo(outputNames[toOutput[i]],
inputNames[tcol.Input],
tcol.Config.NumTopic,
tcol.Config.AlphaSum,
8 changes: 4 additions & 4 deletions src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs
@@ -151,9 +151,9 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
{
Contracts.Assert(toOutput.Length == 1);

-var columns = new List<StopWordsRemovingTransformer.ColumnInfo>();
+var columns = new List<StopWordsRemovingEstimator.ColumnInfo>();
foreach (var outCol in toOutput)
-columns.Add(new StopWordsRemovingTransformer.ColumnInfo(outputNames[outCol], inputNames[((OutPipelineColumn)outCol).Input], _language));
+columns.Add(new StopWordsRemovingEstimator.ColumnInfo(outputNames[outCol], inputNames[((OutPipelineColumn)outCol).Input], _language));

return new StopWordsRemovingEstimator(env, columns.ToArray());
}
@@ -559,9 +559,9 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
IReadOnlyCollection<string> usedNames)
{
Contracts.Assert(toOutput.Length == 1);
-var columns = new List<NgramHashingTransformer.ColumnInfo>();
+var columns = new List<NgramHashingEstimator.ColumnInfo>();
foreach (var outCol in toOutput)
-columns.Add(new NgramHashingTransformer.ColumnInfo(outputNames[outCol], new[] { inputNames[((OutPipelineColumn)outCol).Input] },
+columns.Add(new NgramHashingEstimator.ColumnInfo(outputNames[outCol], new[] { inputNames[((OutPipelineColumn)outCol).Input] },
_ngramLength, _skipLength, _allLengths, _hashBits, _seed, _ordered, _invertHash));

return new NgramHashingEstimator(env, columns.ToArray());
12 changes: 6 additions & 6 deletions src/Microsoft.ML.StaticPipe/WordEmbeddingsStaticExtensions.cs
@@ -15,7 +15,7 @@ public static class WordEmbeddingsStaticExtensions
/// <param name="input">Vector of tokenized text.</param>
/// <param name="modelKind">The pretrained word embedding model.</param>
/// <returns></returns>
-public static Vector<float> WordEmbeddings(this VarVector<string> input, WordEmbeddingsExtractingTransformer.PretrainedModelKind modelKind = WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe)
+public static Vector<float> WordEmbeddings(this VarVector<string> input, WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
{
Contracts.CheckValue(input, nameof(input));
return new OutColumn(input, modelKind);
@@ -34,7 +34,7 @@ private sealed class OutColumn : Vector<float>
{
public PipelineColumn Input { get; }

-public OutColumn(VarVector<string> input, WordEmbeddingsExtractingTransformer.PretrainedModelKind modelKind = WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe)
+public OutColumn(VarVector<string> input, WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
: base(new Reconciler(modelKind), input)
{
Input = input;
@@ -49,10 +49,10 @@ public OutColumn(VarVector<string> input, string customModelFile = null)

private sealed class Reconciler : EstimatorReconciler
{
-private readonly WordEmbeddingsExtractingTransformer.PretrainedModelKind? _modelKind;
+private readonly WordEmbeddingsExtractingEstimator.PretrainedModelKind? _modelKind;
private readonly string _customLookupTable;

-public Reconciler(WordEmbeddingsExtractingTransformer.PretrainedModelKind modelKind = WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe)
+public Reconciler(WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
{
_modelKind = modelKind;
_customLookupTable = null;
@@ -72,11 +72,11 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
{
Contracts.Assert(toOutput.Length == 1);

-var cols = new WordEmbeddingsExtractingTransformer.ColumnInfo[toOutput.Length];
+var cols = new WordEmbeddingsExtractingEstimator.ColumnInfo[toOutput.Length];
for (int i = 0; i < toOutput.Length; ++i)
{
var outCol = (OutColumn)toOutput[i];
-cols[i] = new WordEmbeddingsExtractingTransformer.ColumnInfo(outputNames[outCol], inputNames[outCol.Input]);
+cols[i] = new WordEmbeddingsExtractingEstimator.ColumnInfo(outputNames[outCol], inputNames[outCol.Input]);
}

bool customLookup = !string.IsNullOrWhiteSpace(_customLookupTable);
12 changes: 6 additions & 6 deletions src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs
@@ -36,7 +36,7 @@ public static CommonOutputs.TransformOutput TextTransform(IHostEnvironment env,
Desc = ML.Transforms.Text.WordTokenizingTransformer.Summary,
UserName = ML.Transforms.Text.WordTokenizingTransformer.UserName,
ShortName = ML.Transforms.Text.WordTokenizingTransformer.LoaderSignature)]
-public static CommonOutputs.TransformOutput DelimitedTokenizeTransform(IHostEnvironment env, WordTokenizingTransformer.Arguments input)
+public static CommonOutputs.TransformOutput DelimitedTokenizeTransform(IHostEnvironment env, WordTokenizingTransformer.Options input)
{
var h = EntryPointUtils.CheckArgsAndCreateHost(env, "DelimitedTokenizeTransform", input);
var xf = ML.Transforms.Text.WordTokenizingTransformer.Create(h, input, input.Data);
@@ -51,7 +51,7 @@ public static CommonOutputs.TransformOutput DelimitedTokenizeTransform(IHostEnvi
Desc = NgramExtractingTransformer.Summary,
UserName = NgramExtractingTransformer.UserName,
ShortName = NgramExtractingTransformer.LoaderSignature)]
-public static CommonOutputs.TransformOutput NGramTransform(IHostEnvironment env, NgramExtractingTransformer.Arguments input)
+public static CommonOutputs.TransformOutput NGramTransform(IHostEnvironment env, NgramExtractingTransformer.Options input)
{
var h = EntryPointUtils.CheckArgsAndCreateHost(env, "NGramTransform", input);
var xf = NgramExtractingTransformer.Create(h, input, input.Data);
@@ -96,7 +96,7 @@ public static CommonOutputs.TransformOutput AnalyzeSentiment(IHostEnvironment en
Desc = TokenizingByCharactersTransformer.Summary,
UserName = TokenizingByCharactersTransformer.UserName,
ShortName = TokenizingByCharactersTransformer.LoaderSignature)]
-public static CommonOutputs.TransformOutput CharTokenize(IHostEnvironment env, TokenizingByCharactersTransformer.Arguments input)
+public static CommonOutputs.TransformOutput CharTokenize(IHostEnvironment env, TokenizingByCharactersTransformer.Options input)
{
Contracts.CheckValue(env, nameof(env));
env.CheckValue(input, nameof(input));
@@ -114,13 +114,13 @@ public static CommonOutputs.TransformOutput CharTokenize(IHostEnvironment env, T
Desc = LatentDirichletAllocationTransformer.Summary,
UserName = LatentDirichletAllocationTransformer.UserName,
ShortName = LatentDirichletAllocationTransformer.ShortName)]
-public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, LatentDirichletAllocationTransformer.Arguments input)
+public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, LatentDirichletAllocationTransformer.Options input)
{
Contracts.CheckValue(env, nameof(env));
env.CheckValue(input, nameof(input));

var h = EntryPointUtils.CheckArgsAndCreateHost(env, "LightLda", input);
-var cols = input.Columns.Select(colPair => new LatentDirichletAllocationTransformer.ColumnInfo(colPair, input)).ToArray();
+var cols = input.Columns.Select(colPair => new LatentDirichletAllocationEstimator.ColumnInfo(colPair, input)).ToArray();
var est = new LatentDirichletAllocationEstimator(h, cols);
var view = est.Fit(input.Data).Transform(input.Data);

@@ -135,7 +135,7 @@ public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, Laten
Desc = WordEmbeddingsExtractingTransformer.Summary,
UserName = WordEmbeddingsExtractingTransformer.UserName,
ShortName = WordEmbeddingsExtractingTransformer.ShortName)]
-public static CommonOutputs.TransformOutput WordEmbeddings(IHostEnvironment env, WordEmbeddingsExtractingTransformer.Arguments input)
+public static CommonOutputs.TransformOutput WordEmbeddings(IHostEnvironment env, WordEmbeddingsExtractingTransformer.Options input)
{
Contracts.CheckValue(env, nameof(env));
env.CheckValue(input, nameof(input));