Skip to content

API scenarios implemented with low-level functions #653

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Aug 17, 2018
17 changes: 16 additions & 1 deletion src/Microsoft.ML.Core/EntryPoints/ComponentFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ public interface IArgsComponent : IComponentFactory
/// <summary>
/// An interface for creating a component with no extra parameters (other than an <see cref="IHostEnvironment"/>).
/// </summary>
public interface IComponentFactory<out TComponent>: IComponentFactory
public interface IComponentFactory<out TComponent> : IComponentFactory
{
TComponent CreateComponent(IHostEnvironment env);
}
Expand Down Expand Up @@ -57,6 +57,21 @@ public TComponent CreateComponent(IHostEnvironment env, TArg1 argument1)
}
}

/// <summary>
/// An <see cref="IComponentFactory{TComponent}"/> backed by a delegate:
/// <see cref="CreateComponent"/> invokes the delegate supplied at construction time.
/// </summary>
/// <typeparam name="TComponent">The type of component this factory produces.</typeparam>
public class SimpleComponentFactory<TComponent> : IComponentFactory<TComponent>
{
    // Assigned once in the constructor and never replaced afterwards.
    private readonly Func<IHostEnvironment, TComponent> _factory;

    /// <summary>
    /// Creates a factory that delegates component creation to <paramref name="factory"/>.
    /// </summary>
    /// <param name="factory">Delegate that builds the component from a host environment.</param>
    /// <exception cref="ArgumentNullException"><paramref name="factory"/> is null.</exception>
    public SimpleComponentFactory(Func<IHostEnvironment, TComponent> factory)
    {
        // Fail here rather than with a NullReferenceException on the first CreateComponent call.
        if (factory == null)
            throw new ArgumentNullException(nameof(factory));

        _factory = factory;
    }

    /// <summary>
    /// Creates the component by invoking the wrapped delegate with <paramref name="env"/>.
    /// </summary>
    /// <param name="env">The host environment passed through to the delegate.</param>
    /// <returns>Whatever the wrapped delegate returns.</returns>
    public TComponent CreateComponent(IHostEnvironment env)
    {
        return _factory(env);
    }
}

/// <summary>
/// An interface for creating a component when we take two extra parameters (and an <see cref="IHostEnvironment"/>).
/// </summary>
Expand Down
10 changes: 5 additions & 5 deletions src/Microsoft.ML.FastTree/FastTreeArguments.cs
Original file line number Diff line number Diff line change
Expand Up @@ -228,14 +228,14 @@ public abstract class TreeArgs : LearnerInputBaseWithGroupId
// REVIEW: Different from original FastRank arguments (shortname l vs. nl). Different default from TLC FR Wrapper (20 vs. 20).
[Argument(ArgumentType.LastOccurenceWins, HelpText = "The max number of leaves in each regression tree", ShortName = "nl", SortOrder = 2)]
[TGUI(Description = "The maximum number of leaves per tree", SuggestedSweeps = "2-128;log;inc:4")]
[TlcModule.SweepableLongParamAttribute("NumLeaves", 2, 128, isLogScale:true, stepSize:4)]
[TlcModule.SweepableLongParamAttribute("NumLeaves", 2, 128, isLogScale: true, stepSize: 4)]
public int NumLeaves = 20;

// REVIEW: Arrays not supported in GUI
// REVIEW: Different shortname than FastRank module. Same as the TLC FRWrapper.
[Argument(ArgumentType.LastOccurenceWins, HelpText = "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", ShortName = "mil", SortOrder = 3)]
[TGUI(Description = "Minimum number of training instances required to form a leaf", SuggestedSweeps = "1,10,50")]
[TlcModule.SweepableDiscreteParamAttribute("MinDocumentsInLeafs", new object[] {1, 10, 50})]
[TlcModule.SweepableDiscreteParamAttribute("MinDocumentsInLeafs", new object[] { 1, 10, 50 })]
public int MinDocumentsInLeafs = 10;

// REVIEW: Different shortname than FastRank module. Same as the TLC FRWrapper.
Expand Down Expand Up @@ -364,17 +364,17 @@ public enum OptimizationAlgorithmType { GradientDescent, AcceleratedGradientDesc

[Argument(ArgumentType.LastOccurenceWins, HelpText = "The learning rate", ShortName = "lr", SortOrder = 4)]
[TGUI(Label = "Learning Rate", SuggestedSweeps = "0.025-0.4;log")]
[TlcModule.SweepableFloatParamAttribute("LearningRates", 0.025f, 0.4f, isLogScale:true)]
[TlcModule.SweepableFloatParamAttribute("LearningRates", 0.025f, 0.4f, isLogScale: true)]
public Double LearningRates = 0.2;

[Argument(ArgumentType.AtMostOnce, HelpText = "Shrinkage", ShortName = "shrk")]
[TGUI(Label = "Shrinkage", SuggestedSweeps = "0.25-4;log")]
[TlcModule.SweepableFloatParamAttribute("Shrinkage", 0.025f, 4f, isLogScale:true)]
[TlcModule.SweepableFloatParamAttribute("Shrinkage", 0.025f, 4f, isLogScale: true)]
public Double Shrinkage = 1;

[Argument(ArgumentType.AtMostOnce, HelpText = "Dropout rate for tree regularization", ShortName = "tdrop")]
[TGUI(SuggestedSweeps = "0,0.000000001,0.05,0.1,0.2")]
[TlcModule.SweepableDiscreteParamAttribute("DropoutRate", new object[] { 0.0f, 1E-9f, 0.05f, 0.1f, 0.2f})]
[TlcModule.SweepableDiscreteParamAttribute("DropoutRate", new object[] { 0.0f, 1E-9f, 0.05f, 0.1f, 0.2f })]
public Double DropoutRate = 0;

[Argument(ArgumentType.AtMostOnce, HelpText = "Sample each query 1 in k times in the GetDerivatives function", ShortName = "sr")]
Expand Down
58 changes: 58 additions & 0 deletions test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Runtime.Api;
using Microsoft.ML.TestFramework;
using Xunit.Abstractions;

namespace Microsoft.ML.Tests.Scenarios.Api
{
/// <summary>
/// Common utility functions for API scenarios tests.
/// This part of the partial class declares the data file names and the
/// input/output row types that the scenario tests work with.
/// </summary>
public partial class ApiScenariosTests : BaseTestClass
{
/// <summary>
/// Passes the xUnit output helper on to the <see cref="BaseTestClass"/> constructor.
/// </summary>
public ApiScenariosTests(ITestOutputHelper output) : base(output)
{
}

// Data file names. The scenario tests hand these to GetDataPath to obtain the path to load.
public const string IrisDataPath = "iris.data";
public const string SentimentDataPath = "wikipedia-detox-250-line-data.tsv";
public const string SentimentTestPath = "wikipedia-detox-250-line-test.tsv";

/// <summary>
/// Iris row including its label: the four measurements inherited from
/// <see cref="IrisDataNoLabel"/> plus <see cref="Label"/>.
/// </summary>
public class IrisData : IrisDataNoLabel
{
public string Label;
}

/// <summary>
/// Iris row without a label: only the four float measurements.
/// </summary>
public class IrisDataNoLabel
{
public float SepalLength;
public float SepalWidth;
public float PetalLength;
public float PetalWidth;
}

/// <summary>
/// Output row for Iris predictions: the predicted label as a string and an array of scores.
/// </summary>
public class IrisPrediction
{
public string PredictedLabel;
// NOTE(review): presumably one score per class; confirm against the scorer's output schema.
public float[] Score;
}

/// <summary>
/// Input row for the sentiment data set.
/// </summary>
public class SentimentData
{
// The field is named Sentiment but is annotated with the column name "Label".
[ColumnName("Label")]
public bool Sentiment;
public string SentimentText;
}

/// <summary>
/// Output row for sentiment predictions.
/// </summary>
public class SentimentPrediction
{
// The field is named Sentiment but is annotated with the column name "PredictedLabel".
[ColumnName("PredictedLabel")]
public bool Sentiment;

public float Score;
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Learners;
using Xunit;

namespace Microsoft.ML.Tests.Scenarios.Api
{
public partial class ApiScenariosTests
{
/// <summary>
/// Auto-normalization and caching: It should be relatively easy for normalization
/// and caching to be introduced for training, if the trainer supports or would benefit
/// from that.
/// </summary>
[Fact]
public void AutoNormalizationAndCaching()
{
    var dataPath = GetDataPath(SentimentDataPath);

    using (var env = new TlcEnvironment(seed: 1, conc: 1))
    {
        // Pipeline.
        var loader = new TextLoader(env, MakeSentimentTextLoaderArgs(), new MultiFileSource(dataPath));

        var trans = TextTransform.Create(env, MakeSentimentTextTransformArgs(false), loader);

        // Train.
        var trainer = new LinearClassificationTrainer(env, new LinearClassificationTrainer.Arguments
        {
            NumThreads = 1,
            ConvergenceTolerance = 1f
        });

        // Auto-caching: wrap the pipeline in a cache only when the trainer asks for it.
        IDataView trainData = trainer.Info.WantCaching ? (IDataView)new CacheDataView(env, trans, prefetch: null) : trans;
        var trainRoles = new RoleMappedData(trainData, label: "Label", feature: "Features");

        // Auto-normalization: trainRoles is passed by ref so the call can replace it.
        NormalizeTransform.CreateIfNeeded(env, ref trainRoles, trainer);
        var predictor = trainer.Train(new Runtime.TrainContext(trainRoles));

        // The scenario is only meaningful if training actually produced a predictor.
        Assert.NotNull(predictor);
    }
}
}
}
95 changes: 95 additions & 0 deletions test/Microsoft.ML.Tests/Scenarios/Api/CrossValidation.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Models;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Learners;
using System;
using System.Collections.Generic;
using System.Linq;
using Xunit;

namespace Microsoft.ML.Tests.Scenarios.Api
{
public partial class ApiScenariosTests
{
/// <summary>
/// Cross-validation: Have a mechanism to do cross validation, that is, you come up with
/// a data source (optionally with stratification column), come up with an instantiable transform
/// and trainer pipeline, and it will handle (1) splitting up the data, (2) training the separate
/// pipelines on in-fold data, (3) scoring on the out-fold data, (4) returning the set of
/// evaluations and optionally trained pipes. (People always want metrics out of xfold,
/// they sometimes want the actual models too.)
/// </summary>
[Fact]
public void CrossValidation()
{
    var dataPath = GetDataPath(SentimentDataPath);

    int numFolds = 5;
    using (var env = new TlcEnvironment(seed: 1, conc: 1))
    {
        // Pipeline.
        var loader = new TextLoader(env, MakeSentimentTextLoaderArgs(), new MultiFileSource(dataPath));

        var text = TextTransform.Create(env, MakeSentimentTextTransformArgs(false), loader);
        // Adds the "StratificationColumn" column that the range filters below split on.
        IDataView trans = new GenerateNumberTransform(env, text, "StratificationColumn");

        // Train.
        var trainer = new LinearClassificationTrainer(env, new LinearClassificationTrainer.Arguments
        {
            NumThreads = 1,
            ConvergenceTolerance = 1f
        });

        var metrics = new List<BinaryClassificationMetrics>();
        for (int fold = 0; fold < numFolds; fold++)
        {
            // In-fold (training) data: everything outside [fold/numFolds, (fold+1)/numFolds).
            IDataView trainPipe = new RangeFilter(env, new RangeFilter.Arguments()
            {
                Column = "StratificationColumn",
                Min = (Double)fold / numFolds,
                Max = (Double)(fold + 1) / numFolds,
                Complement = true
            }, trans);
            trainPipe = new OpaqueDataView(trainPipe);
            var trainData = new RoleMappedData(trainPipe, label: "Label", feature: "Features");

            // Auto-normalization.
            NormalizeTransform.CreateIfNeeded(env, ref trainData, trainer);
            // Keep the un-cached view: its transforms are re-applied to the test fold below.
            var preCachedData = trainData;

            // Auto-caching.
            if (trainer.Info.WantCaching)
            {
                var prefetch = trainData.Schema.GetColumnRoles().Select(kc => kc.Value.Index).ToArray();
                var cacheView = new CacheDataView(env, trainData.Data, prefetch);
                // Because the prefetching worked, we know that these are valid columns.
                trainData = new RoleMappedData(cacheView, trainData.Schema.GetColumnRoleNames());
            }

            var predictor = trainer.Train(new Runtime.TrainContext(trainData));

            // Out-fold (test) data: the same range with Complement = false.
            IDataView testPipe = new RangeFilter(env, new RangeFilter.Arguments()
            {
                Column = "StratificationColumn",
                Min = (Double)fold / numFolds,
                Max = (Double)(fold + 1) / numFolds,
                Complement = false
            }, trans);
            testPipe = new OpaqueDataView(testPipe);
            var pipe = ApplyTransformUtils.ApplyAllTransformsToData(env, preCachedData.Data, testPipe, trainPipe);

            var testRoles = new RoleMappedData(pipe, trainData.Schema.GetColumnRoleNames());

            IDataScorerTransform scorer = ScoreUtils.GetScorer(predictor, testRoles, env, testRoles.Schema);

            BinaryClassifierMamlEvaluator eval = new BinaryClassifierMamlEvaluator(env, new BinaryClassifierMamlEvaluator.Arguments() { });
            var dataEval = new RoleMappedData(scorer, testRoles.Schema.GetColumnRoleNames(), opt: true);
            var dict = eval.Evaluate(dataEval);
            var foldMetrics = BinaryClassificationMetrics.FromMetrics(env, dict["OverallMetrics"], dict["ConfusionMatrix"]);
            // Single() enforces exactly one metrics entry per fold.
            metrics.Add(foldMetrics.Single());
        }

        // Every fold must have contributed exactly one set of metrics.
        Assert.Equal(numFolds, metrics.Count);
    }
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Runtime.Api;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Learners;
using System.Linq;
using Xunit;

namespace Microsoft.ML.Tests.Scenarios.Api
{

public partial class ApiScenariosTests
{
/// <summary>
/// Decomposable train and predict: Train on Iris multiclass problem, which will require
/// a transform on labels. Be able to reconstitute the pipeline for a prediction only task,
/// which will essentially "drop" the transform over labels, while retaining the property
/// that the predicted label for this has a key-type, the probability outputs for the classes
/// have the class labels as slot names, etc. This should be do-able without ugly compromises like,
/// say, injecting a dummy label.
/// </summary>
[Fact]
public void DecomposableTrainAndPredict()
{
    var dataPath = GetDataPath(IrisDataPath);
    using (var env = new TlcEnvironment())
    {
        var loader = new TextLoader(env, MakeIrisTextLoaderArgs(), new MultiFileSource(dataPath));
        var term = new TermTransform(env, loader, "Label");
        var concat = new ConcatTransform(env, term, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth");
        var trainer = new SdcaMultiClassTrainer(env, new SdcaMultiClassTrainer.Arguments { MaxIterations = 100, Shuffle = true, NumThreads = 1 });

        // Auto-caching: wrap the pipeline in a cache only when the trainer asks for it.
        IDataView trainData = trainer.Info.WantCaching ? (IDataView)new CacheDataView(env, concat, prefetch: null) : concat;
        var trainRoles = new RoleMappedData(trainData, label: "Label", feature: "Features");

        // Auto-normalization.
        NormalizeTransform.CreateIfNeeded(env, ref trainRoles, trainer);
        var predictor = trainer.Train(new Runtime.TrainContext(trainRoles));

        var scoreRoles = new RoleMappedData(concat, label: "Label", feature: "Features");
        IDataScorerTransform scorer = ScoreUtils.GetScorer(predictor, scoreRoles, env, trainRoles.Schema);

        // Cut out term transform from pipeline.
        var newScorer = ApplyTransformUtils.ApplyAllTransformsToData(env, scorer, loader, term);
        var keyToValue = new KeyToValueTransform(env, newScorer, "PredictedLabel");
        var model = env.CreatePredictionEngine<IrisDataNoLabel, IrisPrediction>(keyToValue);

        var testLoader = new TextLoader(env, MakeIrisTextLoaderArgs(), new MultiFileSource(dataPath));
        var testData = testLoader.AsEnumerable<IrisDataNoLabel>(env, false);

        // The data file is sorted by label (first 50 rows are Iris-setosa, then 50
        // Iris-versicolor, then 50 Iris-virginica), so each of the first 20 rows
        // is expected to be predicted as Iris-setosa.
        foreach (var input in testData.Take(20))
        {
            var prediction = model.Predict(input);
            // Assert.Equal reports expected vs. actual on failure, unlike Assert.True(a == b).
            Assert.Equal("Iris-setosa", prediction.PredictedLabel);
        }
    }
}
}
}
62 changes: 62 additions & 0 deletions test/Microsoft.ML.Tests/Scenarios/Api/Evaluation.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Runtime.Api;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Learners;
using Xunit;
using Microsoft.ML.Models;

namespace Microsoft.ML.Tests.Scenarios.Api
{
public partial class ApiScenariosTests
{
/// <summary>
/// Evaluation: Similar to the simple train scenario, except instead of having some
/// predictive structure, be able to score another "test" data file, run the result
/// through an evaluator and get metrics like AUC, accuracy, PR curves, and whatnot.
/// Getting metrics out of this should be as straightforward and unannoying as possible.
/// </summary>
[Fact]
public void Evaluation()
{
    var dataPath = GetDataPath(SentimentDataPath);
    var testDataPath = GetDataPath(SentimentTestPath);

    using (var env = new TlcEnvironment(seed: 1, conc: 1))
    {
        // Pipeline
        var loader = new TextLoader(env, MakeSentimentTextLoaderArgs(), new MultiFileSource(dataPath));

        var trans = TextTransform.Create(env, MakeSentimentTextTransformArgs(), loader);

        // Train
        var trainer = new LinearClassificationTrainer(env, new LinearClassificationTrainer.Arguments
        {
            NumThreads = 1
        });

        // Train on a cached view, but bind the scorer to the un-cached pipeline.
        var cached = new CacheDataView(env, trans, prefetch: null);
        var trainRoles = new RoleMappedData(cached, label: "Label", feature: "Features");
        var predictor = trainer.Train(new Runtime.TrainContext(trainRoles));
        var scoreRoles = new RoleMappedData(trans, label: "Label", feature: "Features");
        IDataScorerTransform scorer = ScoreUtils.GetScorer(predictor, scoreRoles, env, trainRoles.Schema);

        // Create prediction engine and test predictions.
        var model = env.CreatePredictionEngine<SentimentData, SentimentPrediction>(scorer);

        // Take a couple examples out of the test data and run predictions on top.
        // Reuses the test path resolved above instead of resolving it a second time.
        var testLoader = new TextLoader(env, MakeSentimentTextLoaderArgs(), new MultiFileSource(testDataPath));
        var testData = testLoader.AsEnumerable<SentimentData>(env, false);

        // NOTE(review): `scorer` is built over `trans`, i.e. the pipeline loaded from dataPath,
        // and neither `model` nor `testData` is consumed below, so these metrics appear to be
        // computed on the training file rather than the test file. Confirm whether that is intended.
        var dataEval = new RoleMappedData(scorer, label: "Label", feature: "Features", opt: true);

        var evaluator = new BinaryClassifierMamlEvaluator(env, new BinaryClassifierMamlEvaluator.Arguments() { });
        var metricsDict = evaluator.Evaluate(dataEval);

        var metrics = BinaryClassificationMetrics.FromMetrics(env, metricsDict["OverallMetrics"], metricsDict["ConfusionMatrix"])[0];
    }
}
}
}
Loading