Skip to content

Commit 86cf8c9

Browse files
author
Ivan Matantsev
committed
add 10 examples of api scenarios
1 parent f9d3973 commit 86cf8c9

11 files changed

+714
-0
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
using Microsoft.ML.Runtime.Api;
2+
using Microsoft.ML.Runtime.Data;
3+
using Microsoft.ML.TestFramework;
4+
using System;
5+
using System.Collections.Generic;
6+
using System.Text;
7+
using Xunit.Abstractions;
8+
9+
namespace Microsoft.ML.Tests.Scenarios.Api
10+
{
11+
/// <summary>
/// Shared pieces for the API scenario tests: data file names plus the plain
/// classes used to read input rows and to receive predictions.
/// </summary>
public partial class ApiScenariosTests : BaseTestClass
{
    /// <summary>File name of the iris data set, resolved by tests through GetDataPath.</summary>
    public const string IrisDataPath = "iris.data";

    /// <summary>File name of the sentiment data that the tests load for training.</summary>
    public const string SentimentDataPath = "wikipedia-detox-250-line-data.tsv";

    /// <summary>File name of the sentiment test split (judging by its name).</summary>
    public const string SentimentTestPath = "wikipedia-detox-250-line-test.tsv";

    /// <summary>
    /// Creates the test class and hands the xUnit output helper to the base class.
    /// </summary>
    /// <param name="output">Output helper supplied by xUnit.</param>
    public ApiScenariosTests(ITestOutputHelper output)
        : base(output)
    {
    }

    /// <summary>
    /// One iris example: four float measurements and a string label.
    /// </summary>
    public class IrisData
    {
        public float SepalLength;
        public float SepalWidth;
        public float PetalLength;
        public float PetalWidth;
        public string Label;
    }

    /// <summary>
    /// Prediction produced for an <see cref="IrisData"/> example: the predicted
    /// label and an array of scores.
    /// </summary>
    public class IrisPrediction
    {
        public string PredictedLabel;
        public float[] Score;
    }

    /// <summary>
    /// One sentiment example. The boolean field carries a ColumnName attribute
    /// naming it "Label".
    /// </summary>
    public class SentimentData
    {
        [ColumnName("Label")]
        public bool Sentiment;
        public string SentimentText;
    }

    /// <summary>
    /// Prediction produced for a <see cref="SentimentData"/> example. The boolean
    /// field carries a ColumnName attribute naming it "PredictedLabel".
    /// </summary>
    public class SentimentPrediction
    {
        [ColumnName("PredictedLabel")]
        public bool Sentiment;

        public float Score;
    }
}
54+
}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
using Microsoft.ML.Runtime.Data;
2+
using Microsoft.ML.Runtime.Learners;
3+
using System;
4+
using System.Collections.Generic;
5+
using System.Text;
6+
using Xunit;
7+
8+
namespace Microsoft.ML.Tests.Scenarios.Api
9+
{
10+
public partial class ApiScenariosTests
{
    /// <summary>
    /// Auto-normalization and caching: It should be relatively easy for normalization
    /// and caching to be introduced for training, if the trainer supports or would benefit
    /// from that.
    /// </summary>
    [Fact]
    public void AutoNormalizationAndCaching()
    {
        var dataPath = GetDataPath(SentimentDataPath);

        using (var env = new TlcEnvironment(seed: 1, conc: 1))
        {
            // Pipeline: load the sentiment file and featurize its text.
            var loader = new TextLoader(env, MakeSentimentTextLoaderArgs(), new MultiFileSource(dataPath));
            var trans = TextTransform.Create(env, MakeSentimentTextTransformArgs(false), loader);

            // Train.
            var trainer = new LinearClassificationTrainer(env, new LinearClassificationTrainer.Arguments
            {
                NumThreads = 1,
                ConvergenceTolerance = 1f
            });

            // Auto-caching: wrap the pipeline in a cache only when the trainer asks for it.
            IDataView trainData = trainer.Info.WantCaching ? (IDataView)new CacheDataView(env, trans, prefetch: null) : trans;
            var trainRoles = new RoleMappedData(trainData, label: "Label", feature: "Features");

            // Auto-normalization: trainRoles is passed by ref so the call can replace it.
            NormalizeTransform.CreateIfNeeded(env, ref trainRoles, trainer);
            var predictor = trainer.Train(new Runtime.TrainContext(trainRoles));

            // The scenario is only about wiring; make sure training actually produced a model.
            Assert.NotNull(predictor);
        }
    }
}
48+
}
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
using Microsoft.ML.Models;
2+
using Microsoft.ML.Runtime.Data;
3+
using Microsoft.ML.Runtime.Learners;
4+
using System;
5+
using System.Collections.Generic;
6+
using Xunit;
7+
8+
namespace Microsoft.ML.Tests.Scenarios.Api
9+
{
10+
public partial class ApiScenariosTests
{
    /// <summary>
    /// Cross-validation: Have a mechanism to do cross validation, that is, you come up with
    /// a data source (optionally with stratification column), come up with an instantiable transform
    /// and trainer pipeline, and it will handle (1) splitting up the data, (2) training the separate
    /// pipelines on in-fold data, (3) scoring on the out-fold data, (4) returning the set of
    /// evaluations and optionally trained pipes. (People always want metrics out of xfold,
    /// they sometimes want the actual models too.)
    /// </summary>
    [Fact]
    public void CrossValidation()
    {
        var dataPath = GetDataPath(SentimentDataPath);

        int numFolds = 5;
        using (var env = new TlcEnvironment(seed: 1, conc: 1))
        {
            // Pipeline: load, featurize, then add the generated column that the folds are cut on.
            var loader = new TextLoader(env, MakeSentimentTextLoaderArgs(), new MultiFileSource(dataPath));
            var trans = TextTransform.Create(env, MakeSentimentTextTransformArgs(false), loader);
            var random = new GenerateNumberTransform(env, trans, "StratificationColumn");

            // Train.
            var trainer = new LinearClassificationTrainer(env, new LinearClassificationTrainer.Arguments
            {
                NumThreads = 1,
                ConvergenceTolerance = 1f
            });

            // Auto-caching.
            IDataView trainData = trainer.Info.WantCaching ? (IDataView)new CacheDataView(env, random, prefetch: null) : random;
            var metrics = new List<BinaryClassificationMetrics>();
            for (int fold = 0; fold < numFolds; fold++)
            {
                // Both filters share the same [foldMin, foldMax) slice of the stratification column.
                double foldMin = (double)fold / numFolds;
                double foldMax = (double)(fold + 1) / numFolds;

                // Training rows: everything outside the slice (Complement = true).
                var trainFilter = new RangeFilter(env, new RangeFilter.Arguments()
                {
                    Column = "StratificationColumn",
                    Min = foldMin,
                    Max = foldMax,
                    Complement = true
                }, trainData);

                // Auto-normalization.
                var trainRoles = new RoleMappedData(trainFilter, label: "Label", feature: "Features");
                NormalizeTransform.CreateIfNeeded(env, ref trainRoles, trainer);

                var predictor = trainer.Train(new Runtime.TrainContext(trainRoles));

                // Test rows: the slice itself.
                var testFilter = new RangeFilter(env, new RangeFilter.Arguments()
                {
                    Column = "StratificationColumn",
                    Min = foldMin,
                    Max = foldMax,
                    Complement = false
                }, trainData);

                // Auto-normalization.
                // NOTE(review): this calls CreateIfNeeded on the test fold independently of the
                // training fold; presumably any normalizer it adds is fitted on test rows rather
                // than reused from training - confirm this is intended.
                var testRoles = new RoleMappedData(testFilter, label: "Label", feature: "Features");
                NormalizeTransform.CreateIfNeeded(env, ref testRoles, trainer);

                IDataScorerTransform scorer = ScoreUtils.GetScorer(predictor, testRoles, env, testRoles.Schema);

                BinaryClassifierMamlEvaluator eval = new BinaryClassifierMamlEvaluator(env, new BinaryClassifierMamlEvaluator.Arguments() { });
                var dataEval = new RoleMappedData(scorer, testRoles.Schema.GetColumnRoleNames(), opt: true);
                var dict = eval.Evaluate(dataEval);
                var foldMetrics = BinaryClassificationMetrics.FromMetrics(env, dict["OverallMetrics"], dict["ConfusionMatrix"]);

                // Every fold must yield metrics, otherwise the cross-validation result is incomplete.
                Assert.NotEmpty(foldMetrics);
                metrics.AddRange(foldMetrics);
            }
        }
    }
}
81+
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
using Microsoft.ML.Runtime.Api;
2+
using Microsoft.ML.Runtime.Data;
3+
using Microsoft.ML.Runtime.Learners;
4+
using System.Linq;
5+
using Xunit;
6+
7+
namespace Microsoft.ML.Tests.Scenarios.Api
8+
{
9+
10+
public partial class ApiScenariosTests
{
    /// <summary>
    /// Decomposable train and predict: Train on Iris multiclass problem, which will require
    /// a transform on labels. Be able to reconstitute the pipeline for a prediction only task,
    /// which will essentially "drop" the transform over labels, while retaining the property
    /// that the predicted label for this has a key-type, the probability outputs for the classes
    /// have the class labels as slot names, etc. This should be do-able without ugly compromises like,
    /// say, injecting a dummy label.
    /// </summary>
    [Fact]
    public void DecomposableTrainAndPredict()
    {
        var dataPath = GetDataPath(IrisDataPath);
        using (var env = new TlcEnvironment())
        {
            // Training pipeline: load, apply the term transform to the label, concatenate the features.
            var loader = new TextLoader(env, MakeIrisTextLoaderArgs(), new MultiFileSource(dataPath));
            var term = new TermTransform(env, loader, "Label");
            var concat = new ConcatTransform(env, term, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth");
            var trainer = new SdcaMultiClassTrainer(env, new SdcaMultiClassTrainer.Arguments { MaxIterations = 100, Shuffle = true, NumThreads = 1 });

            IDataView trainData = trainer.Info.WantCaching ? (IDataView)new CacheDataView(env, concat, prefetch: null) : concat;
            var trainRoles = new RoleMappedData(trainData, label: "Label", feature: "Features");

            // Auto-normalization.
            NormalizeTransform.CreateIfNeeded(env, ref trainRoles, trainer);
            var predictor = trainer.Train(new Runtime.TrainContext(trainRoles));

            var scoreRoles = new RoleMappedData(concat, label: "Label", feature: "Features");
            IDataScorerTransform scorer = ScoreUtils.GetScorer(predictor, scoreRoles, env, trainRoles.Schema);

            // Cut off the term transform from the pipeline.
            var newScorer = ApplyTransformUtils.ApplyAllTransformsToData(env, scorer, loader, term);
            var keyToValue = new KeyToValueTransform(env, newScorer, "PredictedLabel");
            var model = env.CreatePredictionEngine<IrisData, IrisPrediction>(keyToValue);

            // Predict on the first rows of the same file and compare with the labels read from it.
            var testLoader = new TextLoader(env, MakeIrisTextLoaderArgs(), new MultiFileSource(dataPath));
            var testData = testLoader.AsEnumerable<IrisData>(env, false);
            foreach (var input in testData.Take(20))
            {
                var prediction = model.Predict(input);

                // Assert.Equal reports both values on failure, unlike Assert.True(a == b).
                Assert.Equal(input.Label, prediction.PredictedLabel);
            }
        }
    }
}
56+
}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
using Microsoft.ML.Runtime.Data;
2+
using Microsoft.ML.Runtime.Learners;
3+
using Xunit;
4+
using Microsoft.ML.Models;
5+
6+
namespace Microsoft.ML.Tests.Scenarios.Api
7+
{
8+
public partial class ApiScenariosTests
{
    /// <summary>
    /// Evaluation: Similar to the simple train scenario, except instead of having some
    /// predictive structure, be able to score another "test" data file, run the result
    /// through an evaluator and get metrics like AUC, accuracy, PR curves, and whatnot.
    /// Getting metrics out of this should be as straightforward and unannoying as possible.
    /// </summary>
    [Fact]
    public void Evaluation()
    {
        var dataPath = GetDataPath(SentimentDataPath);

        // NOTE(review): resolved but never used below; scoring runs over the pipeline
        // built from dataPath, i.e. the same file the model is trained on.
        var testDataPath = GetDataPath(SentimentTestPath);

        using (var env = new TlcEnvironment(seed: 1, conc: 1))
        {
            // Pipeline: text loader followed by the text featurizer.
            var source = new MultiFileSource(dataPath);
            var loader = new TextLoader(env, MakeSentimentTextLoaderArgs(), source);
            var featurized = TextTransform.Create(env, MakeSentimentTextTransformArgs(), loader);

            // Train on a cached view of the featurized data.
            var trainerArgs = new LinearClassificationTrainer.Arguments
            {
                NumThreads = 1
            };
            var trainer = new LinearClassificationTrainer(env, trainerArgs);

            var cachedView = new CacheDataView(env, featurized, prefetch: null);
            var trainRoles = new RoleMappedData(cachedView, label: "Label", feature: "Features");
            var predictor = trainer.Train(new Runtime.TrainContext(trainRoles));

            // Score the uncached pipeline with the trained predictor.
            var scoreRoles = new RoleMappedData(featurized, label: "Label", feature: "Features");
            IDataScorerTransform scorer = ScoreUtils.GetScorer(predictor, scoreRoles, env, trainRoles.Schema);

            // Evaluate the scored data and keep the first metrics entry.
            var dataEval = new RoleMappedData(scorer, label: "Label", feature: "Features", opt: true);
            var evaluatorArgs = new BinaryClassifierMamlEvaluator.Arguments() { };
            var evaluator = new BinaryClassifierMamlEvaluator(env, evaluatorArgs);
            var metricsDict = evaluator.Evaluate(dataEval);

            var allMetrics = BinaryClassificationMetrics.FromMetrics(env, metricsDict["OverallMetrics"], metricsDict["ConfusionMatrix"]);
            var metrics = allMetrics[0];
        }
    }
}
50+
}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
using Microsoft.ML.Runtime.Data;
2+
using Microsoft.ML.Runtime.Data.IO;
3+
using Microsoft.ML.Runtime.Learners;
4+
using Xunit;
5+
6+
namespace Microsoft.ML.Tests.Scenarios.Api
7+
{
8+
public partial class ApiScenariosTests
{
    /// <summary>
    /// File-based saving of data: Come up with transform pipeline. Transform training and
    /// test data, and save the featurized data to some file, using the .idv format.
    /// Train and evaluate multiple models over that pre-featurized data. (Useful for
    /// sweeping scenarios, where you are training many times on the same data,
    /// and don't necessarily want to transform it every single time.)
    /// </summary>
    [Fact]
    public void FileBasedSavingOfData()
    {
        // Single definition of the intermediate file name used for save, load and cleanup.
        const string featurizedFile = "i.idv";

        var dataPath = GetDataPath(SentimentDataPath);

        using (var env = new TlcEnvironment(seed: 1, conc: 1))
        {
            // Pipeline
            var loader = new TextLoader(env, MakeSentimentTextLoaderArgs(), new MultiFileSource(dataPath));
            var trans = TextTransform.Create(env, MakeSentimentTextTransformArgs(), loader);

            try
            {
                // Write the featurized data to disk with the binary saver.
                var saver = new BinarySaver(env, new BinarySaver.Arguments());
                using (var ch = env.Start("SaveData"))
                using (var file = env.CreateOutputFile(featurizedFile))
                {
                    DataSaverUtils.SaveDataView(ch, saver, trans, file);
                }

                // Train from the saved file instead of the original pipeline.
                var binData = new BinaryLoader(env, new BinaryLoader.Arguments(), new MultiFileSource(featurizedFile));
                var trainRoles = new RoleMappedData(binData, label: "Label", feature: "Features");
                var trainer = new LinearClassificationTrainer(env, new LinearClassificationTrainer.Arguments
                {
                    NumThreads = 1
                });
                var predictor = trainer.Train(new Runtime.TrainContext(trainRoles));
            }
            finally
            {
                // Clean up even when loading or training throws, so a failed run leaves no stale file.
                // NOTE(review): the file is created via env.CreateOutputFile with a bare name but
                // removed via DeleteOutputPath; confirm both resolve to the same location.
                DeleteOutputPath(featurizedFile);
            }
        }
    }
}
48+
}

0 commit comments

Comments
 (0)