dotnet
diff --git a/‎test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs
Lines changed: 54 additions & 0 deletions b/‎test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs
Lines changed: 54 additions & 0 deletions
diff --git a/‎test/Microsoft.ML.Tests/Scenarios/Api/AutoNormalizationAndCaching.cs
Lines changed: 48 additions & 0 deletions b/‎test/Microsoft.ML.Tests/Scenarios/Api/AutoNormalizationAndCaching.cs
Lines changed: 48 additions & 0 deletions
diff --git a/‎test/Microsoft.ML.Tests/Scenarios/Api/CrossValidation.cs
Lines changed: 81 additions & 0 deletions b/‎test/Microsoft.ML.Tests/Scenarios/Api/CrossValidation.cs
Lines changed: 81 additions & 0 deletions
diff --git a/‎test/Microsoft.ML.Tests/Scenarios/Api/DecomposableTrainAndPredict.cs
Lines changed: 56 additions & 0 deletions b/‎test/Microsoft.ML.Tests/Scenarios/Api/DecomposableTrainAndPredict.cs
Lines changed: 56 additions & 0 deletions
diff --git a/‎test/Microsoft.ML.Tests/Scenarios/Api/Evaluation.cs
Lines changed: 50 additions & 0 deletions b/‎test/Microsoft.ML.Tests/Scenarios/Api/Evaluation.cs
Lines changed: 50 additions & 0 deletions
diff --git a/‎test/Microsoft.ML.Tests/Scenarios/Api/FileBasedSavingOfData.cs
Lines changed: 48 additions & 0 deletions b/‎test/Microsoft.ML.Tests/Scenarios/Api/FileBasedSavingOfData.cs
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,54 @@
+using Microsoft.ML.Runtime.Api;
+using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.TestFramework;
+using System;
+using System.Collections.Generic;
+using System.Text;
+using Xunit.Abstractions;
+
+namespace Microsoft.ML.Tests.Scenarios.Api
+{
+    /// <summary>
+    /// Common utility functions for API scenarios tests.
+    /// This part of the partial class declares the constructor, the data file names
+    /// and the row classes shared by the scenario tests declared in the other parts.
+    /// </summary>
+    public partial class ApiScenariosTests : BaseTestClass
+    {
+        /// <summary>
+        /// Forwards the xUnit output helper to the base test class.
+        /// </summary>
+        /// <param name="output">Output helper handed to <see cref="BaseTestClass"/>.</param>
+        public ApiScenariosTests(ITestOutputHelper output) : base(output)
+        {
+        }
+
+        // Data file names; the tests turn them into full paths with GetDataPath.
+        public const string IrisDataPath = "iris.data";
+        public const string SentimentDataPath = "wikipedia-detox-250-line-data.tsv";
+        public const string SentimentTestPath = "wikipedia-detox-250-line-test.tsv";
+
+        /// <summary>
+        /// One Iris example: four float measurements and a string label.
+        /// Used as the input row type of the Iris prediction engine.
+        /// </summary>
+        public class IrisData
+        {
+            public float SepalLength;
+            public float SepalWidth;
+            public float PetalLength;
+            public float PetalWidth;
+            public string Label;
+        }
+
+        /// <summary>
+        /// Output row type of the Iris prediction engine.
+        /// </summary>
+        public class IrisPrediction
+        {
+            // Compared against IrisData.Label by the Iris test.
+            public string PredictedLabel;
+            // NOTE(review): presumably one score per class - confirm against the scorer output.
+            public float[] Score;
+        }
+
+        /// <summary>
+        /// One sentiment example: a boolean label and the text it belongs to.
+        /// </summary>
+        public class SentimentData
+        {
+            // The attribute names the column "Label" for this field instead of "Sentiment".
+            [ColumnName("Label")]
+            public bool Sentiment;
+            public string SentimentText;
+        }
+
+        /// <summary>
+        /// Prediction row for the sentiment scenarios.
+        /// </summary>
+        public class SentimentPrediction
+        {
+            // The attribute names the column "PredictedLabel" for this field.
+            [ColumnName("PredictedLabel")]
+            public bool Sentiment;
+
+            public float Score;
+        }
+    }
+}
@@ -0,0 +1,48 @@
+using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.Runtime.Learners;
+using System;
+using System.Collections.Generic;
+using System.Text;
+using Xunit;
+
+namespace Microsoft.ML.Tests.Scenarios.Api
+{
+    public partial class ApiScenariosTests
+    {
+        /// <summary>
+        /// Auto-normalization and caching: It should be relatively easy for normalization
+        /// and caching to be introduced for training, if the trainer supports or would benefit
+        /// from that.
+        /// </summary>
+        [Fact]
+        public void AutoNormalizationAndCaching()
+        {
+            var dataPath = GetDataPath(SentimentDataPath);
+
+            using (var env = new TlcEnvironment(seed: 1, conc: 1))
+            {
+                // Pipeline: load the sentiment data and featurize it.
+                var loader = new TextLoader(env, MakeSentimentTextLoaderArgs(), new MultiFileSource(dataPath));
+                var trans = TextTransform.Create(env, MakeSentimentTextTransformArgs(false), loader);
+
+                // Train.
+                var trainer = new LinearClassificationTrainer(env, new LinearClassificationTrainer.Arguments
+                {
+                    NumThreads = 1,
+                    ConvergenceTolerance = 1f
+                });
+
+                // Auto-caching: only wrap the data in a cache when the trainer reports that it wants one.
+                IDataView trainData = trainer.Info.WantCaching ? (IDataView)new CacheDataView(env, trans, prefetch: null) : trans;
+                var trainRoles = new RoleMappedData(trainData, label: "Label", feature: "Features");
+
+                // Auto-normalization: the roles are passed by ref, so the call may replace them.
+                NormalizeTransform.CreateIfNeeded(env, ref trainRoles, trainer);
+
+                // The scenario only requires that training runs to completion;
+                // the returned predictor is not inspected.
+                trainer.Train(new Runtime.TrainContext(trainRoles));
+            }
+        }
+    }
+}
@@ -0,0 +1,81 @@
+using Microsoft.ML.Models;
+using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.Runtime.Learners;
+using System;
+using System.Collections.Generic;
+using Xunit;
+
+namespace Microsoft.ML.Tests.Scenarios.Api
+{
+    public partial class ApiScenariosTests
+    {
+        /// <summary>
+        /// Cross-validation: Have a mechanism to do cross validation, that is, you come up with
+        /// a data source (optionally with stratification column), come up with an instantiable transform
+        /// and trainer pipeline, and it will handle (1) splitting up the data, (2) training the separate
+        /// pipelines on in-fold data, (3) scoring on the out-fold data, (4) returning the set of
+        /// evaluations and optionally trained pipes. (People always want metrics out of xfold,
+        /// they sometimes want the actual models too.)
+        /// </summary>
+        [Fact]
+        public void CrossValidation()
+        {
+            var dataPath = GetDataPath(SentimentDataPath);
+
+            const int numFolds = 5;
+            const string stratificationColumn = "StratificationColumn";
+
+            using (var env = new TlcEnvironment(seed: 1, conc: 1))
+            {
+                // Pipeline.
+                var loader = new TextLoader(env, MakeSentimentTextLoaderArgs(), new MultiFileSource(dataPath));
+                var trans = TextTransform.Create(env, MakeSentimentTextTransformArgs(false), loader);
+
+                // Adds the column that the range filters below split the folds on.
+                var random = new GenerateNumberTransform(env, trans, stratificationColumn);
+
+                // Train.
+                var trainer = new LinearClassificationTrainer(env, new LinearClassificationTrainer.Arguments
+                {
+                    NumThreads = 1,
+                    ConvergenceTolerance = 1f
+                });
+
+                // Auto-caching.
+                IDataView trainData = trainer.Info.WantCaching ? (IDataView)new CacheDataView(env, random, prefetch: null) : random;
+
+                // Collects the metrics produced for every fold.
+                var metrics = new List<BinaryClassificationMetrics>();
+                for (int fold = 0; fold < numFolds; fold++)
+                {
+                    // Each fold owns an equally wide slice of the stratification column.
+                    double foldMin = (double)fold / numFolds;
+                    double foldMax = (double)(fold + 1) / numFolds;
+
+                    // Training rows: the complement of the fold's slice.
+                    var trainFilter = new RangeFilter(env, new RangeFilter.Arguments
+                    {
+                        Column = stratificationColumn,
+                        Min = foldMin,
+                        Max = foldMax,
+                        Complement = true
+                    }, trainData);
+
+                    // Auto-normalization.
+                    var trainRoles = new RoleMappedData(trainFilter, label: "Label", feature: "Features");
+                    NormalizeTransform.CreateIfNeeded(env, ref trainRoles, trainer);
+
+                    var predictor = trainer.Train(new Runtime.TrainContext(trainRoles));
+
+                    // Test rows: the fold's slice itself.
+                    var testFilter = new RangeFilter(env, new RangeFilter.Arguments
+                    {
+                        Column = stratificationColumn,
+                        Min = foldMin,
+                        Max = foldMax,
+                        Complement = false
+                    }, trainData);
+
+                    // Auto-normalization.
+                    // NOTE(review): CreateIfNeeded is invoked separately on the test roles; if it adds a
+                    // normalizer here, that normalizer would be fitted on the test fold instead of being
+                    // reused from training - confirm this is intended.
+                    var testRoles = new RoleMappedData(testFilter, label: "Label", feature: "Features");
+                    NormalizeTransform.CreateIfNeeded(env, ref testRoles, trainer);
+
+                    IDataScorerTransform scorer = ScoreUtils.GetScorer(predictor, testRoles, env, testRoles.Schema);
+
+                    var eval = new BinaryClassifierMamlEvaluator(env, new BinaryClassifierMamlEvaluator.Arguments());
+                    var dataEval = new RoleMappedData(scorer, testRoles.Schema.GetColumnRoleNames(), opt: true);
+                    var dict = eval.Evaluate(dataEval);
+                    var foldMetrics = BinaryClassificationMetrics.FromMetrics(env, dict["OverallMetrics"], dict["ConfusionMatrix"]);
+                    metrics.AddRange(foldMetrics);
+                }
+            }
+        }
+    }
+}
@@ -0,0 +1,56 @@
+using Microsoft.ML.Runtime.Api;
+using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.Runtime.Learners;
+using System.Linq;
+using Xunit;
+
+namespace Microsoft.ML.Tests.Scenarios.Api
+{
+
+    public partial class ApiScenariosTests
+    {
+        /// <summary>
+        /// Decomposable train and predict: Train on Iris multiclass problem, which will require
+        /// a transform on labels. Be able to reconstitute the pipeline for a prediction only task,
+        /// which will essentially "drop" the transform over labels, while retaining the property
+        /// that the predicted label for this has a key-type, the probability outputs for the classes
+        /// have the class labels as slot names, etc. This should be do-able without ugly compromises like,
+        /// say, injecting a dummy label.
+        /// </summary>
+        [Fact]
+        public void DecomposableTrainAndPredict()
+        {
+            var dataPath = GetDataPath(IrisDataPath);
+            using (var env = new TlcEnvironment())
+            {
+                // Pipeline: loader -> term transform on the label -> concatenated feature column.
+                var loader = new TextLoader(env, MakeIrisTextLoaderArgs(), new MultiFileSource(dataPath));
+                var term = new TermTransform(env, loader, "Label");
+                var concat = new ConcatTransform(env, term, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth");
+                var trainer = new SdcaMultiClassTrainer(env, new SdcaMultiClassTrainer.Arguments { MaxIterations = 100, Shuffle = true, NumThreads = 1 });
+
+                // Auto-caching.
+                IDataView trainData = trainer.Info.WantCaching ? (IDataView)new CacheDataView(env, concat, prefetch: null) : concat;
+                var trainRoles = new RoleMappedData(trainData, label: "Label", feature: "Features");
+
+                // Auto-normalization.
+                NormalizeTransform.CreateIfNeeded(env, ref trainRoles, trainer);
+                var predictor = trainer.Train(new Runtime.TrainContext(trainRoles));
+
+                var scoreRoles = new RoleMappedData(concat, label: "Label", feature: "Features");
+                IDataScorerTransform scorer = ScoreUtils.GetScorer(predictor, scoreRoles, env, trainRoles.Schema);
+
+                // Cut off the term transform from the pipeline: rebuild the scoring chain on top of the loader.
+                var newScorer = ApplyTransformUtils.ApplyAllTransformsToData(env, scorer, loader, term);
+                var keyToValue = new KeyToValueTransform(env, newScorer, "PredictedLabel");
+                var model = env.CreatePredictionEngine<IrisData, IrisPrediction>(keyToValue);
+
+                // Predict on the first rows of the same file and compare against the stored label.
+                var testLoader = new TextLoader(env, MakeIrisTextLoaderArgs(), new MultiFileSource(dataPath));
+                var testData = testLoader.AsEnumerable<IrisData>(env, false);
+                foreach (var input in testData.Take(20))
+                {
+                    var prediction = model.Predict(input);
+                    // Assert.Equal reports both values on failure, unlike Assert.True on a comparison.
+                    Assert.Equal(input.Label, prediction.PredictedLabel);
+                }
+            }
+        }
+    }
+}
@@ -0,0 +1,50 @@
+using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.Runtime.Learners;
+using Xunit;
+using Microsoft.ML.Models;
+
+namespace Microsoft.ML.Tests.Scenarios.Api
+{
+    public partial class ApiScenariosTests
+    {
+        /// <summary>
+        /// Evaluation: Similar to the simple train scenario, except instead of having some
+        /// predictive structure, be able to score another "test" data file, run the result
+        /// through an evaluator and get metrics like AUC, accuracy, PR curves, and whatnot.
+        /// Getting metrics out of this should be as straightforward and unannoying as possible.
+        /// </summary>
+        [Fact]
+        public void Evaluation()
+        {
+            var dataPath = GetDataPath(SentimentDataPath);
+            var testDataPath = GetDataPath(SentimentTestPath);
+
+            using (var env = new TlcEnvironment(seed: 1, conc: 1))
+            {
+                // Pipeline
+                var loader = new TextLoader(env, MakeSentimentTextLoaderArgs(), new MultiFileSource(dataPath));
+
+                var trans = TextTransform.Create(env, MakeSentimentTextTransformArgs(), loader);
+
+                // Train
+                var trainer = new LinearClassificationTrainer(env, new LinearClassificationTrainer.Arguments
+                {
+                    NumThreads = 1
+                });
+
+                var cached = new CacheDataView(env, trans, prefetch: null);
+                var trainRoles = new RoleMappedData(cached, label: "Label", feature: "Features");
+                var predictor = trainer.Train(new Runtime.TrainContext(trainRoles));
+                var scoreRoles = new RoleMappedData(trans, label: "Label", feature: "Features");
+                IDataScorerTransform scorer = ScoreUtils.GetScorer(predictor, scoreRoles, env, trainRoles.Schema);
+
+                // Score the separate test file rather than the training data: re-apply every
+                // transform that sits between the training loader and the scorer to a loader over the test file.
+                var testLoader = new TextLoader(env, MakeSentimentTextLoaderArgs(), new MultiFileSource(testDataPath));
+                var testScored = ApplyTransformUtils.ApplyAllTransformsToData(env, scorer, testLoader, loader);
+
+                var dataEval = new RoleMappedData(testScored, label: "Label", feature: "Features", opt: true);
+
+                var evaluator = new BinaryClassifierMamlEvaluator(env, new BinaryClassifierMamlEvaluator.Arguments());
+                var metricsDict = evaluator.Evaluate(dataEval);
+
+                // Indexing the first entry fails the test if no metrics were produced.
+                var metrics = BinaryClassificationMetrics.FromMetrics(env, metricsDict["OverallMetrics"], metricsDict["ConfusionMatrix"])[0];
+            }
+        }
+    }
+}
@@ -0,0 +1,48 @@
+using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.Runtime.Data.IO;
+using Microsoft.ML.Runtime.Learners;
+using Xunit;
+
+namespace Microsoft.ML.Tests.Scenarios.Api
+{
+    public partial class ApiScenariosTests
+    {
+        /// <summary>
+        /// File-based saving of data: Come up with transform pipeline. Transform training and
+        /// test data, and save the featurized data to some file, using the .idv format.
+        /// Train and evaluate multiple models over that pre-featurized data. (Useful for
+        /// sweeping scenarios, where you are training many times on the same data,
+        /// and don't necessarily want to transform it every single time.)
+        /// </summary>
+        [Fact]
+        public void FileBasedSavingOfData()
+        {
+            // Single name for the intermediate file so that save, load and cleanup cannot drift apart.
+            // NOTE(review): the name is handed unchanged to CreateOutputFile, MultiFileSource and
+            // DeleteOutputPath - confirm that all three resolve it against the same directory.
+            const string idvFileName = "i.idv";
+
+            var dataPath = GetDataPath(SentimentDataPath);
+
+            using (var env = new TlcEnvironment(seed: 1, conc: 1))
+            {
+                // Pipeline
+                var loader = new TextLoader(env, MakeSentimentTextLoaderArgs(), new MultiFileSource(dataPath));
+                var trans = TextTransform.Create(env, MakeSentimentTextTransformArgs(), loader);
+
+                // Save the featurized data with the binary saver.
+                var saver = new BinarySaver(env, new BinarySaver.Arguments());
+                using (var ch = env.Start("SaveData"))
+                using (var file = env.CreateOutputFile(idvFileName))
+                {
+                    DataSaverUtils.SaveDataView(ch, saver, trans, file);
+                }
+
+                // Train on the data read back from the saved file instead of the original pipeline.
+                var binData = new BinaryLoader(env, new BinaryLoader.Arguments(), new MultiFileSource(idvFileName));
+                var trainRoles = new RoleMappedData(binData, label: "Label", feature: "Features");
+                var trainer = new LinearClassificationTrainer(env, new LinearClassificationTrainer.Arguments
+                {
+                    NumThreads = 1
+                });
+
+                // Only completion of training is required; the predictor itself is not inspected.
+                trainer.Train(new Runtime.TrainContext(trainRoles));
+
+                DeleteOutputPath(idvFileName);
+            }
+        }
+    }
+}