dotnet · Zruty0 · Aug 17, 2018 · Aug 6, 2018 · Aug 6, 2018 · Aug 7, 2018
diff --git a/src/Microsoft.ML.Core/EntryPoints/ComponentFactory.cs b/src/Microsoft.ML.Core/EntryPoints/ComponentFactory.cs
@@ -24,7 +24,7 @@ public interface IArgsComponent : IComponentFactory
     /// <summary>
     /// An interface for creating a component with no extra parameters (other than an <see cref="IHostEnvironment"/>).
     /// </summary>
-    public interface IComponentFactory<out TComponent>: IComponentFactory
+    public interface IComponentFactory<out TComponent> : IComponentFactory
     {
         TComponent CreateComponent(IHostEnvironment env);
     }
@@ -57,6 +57,21 @@ public TComponent CreateComponent(IHostEnvironment env, TArg1 argument1)
         }
     }
 
+    /// <summary>
+    /// An <see cref="IComponentFactory{TComponent}"/> that creates its component by invoking a
+    /// delegate supplied at construction time.
+    /// </summary>
+    /// <typeparam name="TComponent">The type of component produced by the factory.</typeparam>
+    public class SimpleComponentFactory<TComponent> : IComponentFactory<TComponent>
+    {
+        // Assigned once in the constructor and never replaced afterwards.
+        private readonly Func<IHostEnvironment, TComponent> _factory;
+
+        /// <summary>
+        /// Wraps <paramref name="factory"/> as a component factory.
+        /// </summary>
+        /// <param name="factory">The delegate invoked by <see cref="CreateComponent"/>.</param>
+        /// <exception cref="ArgumentNullException"><paramref name="factory"/> is null.</exception>
+        public SimpleComponentFactory(Func<IHostEnvironment, TComponent> factory)
+        {
+            // Fail at construction rather than with a NullReferenceException on first use.
+            if (factory == null)
+            {
+                throw new ArgumentNullException(nameof(factory));
+            }
+
+            _factory = factory;
+        }
+
+        /// <summary>
+        /// Creates the component by passing <paramref name="env"/> to the wrapped delegate.
+        /// </summary>
+        /// <param name="env">The host environment, handed to the delegate unchanged.</param>
+        /// <returns>Whatever the wrapped delegate returns.</returns>
+        public TComponent CreateComponent(IHostEnvironment env)
+        {
+            return _factory(env);
+        }
+    }
+
     /// <summary>
     /// An interface for creating a component when we take two extra parameters (and an <see cref="IHostEnvironment"/>).
     /// </summary>

diff --git a/src/Microsoft.ML.FastTree/FastTreeArguments.cs b/src/Microsoft.ML.FastTree/FastTreeArguments.cs
@@ -228,14 +228,14 @@ public abstract class TreeArgs : LearnerInputBaseWithGroupId
         // REVIEW: Different from original FastRank arguments (shortname l vs. nl). Different default from TLC FR Wrapper (20 vs. 20).
         [Argument(ArgumentType.LastOccurenceWins, HelpText = "The max number of leaves in each regression tree", ShortName = "nl", SortOrder = 2)]
         [TGUI(Description = "The maximum number of leaves per tree", SuggestedSweeps = "2-128;log;inc:4")]
-        [TlcModule.SweepableLongParamAttribute("NumLeaves", 2, 128, isLogScale:true, stepSize:4)]
+        [TlcModule.SweepableLongParamAttribute("NumLeaves", 2, 128, isLogScale: true, stepSize: 4)]
         public int NumLeaves = 20;
 
         // REVIEW: Arrays not supported in GUI
         // REVIEW: Different shortname than FastRank module. Same as the TLC FRWrapper.
         [Argument(ArgumentType.LastOccurenceWins, HelpText = "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", ShortName = "mil", SortOrder = 3)]
         [TGUI(Description = "Minimum number of training instances required to form a leaf", SuggestedSweeps = "1,10,50")]
-        [TlcModule.SweepableDiscreteParamAttribute("MinDocumentsInLeafs", new object[] {1, 10, 50})]
+        [TlcModule.SweepableDiscreteParamAttribute("MinDocumentsInLeafs", new object[] { 1, 10, 50 })]
         public int MinDocumentsInLeafs = 10;
 
         // REVIEW: Different shortname than FastRank module. Same as the TLC FRWrapper.
@@ -364,17 +364,17 @@ public enum OptimizationAlgorithmType { GradientDescent, AcceleratedGradientDesc
 
         [Argument(ArgumentType.LastOccurenceWins, HelpText = "The learning rate", ShortName = "lr", SortOrder = 4)]
         [TGUI(Label = "Learning Rate", SuggestedSweeps = "0.025-0.4;log")]
-        [TlcModule.SweepableFloatParamAttribute("LearningRates", 0.025f, 0.4f, isLogScale:true)]
+        [TlcModule.SweepableFloatParamAttribute("LearningRates", 0.025f, 0.4f, isLogScale: true)]
         public Double LearningRates = 0.2;
 
         [Argument(ArgumentType.AtMostOnce, HelpText = "Shrinkage", ShortName = "shrk")]
         [TGUI(Label = "Shrinkage", SuggestedSweeps = "0.25-4;log")]
-        [TlcModule.SweepableFloatParamAttribute("Shrinkage", 0.025f, 4f, isLogScale:true)]
+        [TlcModule.SweepableFloatParamAttribute("Shrinkage", 0.025f, 4f, isLogScale: true)]
         public Double Shrinkage = 1;
 
         [Argument(ArgumentType.AtMostOnce, HelpText = "Dropout rate for tree regularization", ShortName = "tdrop")]
         [TGUI(SuggestedSweeps = "0,0.000000001,0.05,0.1,0.2")]
-        [TlcModule.SweepableDiscreteParamAttribute("DropoutRate", new object[] { 0.0f, 1E-9f, 0.05f, 0.1f, 0.2f})]
+        [TlcModule.SweepableDiscreteParamAttribute("DropoutRate", new object[] { 0.0f, 1E-9f, 0.05f, 0.1f, 0.2f })]
         public Double DropoutRate = 0;
 
         [Argument(ArgumentType.AtMostOnce, HelpText = "Sample each query 1 in k times in the GetDerivatives function", ShortName = "sr")]

diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs b/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs
@@ -0,0 +1,58 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Runtime.Api;
+using Microsoft.ML.TestFramework;
+using Xunit.Abstractions;
+
+namespace Microsoft.ML.Tests.Scenarios.Api
+{
+    /// <summary>
+    /// Common utility functions for API scenarios tests.
+    /// This part of the partial class declares the data file name constants and the
+    /// example/prediction types used by the scenario tests.
+    /// </summary>
+    public partial class ApiScenariosTests : BaseTestClass
+    {
+        /// <summary>
+        /// Forwards the xUnit output helper to <see cref="BaseTestClass"/>.
+        /// </summary>
+        /// <param name="output">The test output helper passed on to the base class.</param>
+        public ApiScenariosTests(ITestOutputHelper output) : base(output)
+        {
+        }
+
+        // File names of the data sets used by the scenarios.
+        // NOTE(review): these look like bare file names rather than full paths — confirm how callers resolve them.
+        public const string IrisDataPath = "iris.data";
+        public const string SentimentDataPath = "wikipedia-detox-250-line-data.tsv";
+        public const string SentimentTestPath = "wikipedia-detox-250-line-test.tsv";
+
+        /// <summary>
+        /// An Iris example with its label: the four measurement fields inherited from
+        /// <see cref="IrisDataNoLabel"/> plus <see cref="Label"/>.
+        /// </summary>
+        public class IrisData : IrisDataNoLabel
+        {
+            public string Label;
+        }
+
+        /// <summary>
+        /// An Iris example without a label: four <see cref="float"/> measurement fields.
+        /// </summary>
+        public class IrisDataNoLabel
+        {
+            public float SepalLength;
+            public float SepalWidth;
+            public float PetalLength;
+            public float PetalWidth;
+        }
+
+        /// <summary>
+        /// Output type for Iris predictions: a string label and an array of scores.
+        /// </summary>
+        public class IrisPrediction
+        {
+            public string PredictedLabel;
+            public float[] Score;
+        }
+
+        /// <summary>
+        /// A sentiment example: the text and a boolean sentiment flag.
+        /// </summary>
+        public class SentimentData
+        {
+            // NOTE(review): presumably ColumnName binds this field to the "Label" data column
+            // instead of a column named after the field — confirm against the attribute's docs.
+            [ColumnName("Label")]
+            public bool Sentiment;
+            public string SentimentText;
+        }
+
+        /// <summary>
+        /// Output type for sentiment predictions: a boolean sentiment flag and a score.
+        /// </summary>
+        public class SentimentPrediction
+        {
+            // NOTE(review): presumably bound to the "PredictedLabel" column via ColumnName — confirm.
+            [ColumnName("PredictedLabel")]
+            public bool Sentiment;
+
+            public float Score;
+        }
+    }
+}
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/AutoNormalizationAndCaching.cs b/test/Microsoft.ML.Tests/Scenarios/Api/AutoNormalizationAndCaching.cs
@@ -0,0 +1,49 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.Runtime.Learners;
+using Xunit;
+
+namespace Microsoft.ML.Tests.Scenarios.Api
+{
+    public partial class ApiScenariosTests
+    {
+        /// <summary>
+        /// Auto-normalization and caching: It should be relatively easy for normalization
+        /// and caching to be introduced for training, if the trainer supports or would benefit
+        /// from that.
+        /// </summary>
+        /// <remarks>
+        /// The test passes when the pipeline can be built and trained without throwing and
+        /// the trainer returns a non-null predictor.
+        /// </remarks>
+        [Fact]
+        public void AutoNormalizationAndCaching()
+        {
+            var dataPath = GetDataPath(SentimentDataPath);
+
+            using (var env = new TlcEnvironment(seed: 1, conc: 1))
+            {
+                // Pipeline.
+                var loader = new TextLoader(env, MakeSentimentTextLoaderArgs(), new MultiFileSource(dataPath));
+
+                var trans = TextTransform.Create(env, MakeSentimentTextTransformArgs(false), loader);
+
+                // Train.
+                var trainer = new LinearClassificationTrainer(env, new LinearClassificationTrainer.Arguments
+                {
+                    NumThreads = 1,
+                    ConvergenceTolerance = 1f
+                });
+
+                // Auto-caching: wrap the featurized data in a cache only when the trainer asks for it.
+                IDataView trainData = trainer.Info.WantCaching ? (IDataView)new CacheDataView(env, trans, prefetch: null) : trans;
+                var trainRoles = new RoleMappedData(trainData, label: "Label", feature: "Features");
+
+                // Auto-normalization: trainRoles is passed by ref, so the call may replace it.
+                NormalizeTransform.CreateIfNeeded(env, ref trainRoles, trainer);
+                var predictor = trainer.Train(new Runtime.TrainContext(trainRoles));
+
+                Assert.NotNull(predictor);
+            }
+        }
+    }
+}
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CrossValidation.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CrossValidation.cs
@@ -0,0 +1,95 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Models;
+using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.Runtime.Learners;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using Xunit;
+
+namespace Microsoft.ML.Tests.Scenarios.Api
+{
+    public partial class ApiScenariosTests
+    {
+        /// <summary>
+        /// Cross-validation: Have a mechanism to do cross validation, that is, you come up with
+        /// a data source (optionally with stratification column), come up with an instantiable transform
+        /// and trainer pipeline, and it will handle (1) splitting up the data, (2) training the separate
+        /// pipelines on in-fold data, (3) scoring on the out-fold data, (4) returning the set of
+        /// evaluations and optionally trained pipes. (People always want metrics out of xfold,
+        /// they sometimes want the actual models too.)
+        /// </summary>
+        /// <remarks>
+        /// The test passes when every fold trains and evaluates without throwing and exactly
+        /// one metrics object is collected per fold.
+        /// </remarks>
+        [Fact]
+        public void CrossValidation()
+        {
+            var dataPath = GetDataPath(SentimentDataPath);
+
+            int numFolds = 5;
+            using (var env = new TlcEnvironment(seed: 1, conc: 1))
+            {
+                // Pipeline.
+                var loader = new TextLoader(env, MakeSentimentTextLoaderArgs(), new MultiFileSource(dataPath));
+
+                var text = TextTransform.Create(env, MakeSentimentTextTransformArgs(false), loader);
+                // "StratificationColumn" is the column the per-fold range filters below select on.
+                IDataView trans = new GenerateNumberTransform(env, text, "StratificationColumn");
+
+                // Train.
+                var trainer = new LinearClassificationTrainer(env, new LinearClassificationTrainer.Arguments
+                {
+                    NumThreads = 1,
+                    ConvergenceTolerance = 1f
+                });
+
+                var metrics = new List<BinaryClassificationMetrics>();
+                for (int fold = 0; fold < numFolds; fold++)
+                {
+                    // Bounds of this fold's slice of the stratification column; the training
+                    // and test filters share them and differ only in Complement.
+                    double foldMin = (double)fold / numFolds;
+                    double foldMax = (double)(fold + 1) / numFolds;
+
+                    // Training data: the complement of the fold's slice.
+                    IDataView trainPipe = new RangeFilter(env, new RangeFilter.Arguments()
+                    {
+                        Column = "StratificationColumn",
+                        Min = foldMin,
+                        Max = foldMax,
+                        Complement = true
+                    }, trans);
+                    trainPipe = new OpaqueDataView(trainPipe);
+                    var trainData = new RoleMappedData(trainPipe, label: "Label", feature: "Features");
+
+                    // Auto-normalization: trainData is passed by ref, so the call may replace it.
+                    NormalizeTransform.CreateIfNeeded(env, ref trainData, trainer);
+                    // Keep the un-cached pipeline; it is the chain re-applied to the test slice below.
+                    var preCachedData = trainData;
+
+                    // Auto-caching.
+                    if (trainer.Info.WantCaching)
+                    {
+                        var prefetch = trainData.Schema.GetColumnRoles().Select(kc => kc.Value.Index).ToArray();
+                        var cacheView = new CacheDataView(env, trainData.Data, prefetch);
+                        // Because the prefetching worked, we know that these are valid columns.
+                        trainData = new RoleMappedData(cacheView, trainData.Schema.GetColumnRoleNames());
+                    }
+
+                    var predictor = trainer.Train(new Runtime.TrainContext(trainData));
+
+                    // Test data: the fold's own slice.
+                    IDataView testPipe = new RangeFilter(env, new RangeFilter.Arguments()
+                    {
+                        Column = "StratificationColumn",
+                        Min = foldMin,
+                        Max = foldMax,
+                        Complement = false
+                    }, trans);
+                    testPipe = new OpaqueDataView(testPipe);
+
+                    // Re-apply whatever was added on top of trainPipe (e.g. normalization) to the test slice.
+                    var pipe = ApplyTransformUtils.ApplyAllTransformsToData(env, preCachedData.Data, testPipe, trainPipe);
+                    var testRoles = new RoleMappedData(pipe, trainData.Schema.GetColumnRoleNames());
+
+                    IDataScorerTransform scorer = ScoreUtils.GetScorer(predictor, testRoles, env, testRoles.Schema);
+
+                    var eval = new BinaryClassifierMamlEvaluator(env, new BinaryClassifierMamlEvaluator.Arguments());
+                    var dataEval = new RoleMappedData(scorer, testRoles.Schema.GetColumnRoleNames(), opt: true);
+                    var dict = eval.Evaluate(dataEval);
+                    var foldMetrics = BinaryClassificationMetrics.FromMetrics(env, dict["OverallMetrics"], dict["ConfusionMatrix"]);
+                    metrics.Add(foldMetrics.Single());
+                }
+
+                Assert.Equal(numFolds, metrics.Count);
+            }
+        }
+    }
+}
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/DecomposableTrainAndPredict.cs b/test/Microsoft.ML.Tests/Scenarios/Api/DecomposableTrainAndPredict.cs
@@ -0,0 +1,60 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Runtime.Api;
+using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.Runtime.Learners;
+using System.Linq;
+using Xunit;
+
+namespace Microsoft.ML.Tests.Scenarios.Api
+{
+
+    public partial class ApiScenariosTests
+    {
+        /// <summary>
+        /// Decomposable train and predict: Train on Iris multiclass problem, which will require
+        /// a transform on labels. Be able to reconstitute the pipeline for a prediction only task,
+        /// which will essentially "drop" the transform over labels, while retaining the property
+        /// that the predicted label for this has a key-type, the probability outputs for the classes
+        /// have the class labels as slot names, etc. This should be do-able without ugly compromises like,
+        /// say, injecting a dummy label.
+        /// </summary>
+        /// <remarks>
+        /// The test asserts that the first 20 rows of the data file are all predicted as "Iris-setosa".
+        /// </remarks>
+        [Fact]
+        public void DecomposableTrainAndPredict()
+        {
+            var dataPath = GetDataPath(IrisDataPath);
+            using (var env = new TlcEnvironment())
+            {
+                var loader = new TextLoader(env, MakeIrisTextLoaderArgs(), new MultiFileSource(dataPath));
+                var term = new TermTransform(env, loader, "Label");
+                var concat = new ConcatTransform(env, term, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth");
+                var trainer = new SdcaMultiClassTrainer(env, new SdcaMultiClassTrainer.Arguments { MaxIterations = 100, Shuffle = true, NumThreads = 1 });
+
+                // Auto-caching: wrap the data in a cache only when the trainer asks for it.
+                IDataView trainData = trainer.Info.WantCaching ? (IDataView)new CacheDataView(env, concat, prefetch: null) : concat;
+                var trainRoles = new RoleMappedData(trainData, label: "Label", feature: "Features");
+
+                // Auto-normalization: trainRoles is passed by ref, so the call may replace it.
+                NormalizeTransform.CreateIfNeeded(env, ref trainRoles, trainer);
+                var predictor = trainer.Train(new Runtime.TrainContext(trainRoles));
+
+                var scoreRoles = new RoleMappedData(concat, label: "Label", feature: "Features");
+                IDataScorerTransform scorer = ScoreUtils.GetScorer(predictor, scoreRoles, env, trainRoles.Schema);
+
+                // Cut out term transform from pipeline.
+                var newScorer = ApplyTransformUtils.ApplyAllTransformsToData(env, scorer, loader, term);
+                var keyToValue = new KeyToValueTransform(env, newScorer, "PredictedLabel");
+                var model = env.CreatePredictionEngine<IrisDataNoLabel, IrisPrediction>(keyToValue);
+
+                var testLoader = new TextLoader(env, MakeIrisTextLoaderArgs(), new MultiFileSource(dataPath));
+                var testData = testLoader.AsEnumerable<IrisDataNoLabel>(env, false);
+                foreach (var input in testData.Take(20))
+                {
+                    var prediction = model.Predict(input);
+                    // Assert.Equal reports the actual label on failure, unlike Assert.True(a == b).
+                    Assert.Equal("Iris-setosa", prediction.PredictedLabel);
+                }
+            }
+        }
+    }
+}
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Evaluation.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Evaluation.cs
@@ -0,0 +1,62 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Runtime.Api;
+using Microsoft.ML.Runtime.Data;
+using Microsoft.ML.Runtime.Learners;
+using Xunit;
+using Microsoft.ML.Models;
+
+namespace Microsoft.ML.Tests.Scenarios.Api
+{
+    public partial class ApiScenariosTests
+    {
+        /// <summary>
+        /// Evaluation: Similar to the simple train scenario, except instead of having some
+        /// predictive structure, be able to score another "test" data file, run the result
+        /// through an evaluator and get metrics like AUC, accuracy, PR curves, and whatnot.
+        /// Getting metrics out of this should be as straightforward and unannoying as possible.
+        /// </summary>
+        /// <remarks>
+        /// The model is trained on the sentiment training file; the trained featurization and
+        /// scorer are then re-applied to the sentiment test file, and the evaluator runs on
+        /// that scored test data.
+        /// </remarks>
+        [Fact]
+        public void Evaluation()
+        {
+            var dataPath = GetDataPath(SentimentDataPath);
+            var testDataPath = GetDataPath(SentimentTestPath);
+
+            using (var env = new TlcEnvironment(seed: 1, conc: 1))
+            {
+                // Pipeline.
+                var loader = new TextLoader(env, MakeSentimentTextLoaderArgs(), new MultiFileSource(dataPath));
+
+                var trans = TextTransform.Create(env, MakeSentimentTextTransformArgs(), loader);
+
+                // Train.
+                var trainer = new LinearClassificationTrainer(env, new LinearClassificationTrainer.Arguments
+                {
+                    NumThreads = 1
+                });
+
+                var cached = new CacheDataView(env, trans, prefetch: null);
+                var trainRoles = new RoleMappedData(cached, label: "Label", feature: "Features");
+                var predictor = trainer.Train(new Runtime.TrainContext(trainRoles));
+                var scoreRoles = new RoleMappedData(trans, label: "Label", feature: "Features");
+                IDataScorerTransform scorer = ScoreUtils.GetScorer(predictor, scoreRoles, env, trainRoles.Schema);
+
+                // Score the test file: re-apply everything downstream of the training loader
+                // (featurization and scorer) on top of a loader over the test data.
+                var testLoader = new TextLoader(env, MakeSentimentTextLoaderArgs(), new MultiFileSource(testDataPath));
+                var testScored = ApplyTransformUtils.ApplyAllTransformsToData(env, scorer, testLoader, loader);
+
+                // Evaluate.
+                var dataEval = new RoleMappedData(testScored, label: "Label", feature: "Features", opt: true);
+
+                var evaluator = new BinaryClassifierMamlEvaluator(env, new BinaryClassifierMamlEvaluator.Arguments());
+                var metricsDict = evaluator.Evaluate(dataEval);
+
+                var metrics = BinaryClassificationMetrics.FromMetrics(env, metricsDict["OverallMetrics"], metricsDict["ConfusionMatrix"])[0];
+                Assert.NotNull(metrics);
+            }
+        }
+    }
+}