diff --git a/src/Microsoft.ML.Data/EntryPoints/InputBuilder.cs b/src/Microsoft.ML.Data/EntryPoints/InputBuilder.cs index 337aff3c14..3ec23a01bf 100644 --- a/src/Microsoft.ML.Data/EntryPoints/InputBuilder.cs +++ b/src/Microsoft.ML.Data/EntryPoints/InputBuilder.cs @@ -832,21 +832,21 @@ public static class SweepableDiscreteParam public static class PipelineSweeperSupportedMetrics { public new static string ToString() => "SupportedMetric"; - public const string Auc = "Auc"; + public const string Auc = "AUC"; public const string AccuracyMicro = "AccuracyMicro"; public const string AccuracyMacro = "AccuracyMacro"; public const string F1 = "F1"; - public const string AuPrc = "AuPrc"; + public const string AuPrc = "AUPRC"; public const string TopKAccuracy = "TopKAccuracy"; public const string L1 = "L1"; public const string L2 = "L2"; - public const string Rms = "Rms"; + public const string Rms = "RMS"; public const string LossFn = "LossFn"; public const string RSquared = "RSquared"; public const string LogLoss = "LogLoss"; public const string LogLossReduction = "LogLossReduction"; - public const string Ndcg = "Ndcg"; - public const string Dcg = "Dcg"; + public const string Ndcg = "NDCG"; + public const string Dcg = "DCG"; public const string PositivePrecision = "PositivePrecision"; public const string PositiveRecall = "PositiveRecall"; public const string NegativePrecision = "NegativePrecision"; @@ -858,9 +858,9 @@ public static class PipelineSweeperSupportedMetrics public const string ThreshAtK = "ThreshAtK"; public const string ThreshAtP = "ThreshAtP"; public const string ThreshAtNumPos = "ThreshAtNumPos"; - public const string Nmi = "Nmi"; + public const string Nmi = "NMI"; public const string AvgMinScore = "AvgMinScore"; - public const string Dbi = "Dbi"; + public const string Dbi = "DBI"; } } } diff --git a/src/Microsoft.ML.PipelineInference/AutoInference.cs b/src/Microsoft.ML.PipelineInference/AutoInference.cs index 7a340e5957..642ff4d0d7 100644 --- a/src/Microsoft.ML.PipelineInference/AutoInference.cs +++ b/src/Microsoft.ML.PipelineInference/AutoInference.cs @@ -158,7 +158,8 @@ private bool GetDataVariableName(IExceptionContext ectx, string nameOfData, JTok return false; string dataVar = firstNodeInputs.Value(nameOfData); - ectx.Check(VariableBinding.IsValidVariableName(ectx, dataVar), $"Invalid variable name {dataVar}."); + if (!VariableBinding.IsValidVariableName(ectx, dataVar)) + throw ectx.ExceptParam(nameof(nameOfData), $"Invalid variable name {dataVar}."); variableName = dataVar.Substring(1); return true; @@ -172,12 +173,14 @@ private bool GetDataVariableName(IExceptionContext ectx, string nameOfData, JTok public sealed class RunSummary { public double MetricValue { get; } + public double TrainingMetricValue { get; } public int NumRowsInTraining { get; } public long RunTimeMilliseconds { get; } - public RunSummary(double metricValue, int numRows, long runTimeMilliseconds) + public RunSummary(double metricValue, int numRows, long runTimeMilliseconds, double trainingMetricValue) { MetricValue = metricValue; + TrainingMetricValue = trainingMetricValue; NumRowsInTraining = numRows; RunTimeMilliseconds = runTimeMilliseconds; } @@ -303,7 +306,7 @@ private void MainLearningLoop(int batchSize, int numOfTrainingRows) var stopwatch = new Stopwatch(); var probabilityUtils = new Sweeper.Algorithms.SweeperProbabilityUtils(_host); - while (!_terminator.ShouldTerminate(_history)) + while (!_terminator.ShouldTerminate(_history)) { // Get next set of candidates var currentBatchSize = batchSize; @@ 
-341,16 +344,17 @@ private void ProcessPipeline(Sweeper.Algorithms.SweeperProbabilityUtils utils, S // Run pipeline, and time how long it takes stopwatch.Restart(); - double d = candidate.RunTrainTestExperiment(_trainData.Take(randomizedNumberOfRows), - _testData, Metric, TrainerKind); + candidate.RunTrainTestExperiment(_trainData.Take(randomizedNumberOfRows), + _testData, Metric, TrainerKind, out var testMetricVal, out var trainMetricVal); stopwatch.Stop(); // Handle key collisions on sorted list - while (_sortedSampledElements.ContainsKey(d)) - d += 1e-10; + while (_sortedSampledElements.ContainsKey(testMetricVal)) + testMetricVal += 1e-10; // Save performance score - candidate.PerformanceSummary = new RunSummary(d, randomizedNumberOfRows, stopwatch.ElapsedMilliseconds); + candidate.PerformanceSummary = + new RunSummary(testMetricVal, randomizedNumberOfRows, stopwatch.ElapsedMilliseconds, trainMetricVal); _sortedSampledElements.Add(candidate.PerformanceSummary.MetricValue, candidate); _history.Add(candidate); } diff --git a/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs b/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs index bd4de97b48..a0aae16a63 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs @@ -15,21 +15,34 @@ namespace Microsoft.ML.Runtime.PipelineInference { public static class AutoMlUtils { - public static AutoInference.RunSummary ExtractRunSummary(IHostEnvironment env, IDataView data, string metricColumnName) + public static double ExtractValueFromIDV(IHostEnvironment env, IDataView result, string columnName) { - double metricValue = 0; - int numRows = 0; - var schema = data.Schema; - schema.TryGetColumnIndex(metricColumnName, out var metricCol); + Contracts.CheckValue(env, nameof(env)); + env.CheckValue(result, nameof(result)); + env.CheckNonEmpty(columnName, nameof(columnName)); - using (var cursor = data.GetRowCursor(col => col == metricCol)) + double outputValue = 0; + var schema = result.Schema; + if (!schema.TryGetColumnIndex(columnName, out var metricCol)) + throw env.ExceptParam(nameof(columnName), $"Schema does not contain column: {columnName}"); + + using (var cursor = result.GetRowCursor(col => col == metricCol)) { var getter = cursor.GetGetter(metricCol); - cursor.MoveNext(); - getter(ref metricValue); + bool moved = cursor.MoveNext(); + env.Check(moved, "Expected an IDataView with a single row. Results dataset has no rows to extract."); + getter(ref outputValue); + env.Check(!cursor.MoveNext(), "Expected an IDataView with a single row. Results dataset has too many rows."); } - return new AutoInference.RunSummary(metricValue, numRows, 0); + return outputValue; + } + + public static AutoInference.RunSummary ExtractRunSummary(IHostEnvironment env, IDataView result, string metricColumnName, IDataView trainResult = null) + { + double testingMetricValue = ExtractValueFromIDV(env, result, metricColumnName); + double trainingMetricValue = trainResult != null ? ExtractValueFromIDV(env, trainResult, metricColumnName) : double.MinValue; + return new AutoInference.RunSummary(testingMetricValue, 0, 0, trainingMetricValue); } public static CommonInputs.IEvaluatorInput CloneEvaluatorInstance(CommonInputs.IEvaluatorInput evalInput) => @@ -618,5 +631,7 @@ public static Tuple[] ConvertToSweepArgumentStrings(TlcModule. 
            }
            return results;
        }
+
+        public static string GenerateOverallTrainingMetricVarName(Guid id) => $"Var_Training_OM_{id:N}";
    }
}
diff --git a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs
index 58f44b9ce8..06c260a054 100644
--- a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs
+++ b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs
@@ -65,11 +65,14 @@ public static Output ExtractSweepResult(IHostEnvironment env, ResultInput input)
            var col1 = new KeyValuePair<string, ColumnType>("Graph", TextType.Instance);
            var col2 = new KeyValuePair<string, ColumnType>("MetricValue", PrimitiveType.FromKind(DataKind.R8));
            var col3 = new KeyValuePair<string, ColumnType>("PipelineId", TextType.Instance);
+           var col4 = new KeyValuePair<string, ColumnType>("TrainingMetricValue", PrimitiveType.FromKind(DataKind.R8));
+           var col5 = new KeyValuePair<string, ColumnType>("FirstInput", TextType.Instance);
+           var col6 = new KeyValuePair<string, ColumnType>("PredictorModel", TextType.Instance);
            if (rows.Count == 0)
            {
                var host = env.Register("ExtractSweepResult");
-               outputView = new EmptyDataView(host, new SimpleSchema(host, col1, col2, col3));
+               outputView = new EmptyDataView(host, new SimpleSchema(host, col1, col2, col3, col4, col5, col6));
            }
            else
            {
@@ -77,6 +80,9 @@ public static Output ExtractSweepResult(IHostEnvironment env, ResultInput input)
                builder.AddColumn(col1.Key, (PrimitiveType)col1.Value, rows.Select(r => new DvText(r.GraphJson)).ToArray());
                builder.AddColumn(col2.Key, (PrimitiveType)col2.Value, rows.Select(r => r.MetricValue).ToArray());
                builder.AddColumn(col3.Key, (PrimitiveType)col3.Value, rows.Select(r => new DvText(r.PipelineId)).ToArray());
+               builder.AddColumn(col4.Key, (PrimitiveType)col4.Value, rows.Select(r => r.TrainingMetricValue).ToArray());
+               builder.AddColumn(col5.Key, (PrimitiveType)col5.Value, rows.Select(r => new DvText(r.FirstInput)).ToArray());
+               builder.AddColumn(col6.Key, (PrimitiveType)col6.Value, rows.Select(r => new DvText(r.PredictorModel)).ToArray());
                outputView = builder.GetDataView();
            }
            return new Output { Results = outputView, State = autoMlState };
@@ -132,11 +138,11 @@ public static CommonOutputs.MacroOutput<Output> PipelineSweep(
            // Extract performance summaries and assign to previous candidate pipelines.
            foreach (var pipeline in autoMlState.BatchCandidates)
            {
-               if (node.Context.TryGetVariable(ExperimentUtils.GenerateOverallMetricVarName(pipeline.UniqueId),
-                   out var v))
+               if (node.Context.TryGetVariable(ExperimentUtils.GenerateOverallMetricVarName(pipeline.UniqueId), out var v) &&
+                   node.Context.TryGetVariable(AutoMlUtils.GenerateOverallTrainingMetricVarName(pipeline.UniqueId), out var v2))
                {
                    pipeline.PerformanceSummary =
-                       AutoMlUtils.ExtractRunSummary(env, (IDataView)v.Value, autoMlState.Metric.Name);
+                       AutoMlUtils.ExtractRunSummary(env, (IDataView)v.Value, autoMlState.Metric.Name, (IDataView)v2.Value);
                    autoMlState.AddEvaluated(pipeline);
                }
            }
@@ -168,14 +174,17 @@ public static CommonOutputs.MacroOutput<Output> PipelineSweep(
            {
                // Add train test experiments to current graph for candidate pipeline
                var subgraph = new Experiment(env);
-               var trainTestOutput = p.AddAsTrainTest(training, testing, autoMlState.TrainerKind, subgraph);
+               var trainTestOutput = p.AddAsTrainTest(training, testing, autoMlState.TrainerKind, subgraph, true);

                // Change variable name to reference pipeline ID in output map, context and entrypoint output.
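+               // The training-metrics output is renamed the same way below, so each candidate pipeline's training and test metrics live under separate, per-pipeline variable names.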
                var uniqueName = ExperimentUtils.GenerateOverallMetricVarName(p.UniqueId);
+               var uniqueNameTraining = AutoMlUtils.GenerateOverallTrainingMetricVarName(p.UniqueId);
                var sgNode = EntryPointNode.ValidateNodes(env, node.Context, new JArray(subgraph.GetNodes().Last()), node.Catalog).Last();
                sgNode.RenameOutputVariable(trainTestOutput.OverallMetrics.VarName, uniqueName, cascadeChanges: true);
+               sgNode.RenameOutputVariable(trainTestOutput.TrainingOverallMetrics.VarName, uniqueNameTraining, cascadeChanges: true);
                trainTestOutput.OverallMetrics.VarName = uniqueName;
+               trainTestOutput.TrainingOverallMetrics.VarName = uniqueNameTraining;
                expNodes.Add(sgNode);

                // Store indicators, to pass to next iteration of macro.
diff --git a/src/Microsoft.ML.PipelineInference/Microsoft.ML.PipelineInference.csproj b/src/Microsoft.ML.PipelineInference/Microsoft.ML.PipelineInference.csproj
index 7cf9585f3b..ab3e464c74 100644
--- a/src/Microsoft.ML.PipelineInference/Microsoft.ML.PipelineInference.csproj
+++ b/src/Microsoft.ML.PipelineInference/Microsoft.ML.PipelineInference.csproj
@@ -17,6 +17,7 @@
+
diff --git a/src/Microsoft.ML.PipelineInference/PipelinePattern.cs b/src/Microsoft.ML.PipelineInference/PipelinePattern.cs
index 21287742a0..662a16798f 100644
--- a/src/Microsoft.ML.PipelineInference/PipelinePattern.cs
+++ b/src/Microsoft.ML.PipelineInference/PipelinePattern.cs
@@ -17,20 +17,38 @@ namespace Microsoft.ML.Runtime.PipelineInference
    /// </summary>
    public sealed class PipelinePattern : IEquatable<PipelinePattern>
    {
+       /// <summary>
+       /// Class for encapsulating the information returned in the output IDataView for a pipeline
+       /// that has been run through the TrainTest macro.
+       /// </summary>
        public sealed class PipelineResultRow
        {
            public string GraphJson { get; }
+           /// <summary>
+           /// The metric value of the test dataset result (always needed).
+           /// </summary>
            public double MetricValue { get; }
+           /// <summary>
+           /// The metric value of the training dataset result (not always used or set).
+           /// </summary>
+           public double TrainingMetricValue { get; }
            public string PipelineId { get; }
+           public string FirstInput { get; }
+           public string PredictorModel { get; }
            public PipelineResultRow() { }
-           public PipelineResultRow(string graphJson, double metricValue, string pipelineId)
+           public PipelineResultRow(string graphJson, double metricValue,
+               string pipelineId, double trainingMetricValue, string firstInput,
+               string predictorModel)
            {
                GraphJson = graphJson;
                MetricValue = metricValue;
                PipelineId = pipelineId;
+               TrainingMetricValue = trainingMetricValue;
+               FirstInput = firstInput;
+               PredictorModel = predictorModel;
            }
        }
@@ -111,7 +129,8 @@ public AutoInference.EntryPointGraphDef ToEntryPointGraph(Experiment experiment
        public bool Equals(PipelinePattern obj) => obj != null && UniqueId == obj.UniqueId;

        // REVIEW: We may want to allow for sweeping with CV in the future, so we will need to add new methods like this, or refactor these in that case.
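+       // When includeTrainingMetrics is true, the underlying train-test entry point also emits metrics computed on the training data (TrainingOverallMetrics).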
- public Experiment CreateTrainTestExperiment(IDataView trainData, IDataView testData, MacroUtils.TrainerKinds trainerKind, out Models.TrainTestEvaluator.Output resultsOutput) + public Experiment CreateTrainTestExperiment(IDataView trainData, IDataView testData, MacroUtils.TrainerKinds trainerKind, + bool includeTrainingMetrics, out Models.TrainTestEvaluator.Output resultsOutput) { var graphDef = ToEntryPointGraph(); var subGraph = graphDef.Graph; @@ -136,7 +155,8 @@ public Experiment CreateTrainTestExperiment(IDataView trainData, IDataView testD Model = finalOutput }, PipelineId = UniqueId.ToString("N"), - Kind = MacroUtils.TrainerKindApiValue(trainerKind) + Kind = MacroUtils.TrainerKindApiValue(trainerKind), + IncludeTrainingMetrics = includeTrainingMetrics }; var experiment = _env.CreateExperiment(); @@ -150,7 +170,7 @@ public Experiment CreateTrainTestExperiment(IDataView trainData, IDataView testD } public Models.TrainTestEvaluator.Output AddAsTrainTest(Var trainData, Var testData, - MacroUtils.TrainerKinds trainerKind, Experiment experiment = null) + MacroUtils.TrainerKinds trainerKind, Experiment experiment = null, bool includeTrainingMetrics = false) { experiment = experiment ?? _env.CreateExperiment(); var graphDef = ToEntryPointGraph(experiment); @@ -174,7 +194,8 @@ public Models.TrainTestEvaluator.Output AddAsTrainTest(Var trainData, TrainingData = trainData, TestingData = testData, Kind = MacroUtils.TrainerKindApiValue(trainerKind), - PipelineId = UniqueId.ToString("N") + PipelineId = UniqueId.ToString("N"), + IncludeTrainingMetrics = includeTrainingMetrics }; var trainTestOutput = experiment.Add(trainTestInput); return trainTestOutput; @@ -183,57 +204,80 @@ public Models.TrainTestEvaluator.Output AddAsTrainTest(Var trainData, /// /// Runs a train-test experiment on the current pipeline, through entrypoints. 
/// - public double RunTrainTestExperiment(IDataView trainData, IDataView testData, AutoInference.SupportedMetric metric, MacroUtils.TrainerKinds trainerKind) + public void RunTrainTestExperiment(IDataView trainData, IDataView testData, + AutoInference.SupportedMetric metric, MacroUtils.TrainerKinds trainerKind, out double testMetricValue, + out double trainMetricValue) { - var experiment = CreateTrainTestExperiment(trainData, testData, trainerKind, out var trainTestOutput); + var experiment = CreateTrainTestExperiment(trainData, testData, trainerKind, true, out var trainTestOutput); experiment.Run(); - var dataOut = experiment.GetOutput(trainTestOutput.OverallMetrics); - var schema = dataOut.Schema; - schema.TryGetColumnIndex(metric.Name, out var metricCol); - using (var cursor = dataOut.GetRowCursor(col => col == metricCol)) - { - var getter = cursor.GetGetter(metricCol); - double metricValue = 0; - cursor.MoveNext(); - getter(ref metricValue); - return metricValue; - } + var dataOut = experiment.GetOutput(trainTestOutput.OverallMetrics); + var dataOutTraining = experiment.GetOutput(trainTestOutput.TrainingOverallMetrics); + testMetricValue = AutoMlUtils.ExtractValueFromIDV(_env, dataOut, metric.Name); + trainMetricValue = AutoMlUtils.ExtractValueFromIDV(_env, dataOutTraining, metric.Name); } - public static PipelineResultRow[] ExtractResults(IHostEnvironment env, IDataView data, string graphColName, string metricColName, string idColName) + public static PipelineResultRow[] ExtractResults(IHostEnvironment env, IDataView data, + string graphColName, string metricColName, string idColName, string trainingMetricColName, + string firstInputColName, string predictorModelColName) { var results = new List(); var schema = data.Schema; if (!schema.TryGetColumnIndex(graphColName, out var graphCol)) - throw env.ExceptNotSupp($"Column name {graphColName} not found"); + throw env.ExceptParam(nameof(graphColName), $"Column name {graphColName} not found"); if (!schema.TryGetColumnIndex(metricColName, out var metricCol)) - throw env.ExceptNotSupp($"Column name {metricColName} not found"); + throw env.ExceptParam(nameof(metricColName), $"Column name {metricColName} not found"); + if (!schema.TryGetColumnIndex(trainingMetricColName, out var trainingMetricCol)) + throw env.ExceptParam(nameof(trainingMetricColName), $"Column name {trainingMetricColName} not found"); if (!schema.TryGetColumnIndex(idColName, out var pipelineIdCol)) - throw env.ExceptNotSupp($"Column name {idColName} not found"); + throw env.ExceptParam(nameof(idColName), $"Column name {idColName} not found"); + if (!schema.TryGetColumnIndex(firstInputColName, out var firstInputCol)) + throw env.ExceptParam(nameof(firstInputColName), $"Column name {firstInputColName} not found"); + if (!schema.TryGetColumnIndex(predictorModelColName, out var predictorModelCol)) + throw env.ExceptParam(nameof(predictorModelColName), $"Column name {predictorModelColName} not found"); using (var cursor = data.GetRowCursor(col => true)) { + var getter1 = cursor.GetGetter(metricCol); + var getter2 = cursor.GetGetter(graphCol); + var getter3 = cursor.GetGetter(pipelineIdCol); + var getter4 = cursor.GetGetter(trainingMetricCol); + var getter5 = cursor.GetGetter(firstInputCol); + var getter6 = cursor.GetGetter(predictorModelCol); + double metricValue = 0; + double trainingMetricValue = 0; + DvText graphJson = new DvText(); + DvText pipelineId = new DvText(); + DvText firstInput = new DvText(); + DvText predictorModel = new DvText(); + while (cursor.MoveNext()) { - 
var getter1 = cursor.GetGetter(metricCol); - double metricValue = 0; getter1(ref metricValue); - var getter2 = cursor.GetGetter(graphCol); - DvText graphJson = new DvText(); getter2(ref graphJson); - var getter3 = cursor.GetGetter(pipelineIdCol); - DvText pipelineId = new DvText(); getter3(ref pipelineId); - results.Add(new PipelineResultRow(graphJson.ToString(), metricValue, pipelineId.ToString())); + getter4(ref trainingMetricValue); + getter5(ref firstInput); + getter6(ref predictorModel); + + results.Add(new PipelineResultRow(graphJson.ToString(), + metricValue, pipelineId.ToString(), trainingMetricValue, + firstInput.ToString(), predictorModel.ToString())); } } return results.ToArray(); } - public PipelineResultRow ToResultRow() => - new PipelineResultRow(ToEntryPointGraph().Graph.ToJsonString(), - PerformanceSummary?.MetricValue ?? -1d, UniqueId.ToString("N")); + public PipelineResultRow ToResultRow() + { + var graphDef = ToEntryPointGraph(); + + return new PipelineResultRow($"{{'Nodes' : [{graphDef.Graph.ToJsonString()}]}}", + PerformanceSummary?.MetricValue ?? -1d, UniqueId.ToString("N"), + PerformanceSummary?.TrainingMetricValue ?? -1d, + graphDef.GetSubgraphFirstNodeDataVarName(_env), + graphDef.ModelOutput.VarName); + } } } diff --git a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs index f0e7d8ec73..3620a3580a 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs @@ -20,14 +20,14 @@ public TestAutoInference(ITestOutputHelper helper) { } - [Fact(Skip = "Need CoreTLC specific baseline update")] + [Fact] [TestCategory("EntryPoints")] public void TestLearn() { using (var env = new TlcEnvironment()) { - string pathData = GetDataPath(@"../UCI/adult.train"); - string pathDataTest = GetDataPath(@"../UCI/adult.test"); + string pathData = GetDataPath(@"../../Samples/UCI/adult.train"); + string pathDataTest = GetDataPath(@"../../Samples/UCI/adult.test"); int numOfSampleRows = 1000; int batchSize = 5; int numIterations = 10; @@ -56,22 +56,80 @@ public void TestLearn() var datasetTest = ImportTextData.ImportText(env, new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data; #pragma warning restore 0618 + // REVIEW: Theoretically, it could be the case that a new, very bad learner is introduced and // we get unlucky and only select it every time, such that this test fails. Not // likely at all, but a non-zero probability. Should be ok, since all current learners are returning d > .80. 
-                double d = bestPipeline.RunTrainTestExperiment(datasetTrain, datasetTest, metric, MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer);
-                env.Check(d > 0.2);
+                bestPipeline.RunTrainTestExperiment(datasetTrain, datasetTest, metric, MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer,
+                    out var testMetricValue, out var trainMetricValue);
+                env.Check(testMetricValue > 0.2);
            }
            Done();
        }

-        [Fact(Skip = "Need CoreTLC specific baseline update")]
+        [Fact]
+        [TestCategory("EntryPoints")]
+        public void TestPipelineSweeperMacroNoTransforms()
+        {
+            // Set up inputs for experiment
+            string pathData = GetDataPath(@"../../Samples/UCI/adult.train");
+            string pathDataTest = GetDataPath(@"../../Samples/UCI/adult.test");
+            const int numOfSampleRows = 1000;
+            const string schema = "sep=, col=Features:R4:0,2,4,10-12 col=Label:R4:14 header=+";
+
+            var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false);
+#pragma warning disable 0618
+            var datasetTrain = ImportTextData.ImportText(Env,
+                new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows);
+            var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false);
+            var datasetTest = ImportTextData.ImportText(Env,
+                new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows);
+#pragma warning restore 0618
+            const int batchSize = 5;
+            const int numIterations = 20;
+            const int numTransformLevels = 2;
+            AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.Auc;
+
+            // Using the simple, uniform random sampling (with replacement) engine
+            PipelineOptimizerBase autoMlEngine = new UniformRandomEngine(Env);
+
+            // Create search object
+            var amls = new AutoInference.AutoMlMlState(Env, metric, autoMlEngine, new IterationTerminator(numIterations),
+                MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer, datasetTrain, datasetTest);
+
+            // Infer search space
+            amls.InferSearchSpace(numTransformLevels);
+
+            // Create macro object
+            var pipelineSweepInput = new Microsoft.ML.Models.PipelineSweeper()
+            {
+                BatchSize = batchSize,
+            };
+
+            var exp = new Experiment(Env);
+            var output = exp.Add(pipelineSweepInput);
+            exp.Compile();
+            exp.SetInput(pipelineSweepInput.TrainingData, datasetTrain);
+            exp.SetInput(pipelineSweepInput.TestingData, datasetTest);
+            exp.SetInput(pipelineSweepInput.State, amls);
+            exp.SetInput(pipelineSweepInput.CandidateOutputs, new IDataView[0]);
+            exp.Run();
+
+            // Make sure you get back an AutoMlState, and that it ran for correct number of iterations
+            // with at least minimal performance values (i.e., best should have AUC better than 0.1 on this dataset).
+ AutoInference.AutoMlMlState amlsOut = (AutoInference.AutoMlMlState)exp.GetOutput(output.State); + Assert.NotNull(amlsOut); + Assert.Equal(amlsOut.GetAllEvaluatedPipelines().Length, numIterations); + Assert.True(amlsOut.GetBestPipeline().PerformanceSummary.MetricValue > 0.1); + } + + [Fact] [TestCategory("EntryPoints")] public void EntryPointPipelineSweepSerialization() { // Get datasets - var pathData = GetDataPath(@"../UCI/adult.train"); - var pathDataTest = GetDataPath(@"../UCI/adult.test"); + var pathData = GetDataPath(@"../../Samples/UCI/adult.train"); + var pathDataTest = GetDataPath(@"../../Samples/UCI/adult.test"); const int numOfSampleRows = 1000; int numIterations = 10; const string schema = @@ -91,7 +149,7 @@ public void EntryPointPipelineSweepSerialization() { 'Nodes': [ { - 'Name': 'Commands.PipelineSweep', + 'Name': 'Models.PipelineSweeper', 'Inputs': { 'TrainingData': '$TrainingData', 'TestingData': '$TestingData', @@ -133,7 +191,8 @@ public void EntryPointPipelineSweepSerialization() var results = runner.GetOutput("ResultsOut"); Assert.NotNull(results); - var rows = PipelinePattern.ExtractResults(Env, results, "Graph", "MetricValue", "PipelineId"); + var rows = PipelinePattern.ExtractResults(Env, results, + "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); Assert.True(rows.Length == numIterations); } @@ -205,355 +264,344 @@ public void EntryPointPipelineSweep() var results = runner.GetOutput("ResultsOut"); Assert.NotNull(results); - var rows = PipelinePattern.ExtractResults(Env, results, "Graph", "MetricValue", "PipelineId"); + var rows = PipelinePattern.ExtractResults(Env, results, + "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); Assert.True(rows.Length == numIterations); + Assert.True(rows.All(r => r.TrainingMetricValue > 0.1)); } - [Fact(Skip = "Datasets Not Present")] + [Fact] public void TestRocketPipelineEngine() { - //// Get datasets - //var pathData = GetDataPath(@"../UCI", "adult.train"); - //var pathDataTest = GetDataPath(@"../UCI", "adult.test"); - //const int numOfSampleRows = 1000; - //int numIterations = 35; - //const string schema = - //"sep=, col=Features:R4:0,2,4,10-12 col=workclass:TX:1 col=education:TX:3 col=marital_status:TX:5 col=occupation:TX:6 " + - //"col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=native_country:TX:13 col=label_IsOver50K_:R4:14 header=+"; - //var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); - //var datasetTrain = ImportTextData.ImportText(Env, - //new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); - //var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); - //var datasetTest = ImportTextData.ImportText(Env, - //new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); - - //// Define entrypoint graph - //string inputGraph = @" - //{ - //'Nodes': [ - //{ - //'Name': 'Commands.PipelineSweep', - //'Inputs': { - //'TrainingData': '$TrainingData', - //'TestingData': '$TestingData', - //'StateArguments': { - //'Name': 'AutoMlState', - //'Settings': { - //'Metric': 'Auc', - //'Engine': { - //'Name': 'Rocket', - //'Settings' : { - //'TopKLearners' : 2, - //'SecondRoundTrialsPerLearner' : 5 - //}, - //}, - //'TerminatorArgs': { - //'Name': 'IterationLimited', - //'Settings': { - //'FinalHistoryLength': 35 - //} - //}, - //'TrainerKind': 'SignatureBinaryClassifierTrainer' - //} - //}, - 
//'BatchSize': 5 - //}, - //'Outputs': { - //'State': '$StateOut', - //'Results': '$ResultsOut' - //} - //}, - //] - //}"; - - //JObject graph = JObject.Parse(inputGraph); - //var catalog = ModuleCatalog.CreateInstance(Env); - - //var runner = new GraphRunner(Env, catalog, graph[FieldNames.Nodes] as JArray); - //runner.SetInput("TrainingData", datasetTrain); - //runner.SetInput("TestingData", datasetTest); - //runner.RunAll(); - - //var autoMlState = runner.GetOutput("StateOut"); - //Assert.IsNotNull(autoMlState); - //var allPipelines = autoMlState.GetAllEvaluatedPipelines(); - //var bestPipeline = autoMlState.GetBestPipeline(); - //Assert.AreEqual(allPipelines.Length, numIterations); - //Assert.IsTrue(bestPipeline.PerformanceSummary.MetricValue > 0.1); - - //var results = runner.GetOutput("ResultsOut"); - //Assert.IsNotNull(results); - //var rows = PipelinePattern.ExtractResults(Env, results, "Graph", "MetricValue", "PipelineId"); - //Assert.IsTrue(rows.Length == numIterations); + // Get datasets + var pathData = GetDataPath(@"../../Samples/UCI", "adult.train"); + var pathDataTest = GetDataPath(@"../../Samples/UCI", "adult.test"); + const int numOfSampleRows = 1000; + int numIterations = 35; + const string schema = + "sep=, col=Features:R4:0,2,4,10-12 col=workclass:TX:1 col=education:TX:3 col=marital_status:TX:5 col=occupation:TX:6 " + + "col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=native_country:TX:13 col=label_IsOver50K_:R4:14 header=+"; + var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); +#pragma warning disable 0618 + var datasetTrain = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); + var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); + var datasetTest = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); +#pragma warning restore 0618 + // Define entrypoint graph + string inputGraph = @" + { + 'Nodes': [ + { + 'Name': 'Models.PipelineSweeper', + 'Inputs': { + 'TrainingData': '$TrainingData', + 'TestingData': '$TestingData', + 'StateArguments': { + 'Name': 'AutoMlState', + 'Settings': { + 'Metric': 'Auc', + 'Engine': { + 'Name': 'Rocket', + 'Settings' : { + 'TopKLearners' : 2, + 'SecondRoundTrialsPerLearner' : 5 + }, + }, + 'TerminatorArgs': { + 'Name': 'IterationLimited', + 'Settings': { + 'FinalHistoryLength': 35 + } + }, + 'TrainerKind': 'SignatureBinaryClassifierTrainer' + } + }, + 'BatchSize': 5 + }, + 'Outputs': { + 'State': '$StateOut', + 'Results': '$ResultsOut' + } + }, + ] + }"; + + JObject graph = JObject.Parse(inputGraph); + var catalog = ModuleCatalog.CreateInstance(Env); + + var runner = new GraphRunner(Env, catalog, graph[FieldNames.Nodes] as JArray); + runner.SetInput("TrainingData", datasetTrain); + runner.SetInput("TestingData", datasetTest); + runner.RunAll(); + + var autoMlState = runner.GetOutput("StateOut"); + Assert.NotNull(autoMlState); + var allPipelines = autoMlState.GetAllEvaluatedPipelines(); + var bestPipeline = autoMlState.GetBestPipeline(); + Assert.Equal(allPipelines.Length, numIterations); + Assert.True(bestPipeline.PerformanceSummary.MetricValue > 0.1); + + var results = runner.GetOutput("ResultsOut"); + Assert.NotNull(results); + var rows = PipelinePattern.ExtractResults(Env, results, + "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); + Assert.True(rows.Length == 
numIterations); } [Fact(Skip = "Need CoreTLC specific baseline update")] public void TestTextDatasetLearn() { - //using (var env = new TlcEnvironment()) - //{ - //string pathData = GetDataPath(@"../UnitTest/tweets_labeled_10k_test_validation.tsv"); - //int batchSize = 5; - //int numIterations = 35; - //int numTransformLevels = 1; - //int numSampleRows = 100; - //AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.AccuracyMicro; - - //// Using the simple, uniform random sampling (with replacement) engine - //PipelineOptimizerBase autoMlEngine = new UniformRandomEngine(env); - - //// Test initial learning - //var amls = AutoInference.InferPipelines(env, autoMlEngine, pathData, "", out var _, numTransformLevels, batchSize, - //metric, out var _, numSampleRows, new IterationTerminator(numIterations), - //MacroUtils.TrainerKinds.SignatureMultiClassClassifierTrainer); - //env.Check(amls.GetAllEvaluatedPipelines().Length == numIterations); - //} - //Done(); - } + using (var env = new TlcEnvironment()) + { + string pathData = GetDataPath(@"../UnitTest/tweets_labeled_10k_test_validation.tsv"); + int batchSize = 5; + int numIterations = 35; + int numTransformLevels = 1; + int numSampleRows = 100; + AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.AccuracyMicro; - [Fact(Skip = "Need CoreTLC specific baseline update")] - public void TestPipelineNodeCloning() - { - //using (var env = new TlcEnvironment()) - //{ - //var lr1 = RecipeInference - //.AllowedLearners(env, MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer) - //.First(learner => learner.PipelineNode != null && learner.LearnerName.Contains("LogisticRegression")); - - //var sdca1 = RecipeInference - //.AllowedLearners(env, MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer) - //.First(learner => learner.PipelineNode != null && learner.LearnerName.Contains("Sdca")); - - //// Clone and change hyperparam values - //var lr2 = lr1.Clone(); - //lr1.PipelineNode.SweepParams[0].RawValue = 1.2f; - //lr2.PipelineNode.SweepParams[0].RawValue = 3.5f; - //var sdca2 = sdca1.Clone(); - //sdca1.PipelineNode.SweepParams[0].RawValue = 3; - //sdca2.PipelineNode.SweepParams[0].RawValue = 0; - - //// Make sure the changes are propagated to entry point objects - //env.Check(lr1.PipelineNode.UpdateProperties()); - //env.Check(lr2.PipelineNode.UpdateProperties()); - //env.Check(sdca1.PipelineNode.UpdateProperties()); - //env.Check(sdca2.PipelineNode.UpdateProperties()); - //env.Check(lr1.PipelineNode.CheckEntryPointStateMatchesParamValues()); - //env.Check(lr2.PipelineNode.CheckEntryPointStateMatchesParamValues()); - //env.Check(sdca1.PipelineNode.CheckEntryPointStateMatchesParamValues()); - //env.Check(sdca2.PipelineNode.CheckEntryPointStateMatchesParamValues()); - - //// Make sure second object's set of changes didn't overwrite first object's - //env.Check(!lr1.PipelineNode.SweepParams[0].RawValue.Equals(lr2.PipelineNode.SweepParams[0].RawValue)); - //env.Check(!sdca2.PipelineNode.SweepParams[0].RawValue.Equals(sdca1.PipelineNode.SweepParams[0].RawValue)); - //} + // Using the simple, uniform random sampling (with replacement) engine + PipelineOptimizerBase autoMlEngine = new UniformRandomEngine(env); + + // Test initial learning + var amls = AutoInference.InferPipelines(env, autoMlEngine, pathData, "", out var _, numTransformLevels, batchSize, + metric, out var _, numSampleRows, new IterationTerminator(numIterations), + MacroUtils.TrainerKinds.SignatureMultiClassClassifierTrainer); + 
env.Check(amls.GetAllEvaluatedPipelines().Length == numIterations); + } + Done(); } - [Fact(Skip = "Need CoreTLC specific baseline update")] - public void TestSupportedMetricsByName() + [Fact] + public void TestPipelineNodeCloning() { - //var fields = - //typeof(AutoInference.SupportedMetric).GetMembers(BindingFlags.Static | BindingFlags.Public) - //.Where(s => s.MemberType == MemberTypes.Field); - //foreach (var field in fields) - //{ - //var metric = AutoInference.SupportedMetric.ByName(field.Name); - //Assert.IsTrue(metric?.Name == field.Name); - //} - + using (var env = new TlcEnvironment()) + { + var lr1 = RecipeInference + .AllowedLearners(env, MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer) + .First(learner => learner.PipelineNode != null && learner.LearnerName.Contains("LogisticRegression")); + + var sdca1 = RecipeInference + .AllowedLearners(env, MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer) + .First(learner => learner.PipelineNode != null && learner.LearnerName.Contains("StochasticDualCoordinateAscent")); + + // Clone and change hyperparam values + var lr2 = lr1.Clone(); + lr1.PipelineNode.SweepParams[0].RawValue = 1.2f; + lr2.PipelineNode.SweepParams[0].RawValue = 3.5f; + var sdca2 = sdca1.Clone(); + sdca1.PipelineNode.SweepParams[0].RawValue = 3; + sdca2.PipelineNode.SweepParams[0].RawValue = 0; + + // Make sure the changes are propagated to entry point objects + env.Check(lr1.PipelineNode.UpdateProperties()); + env.Check(lr2.PipelineNode.UpdateProperties()); + env.Check(sdca1.PipelineNode.UpdateProperties()); + env.Check(sdca2.PipelineNode.UpdateProperties()); + env.Check(lr1.PipelineNode.CheckEntryPointStateMatchesParamValues()); + env.Check(lr2.PipelineNode.CheckEntryPointStateMatchesParamValues()); + env.Check(sdca1.PipelineNode.CheckEntryPointStateMatchesParamValues()); + env.Check(sdca2.PipelineNode.CheckEntryPointStateMatchesParamValues()); + + // Make sure second object's set of changes didn't overwrite first object's + env.Check(!lr1.PipelineNode.SweepParams[0].RawValue.Equals(lr2.PipelineNode.SweepParams[0].RawValue)); + env.Check(!sdca2.PipelineNode.SweepParams[0].RawValue.Equals(sdca1.PipelineNode.SweepParams[0].RawValue)); + } } - [Fact(Skip = "Need CoreTLC specific baseline update")] + [Fact] public void TestHyperparameterFreezing() { - //string pathData = GetDataPath(@"../UCI", "adult.train"); - //int numOfSampleRows = 1000; - //int batchSize = 1; - //int numIterations = 10; - //int numTransformLevels = 3; - //AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.Auc; - - //// Using the simple, uniform random sampling (with replacement) brain - //PipelineOptimizerBase autoMlBrain = new UniformRandomEngine(Env); - - //// Run initial experiments - //var amls = AutoInference.InferPipelines(Env, autoMlBrain, pathData, "", out var _, numTransformLevels, batchSize, - //metric, out var bestPipeline, numOfSampleRows, new IterationTerminator(numIterations), - //MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer); - - //// Clear results - //amls.ClearEvaluatedPipelines(); - - //// Get space, remove transforms and all but one learner, freeze hyperparameters on learner. 
- //var space = amls.GetSearchSpace(); - //var transforms = space.Item1.Where(t => - //t.ExpertType != typeof(TransformInference.Experts.Categorical)).ToArray(); - //var learners = new[] { space.Item2.First() }; - //var hyperParam = learners[0].PipelineNode.SweepParams.First(); - //var frozenParamValue = hyperParam.RawValue; - //hyperParam.Frozen = true; - //amls.UpdateSearchSpace(learners, transforms); - - //// Allow for one more iteration - //amls.UpdateTerminator(new IterationTerminator(numIterations + 1)); - - //// Do learning. Only retained learner should be left in all pipelines. - //bestPipeline = amls.InferPipelines(numTransformLevels, batchSize, numOfSampleRows); - - //// Make sure all pipelines have retained learner - //Assert.IsTrue(amls.GetAllEvaluatedPipelines().All(p => p.Learner.LearnerName == learners[0].LearnerName)); - - //// Make sure hyperparameter value did not change - //Assert.IsNotNull(bestPipeline); - //Assert.AreEqual(bestPipeline.Learner.PipelineNode.SweepParams.First().RawValue, frozenParamValue); + string pathData = GetDataPath(@"../../Samples/UCI", "adult.train"); + int numOfSampleRows = 1000; + int batchSize = 1; + int numIterations = 10; + int numTransformLevels = 3; + AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.Auc; + + // Using the simple, uniform random sampling (with replacement) brain + PipelineOptimizerBase autoMlBrain = new UniformRandomEngine(Env); + + // Run initial experiments + var amls = AutoInference.InferPipelines(Env, autoMlBrain, pathData, "", out var _, numTransformLevels, batchSize, + metric, out var bestPipeline, numOfSampleRows, new IterationTerminator(numIterations), + MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer); + + // Clear results + amls.ClearEvaluatedPipelines(); + + // Get space, remove transforms and all but one learner, freeze hyperparameters on learner. + var space = amls.GetSearchSpace(); + var transforms = space.Item1.Where(t => + t.ExpertType != typeof(TransformInference.Experts.Categorical)).ToArray(); + var learners = new[] { space.Item2.First() }; + var hyperParam = learners[0].PipelineNode.SweepParams.First(); + var frozenParamValue = hyperParam.RawValue; + hyperParam.Frozen = true; + amls.UpdateSearchSpace(learners, transforms); + + // Allow for one more iteration + amls.UpdateTerminator(new IterationTerminator(numIterations + 1)); + + // Do learning. Only retained learner should be left in all pipelines. 
+ bestPipeline = amls.InferPipelines(numTransformLevels, batchSize, numOfSampleRows); + + // Make sure all pipelines have retained learner + Assert.True(amls.GetAllEvaluatedPipelines().All(p => p.Learner.LearnerName == learners[0].LearnerName)); + + // Make sure hyperparameter value did not change + Assert.NotNull(bestPipeline); + Assert.Equal(bestPipeline.Learner.PipelineNode.SweepParams.First().RawValue, frozenParamValue); } - [Fact(Skip = "Need CoreTLC specific baseline update")] + [Fact(Skip = "Dataset not available.")] public void TestRegressionPipelineWithMinimizingMetric() { - //string pathData = GetDataPath("../Housing (regression)/housing.txt"); - //int numOfSampleRows = 100; - //int batchSize = 5; - //int numIterations = 10; - //int numTransformLevels = 1; - //AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.L1; - - //// Using the simple, uniform random sampling (with replacement) brain - //PipelineOptimizerBase autoMlBrain = new UniformRandomEngine(Env); - - //// Run initial experiments - //var amls = AutoInference.InferPipelines(Env, autoMlBrain, pathData, "", out var _, numTransformLevels, batchSize, - //metric, out var bestPipeline, numOfSampleRows, new IterationTerminator(numIterations), - //MacroUtils.TrainerKinds.SignatureRegressorTrainer); - - //// Allow for one more iteration - //amls.UpdateTerminator(new IterationTerminator(numIterations + 1)); - - //// Do learning. Only retained learner should be left in all pipelines. - //bestPipeline = amls.InferPipelines(numTransformLevels, batchSize, numOfSampleRows); - - //// Make sure hyperparameter value did not change - //Assert.IsNotNull(bestPipeline); - //Assert.IsTrue(amls.GetAllEvaluatedPipelines().All( - //p => p.PerformanceSummary.MetricValue >= bestPipeline.PerformanceSummary.MetricValue)); + string pathData = GetDataPath("../Housing (regression)/housing.txt"); + int numOfSampleRows = 100; + int batchSize = 5; + int numIterations = 10; + int numTransformLevels = 1; + AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.L1; + + // Using the simple, uniform random sampling (with replacement) brain + PipelineOptimizerBase autoMlBrain = new UniformRandomEngine(Env); + + // Run initial experiments + var amls = AutoInference.InferPipelines(Env, autoMlBrain, pathData, "", out var _, numTransformLevels, batchSize, + metric, out var bestPipeline, numOfSampleRows, new IterationTerminator(numIterations), + MacroUtils.TrainerKinds.SignatureRegressorTrainer); + + // Allow for one more iteration + amls.UpdateTerminator(new IterationTerminator(numIterations + 1)); + + // Do learning. Only retained learner should be left in all pipelines. 
+ bestPipeline = amls.InferPipelines(numTransformLevels, batchSize, numOfSampleRows); + + // Make sure hyperparameter value did not change + Assert.NotNull(bestPipeline); + Assert.True(amls.GetAllEvaluatedPipelines().All( + p => p.PerformanceSummary.MetricValue >= bestPipeline.PerformanceSummary.MetricValue)); } - [Fact(Skip = "Need CoreTLC specific baseline update")] + [Fact] public void TestLearnerConstrainingByName() { - //string pathData = GetDataPath(@"../UCI", "adult.train"); - //int numOfSampleRows = 1000; - //int batchSize = 1; - //int numIterations = 1; - //int numTransformLevels = 2; - //var prefix = "Microsoft.ML.Api.Experiment"; - //var retainedLearnerNames = new[] { $"{prefix}.LogisticRegression", $"{prefix}.FastTree" }; - //AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.Auc; - - //// Using the simple, uniform random sampling (with replacement) brain. - //PipelineOptimizerBase autoMlBrain = new UniformRandomEngine(Env); - - //// Run initial experiment. - //var amls = AutoInference.InferPipelines(Env, autoMlBrain, pathData, "", out var _, - //numTransformLevels, batchSize, metric, out var _, numOfSampleRows, - //new IterationTerminator(numIterations), MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer); - - //// Keep only logistic regression and FastTree. - //amls.KeepSelectedLearners(retainedLearnerNames); - //var space = amls.GetSearchSpace(); - - //// Make sure only learners left are those retained. - //Assert.AreEqual(retainedLearnerNames.Length, space.Item2.Length); - //Assert.IsTrue(space.Item2.All(l => retainedLearnerNames.Any(r => r == l.LearnerName))); + string pathData = GetDataPath(@"../../Samples/UCI", "adult.train"); + int numOfSampleRows = 1000; + int batchSize = 1; + int numIterations = 1; + int numTransformLevels = 2; + var retainedLearnerNames = new[] { $"LogisticRegressionBinaryClassifier", $"FastTreeBinaryClassifier" }; + AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.Auc; + + // Using the simple, uniform random sampling (with replacement) brain. + PipelineOptimizerBase autoMlBrain = new UniformRandomEngine(Env); + + // Run initial experiment. + var amls = AutoInference.InferPipelines(Env, autoMlBrain, pathData, "", out var _, + numTransformLevels, batchSize, metric, out var _, numOfSampleRows, + new IterationTerminator(numIterations), MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer); + + // Keep only logistic regression and FastTree. + amls.KeepSelectedLearners(retainedLearnerNames); + var space = amls.GetSearchSpace(); + + // Make sure only learners left are those retained. 
+ Assert.Equal(retainedLearnerNames.Length, space.Item2.Length); + Assert.True(space.Item2.All(l => retainedLearnerNames.Any(r => r == l.LearnerName))); } - [Fact(Skip = "Need CoreTLC specific baseline update")] + [Fact] public void TestRequestedLearners() { - //// Get datasets - //var pathData = GetDataPath(@"../UCI", "adult.train"); - //var pathDataTest = GetDataPath(@"../UCI", "adult.test"); - //const int numOfSampleRows = 100; - //const string schema = - //"sep=, col=Features:R4:0,2,4,10-12 col=workclass:TX:1 col=education:TX:3 col=marital_status:TX:5 col=occupation:TX:6 " + - //"col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=native_country:TX:13 col=label_IsOver50K_:R4:14 header=+"; - //var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); - //var datasetTrain = ImportTextData.ImportText(Env, - //new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); - //var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); - //var datasetTest = ImportTextData.ImportText(Env, - //new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); - //var prefix = "Microsoft.ML.Api.Experiment"; - //var requestedLearners = new[] { $"{prefix}.LogisticRegression", $"{prefix}.FastTree" }; - - //// Define entrypoint graph - //string inputGraph = @" - //{ - //'Nodes': [ - //{ - //'Name': 'Commands.PipelineSweep', - //'Inputs': { - //'TrainingData': '$TrainingData', - //'TestingData': '$TestingData', - //'StateArguments': { - //'Name': 'AutoMlState', - //'Settings': { - //'Metric': 'Auc', - //'Engine': { - //'Name': 'Rocket', - //'Settings' : { - //'TopKLearners' : 2, - //'SecondRoundTrialsPerLearner' : 0 - //}, - //}, - //'TerminatorArgs': { - //'Name': 'IterationLimited', - //'Settings': { - //'FinalHistoryLength': 35 - //} - //}, - //'TrainerKind': 'SignatureBinaryClassifierTrainer', - //'RequestedLearners' : [ - //'Microsoft.ML.Api.Experiment.LogisticRegression', - //'Microsoft.ML.Api.Experiment.FastTree' - //] - //} - //}, - //'BatchSize': 5 - //}, - //'Outputs': { - //'State': '$StateOut', - //'Results': '$ResultsOut' - //} - //}, - //] - //}"; - - //JObject graph = JObject.Parse(inputGraph); - //var catalog = ModuleCatalog.CreateInstance(Env); - - //var runner = new GraphRunner(Env, catalog, graph[FieldNames.Nodes] as JArray); - //runner.SetInput("TrainingData", datasetTrain); - //runner.SetInput("TestingData", datasetTest); - //runner.RunAll(); - - //var autoMlState = runner.GetOutput("StateOut"); - //Assert.IsNotNull(autoMlState); - //var space = autoMlState.GetSearchSpace(); - - //// Make sure only learners left are those retained. 
- //Assert.AreEqual(requestedLearners.Length, space.Item2.Length); - //Assert.IsTrue(space.Item2.All(l => requestedLearners.Any(r => r == l.LearnerName))); + // Get datasets + var pathData = GetDataPath(@"../../Samples/UCI", "adult.train"); + var pathDataTest = GetDataPath(@"../../Samples/UCI", "adult.test"); + const int numOfSampleRows = 100; + const string schema = + "sep=, col=Features:R4:0,2,4,10-12 col=workclass:TX:1 col=education:TX:3 col=marital_status:TX:5 col=occupation:TX:6 " + + "col=relationship:TX:7 col=race:TX:8 col=sex:TX:9 col=native_country:TX:13 col=label_IsOver50K_:R4:14 header=+"; + var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); +#pragma warning disable 0618 + var datasetTrain = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); + var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); + var datasetTest = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); + var requestedLearners = new[] { $"LogisticRegressionBinaryClassifier", $"FastTreeBinaryClassifier" }; +#pragma warning restore 0618 + // Define entrypoint graph + string inputGraph = @" + { + 'Nodes': [ + { + 'Name': 'Models.PipelineSweeper', + 'Inputs': { + 'TrainingData': '$TrainingData', + 'TestingData': '$TestingData', + 'StateArguments': { + 'Name': 'AutoMlState', + 'Settings': { + 'Metric': 'Auc', + 'Engine': { + 'Name': 'Rocket', + 'Settings' : { + 'TopKLearners' : 2, + 'SecondRoundTrialsPerLearner' : 0 + }, + }, + 'TerminatorArgs': { + 'Name': 'IterationLimited', + 'Settings': { + 'FinalHistoryLength': 35 + } + }, + 'TrainerKind': 'SignatureBinaryClassifierTrainer', + 'RequestedLearners' : [ + 'LogisticRegressionBinaryClassifier', + 'FastTreeBinaryClassifier' + ] + } + }, + 'BatchSize': 5 + }, + 'Outputs': { + 'State': '$StateOut', + 'Results': '$ResultsOut' + } + }, + ] + }"; + + JObject graph = JObject.Parse(inputGraph); + var catalog = ModuleCatalog.CreateInstance(Env); + + var runner = new GraphRunner(Env, catalog, graph[FieldNames.Nodes] as JArray); + runner.SetInput("TrainingData", datasetTrain); + runner.SetInput("TestingData", datasetTest); + runner.RunAll(); + + var autoMlState = runner.GetOutput("StateOut"); + Assert.NotNull(autoMlState); + var space = autoMlState.GetSearchSpace(); + + // Make sure only learners left are those retained. + Assert.Equal(requestedLearners.Length, space.Item2.Length); + Assert.True(space.Item2.All(l => requestedLearners.Any(r => r == l.LearnerName))); } - [Fact(Skip = "Need CoreTLC specific baseline update")] + [Fact] public void TestMinimizingMetricTransformations() { - //var values = new[] { 100d, 10d, -2d, -1d, 5.8d, -3.1d }; - //var maxWeight = values.Max(); - //var processed = values.Select(v => AutoMlUtils.ProcessWeight(v, maxWeight, false)); - //var expectedResult = new[] { 0d, 90d, 102d, 101d, 94.2d, 103.1d }; + var values = new[] { 100d, 10d, -2d, -1d, 5.8d, -3.1d }; + var maxWeight = values.Max(); + var processed = values.Select(v => AutoMlUtils.ProcessWeight(v, maxWeight, false)); + var expectedResult = new[] { 0d, 90d, 102d, 101d, 94.2d, 103.1d }; - //Assert.IsTrue(processed.Select((x, idx) => Math.Abs(x - expectedResult[idx]) < 0.001).All(r => r)); + Assert.True(processed.Select((x, idx) => System.Math.Abs(x - expectedResult[idx]) < 0.001).All(r => r)); } } }
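A minimal usage sketch of the extended results surface added above. The helper class name SweepResultInspection and the printing logic are illustrative only (not part of this change); it assumes `results` is the IDataView bound to the `ResultsOut` output of the `Models.PipelineSweeper` entry point, as in the tests.

    using System;
    using Microsoft.ML.Runtime;                   // IHostEnvironment
    using Microsoft.ML.Runtime.Data;              // IDataView
    using Microsoft.ML.Runtime.PipelineInference; // PipelinePattern

    public static class SweepResultInspection
    {
        // Reads every sweep result row and reports both the test and training metric per pipeline.
        public static void PrintSweepResults(IHostEnvironment env, IDataView results)
        {
            var rows = PipelinePattern.ExtractResults(env, results,
                "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel");

            foreach (var row in rows)
            {
                // TrainingMetricValue falls back to -1 (ToResultRow) or double.MinValue (ExtractRunSummary)
                // when no training metrics were recorded, so treat negative values as "not available".
                var train = row.TrainingMetricValue < 0 ? "n/a" : row.TrainingMetricValue.ToString("F4");
                Console.WriteLine($"{row.PipelineId}: test={row.MetricValue:F4} train={train}");
            }
        }
    }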