diff --git a/src/Microsoft.ML.PipelineInference/AutoInference.cs b/src/Microsoft.ML.PipelineInference/AutoInference.cs index 2c94c58348..73a358865f 100644 --- a/src/Microsoft.ML.PipelineInference/AutoInference.cs +++ b/src/Microsoft.ML.PipelineInference/AutoInference.cs @@ -208,6 +208,7 @@ public sealed class AutoMlMlState : IMlState private TransformInference.SuggestedTransform[] _availableTransforms; private RecipeInference.SuggestedRecipe.SuggestedLearner[] _availableLearners; private DependencyMap _dependencyMapping; + private RoleMappedData _dataRoles; public IPipelineOptimizer AutoMlEngine { get; set; } public PipelinePattern[] BatchCandidates { get; set; } public SupportedMetric Metric { get; } @@ -313,7 +314,7 @@ private void MainLearningLoop(int batchSize, int numOfTrainingRows) var currentBatchSize = batchSize; if (_terminator is IterationTerminator itr) currentBatchSize = Math.Min(itr.RemainingIterations(_history), batchSize); - var candidates = AutoMlEngine.GetNextCandidates(_sortedSampledElements.Values, currentBatchSize); + var candidates = AutoMlEngine.GetNextCandidates(_sortedSampledElements.Values, currentBatchSize, _dataRoles); // Break if no candidates returned, means no valid pipeline available. if (candidates.Length == 0) @@ -370,7 +371,7 @@ private TransformInference.SuggestedTransform[] InferAndFilter(IDataView data, T TransformInference.SuggestedTransform[] existingTransforms = null) { // Infer transforms using experts - var levelTransforms = TransformInference.InferTransforms(_env, data, args); + var levelTransforms = TransformInference.InferTransforms(_env, data, args, _dataRoles); // Retain only those transforms inferred which were also passed in. 
if (existingTransforms != null) @@ -378,11 +379,13 @@ private TransformInference.SuggestedTransform[] InferAndFilter(IDataView data, T return levelTransforms; } - public void InferSearchSpace(int numTransformLevels) + public void InferSearchSpace(int numTransformLevels, RoleMappedData dataRoles = null) { var learners = RecipeInference.AllowedLearners(_env, TrainerKind).ToArray(); if (_requestedLearners != null && _requestedLearners.Length > 0) learners = learners.Where(l => _requestedLearners.Contains(l.LearnerName)).ToArray(); + + _dataRoles = dataRoles; ComputeSearchSpace(numTransformLevels, learners, (b, c) => InferAndFilter(b, c)); } @@ -536,7 +539,21 @@ public PipelinePattern[] GetNextCandidates(int numberOfCandidates) var currentBatchSize = numberOfCandidates; if (_terminator is IterationTerminator itr) currentBatchSize = Math.Min(itr.RemainingIterations(_history), numberOfCandidates); - BatchCandidates = AutoMlEngine.GetNextCandidates(_sortedSampledElements.Select(kvp => kvp.Value), currentBatchSize); + BatchCandidates = AutoMlEngine.GetNextCandidates(_sortedSampledElements.Select(kvp => kvp.Value), currentBatchSize, _dataRoles); + + using (var ch = _host.Start("Suggested Pipeline")) + { + foreach (var pipeline in BatchCandidates) + { + ch.Info($"AutoInference Pipeline Id : {pipeline.UniqueId}"); + foreach (var transform in pipeline.Transforms) + { + ch.Info($"AutoInference Transform : {transform.Transform}"); + } + ch.Info($"AutoInference Learner : {pipeline.Learner}"); + } + } + return BatchCandidates; } diff --git a/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs b/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs index 1d106e4cef..19583cef8c 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using System.Linq; +using Microsoft.ML.Runtime.Data; using 
Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.PipelineInference; @@ -33,9 +34,10 @@ public DefaultsEngine(IHostEnvironment env, Arguments args) _currentLearnerIndex = 0; } - public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numCandidates) + public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numCandidates, RoleMappedData dataRoles) { var candidates = new List(); + DataRoles = dataRoles; while (candidates.Count < numCandidates) { @@ -53,7 +55,8 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable do { // Make sure transforms set is valid. Repeat until passes verifier. - pipeline = new PipelinePattern(SampleTransforms(out var transformsBitMask), learner, "", Env); + pipeline = new PipelinePattern(SampleTransforms(out var transformsBitMask), + learner, "", Env); valid = PipelineVerifier(pipeline, transformsBitMask); count++; } while (!valid && count <= 1000); @@ -77,7 +80,7 @@ private TransformInference.SuggestedTransform[] SampleTransforms(out long transf // Add final features concat transform. sampledTransforms.AddRange(AutoMlUtils.GetFinalFeatureConcat(Env, FullyTransformedData, - DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms)); + DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms, DataRoles)); return sampledTransforms.ToArray(); } diff --git a/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs b/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs index 80388eb53e..f06fa759e8 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs @@ -187,7 +187,7 @@ private TransformInference.SuggestedTransform[] SampleTransforms(RecipeInference // cause an error in verification, since it isn't included in the original // dependency mapping (i.e., its level isn't in the dictionary). 
sampledTransforms.AddRange(AutoMlUtils.GetFinalFeatureConcat(Env, FullyTransformedData, - DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms)); + DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms, DataRoles)); transformsBitMask = mask; return sampledTransforms.ToArray(); @@ -202,9 +202,10 @@ private RecipeInference.SuggestedRecipe.SuggestedLearner[] GetTopLearners(IEnume .Select(t=>AvailableLearners[t.Index]).ToArray(); } - public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numCandidates) + public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numCandidates, RoleMappedData dataRoles) { var prevCandidates = history.ToArray(); + DataRoles = dataRoles; switch (_currentStage) { @@ -220,7 +221,7 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable // number of candidates, using second stage logic. UpdateLearners(GetTopLearners(prevCandidates)); _currentStage++; - return GetNextCandidates(prevCandidates, numCandidates); + return GetNextCandidates(prevCandidates, numCandidates, DataRoles); } else return GetInitialPipelines(prevCandidates, remainingNum); @@ -252,9 +253,11 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable } } - private PipelinePattern[] GetInitialPipelines(IEnumerable history, int numCandidates) => - _secondaryEngines[_randomInit ? nameof(UniformRandomEngine) : nameof(DefaultsEngine)] - .GetNextCandidates(history, numCandidates); + private PipelinePattern[] GetInitialPipelines(IEnumerable history, int numCandidates) + { + var engine = _secondaryEngines[_randomInit ? 
nameof(UniformRandomEngine) : nameof(DefaultsEngine)]; + return engine.GetNextCandidates(history, numCandidates, DataRoles); + } private PipelinePattern[] NextCandidates(PipelinePattern[] history, int numCandidates, bool defaultHyperParams = false, bool uniformRandomTransforms = false) @@ -294,8 +297,9 @@ private PipelinePattern[] NextCandidates(PipelinePattern[] history, int numCandi do { // Make sure transforms set is valid and have not seen pipeline before. // Repeat until passes or runs out of chances. - pipeline = new PipelinePattern(SampleTransforms(learner, history, - out var transformsBitMask, uniformRandomTransforms), learner, "", Env); + pipeline = new PipelinePattern( + SampleTransforms(learner, history, out var transformsBitMask, uniformRandomTransforms), + learner, "", Env); hashKey = GetHashKey(transformsBitMask, learner); valid = PipelineVerifier(pipeline, transformsBitMask) && !VisitedPipelines.Contains(hashKey); count++; diff --git a/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs b/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs index 9f304312c2..23afce66ff 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using System.Linq; +using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.PipelineInference; @@ -30,8 +31,9 @@ public UniformRandomEngine(IHostEnvironment env) : base(env, env.Register("UniformRandomEngine(AutoML)")) {} - public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates) + public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates, RoleMappedData dataRoles) { + DataRoles = dataRoles; return GetRandomPipelines(numberOfCandidates); } @@ -66,7 +68,7 @@ private PipelinePattern[] GetRandomPipelines(int 
numOfPipelines) // Always include features concat transform selectedTransforms.AddRange(AutoMlUtils.GetFinalFeatureConcat(Env, FullyTransformedData, - DependencyMapping, selectedTransforms.ToArray(), AvailableTransforms)); + DependencyMapping, selectedTransforms.ToArray(), AvailableTransforms, DataRoles)); // Compute hash key for checking if we've already seen this pipeline. // However, if we keep missing, don't want to get stuck in infinite loop. diff --git a/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs b/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs index 6aec714618..5f028835e2 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs @@ -257,7 +257,7 @@ public static long TransformsToBitmask(TransformInference.SuggestedTransform[] t /// (In other words, if there would be nothing for that concatenate transform to do.) /// private static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHostEnvironment env, - IDataView dataSample, int[] excludedColumnIndices, int level, int atomicIdOffset) + IDataView dataSample, int[] excludedColumnIndices, int level, int atomicIdOffset, RoleMappedData dataRoles) { var finalArgs = new TransformInference.Arguments { @@ -266,7 +266,7 @@ private static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHo ExcludedColumnIndices = excludedColumnIndices }; - var featuresConcatTransforms = TransformInference.InferConcatNumericFeatures(env, dataSample, finalArgs); + var featuresConcatTransforms = TransformInference.InferConcatNumericFeatures(env, dataSample, finalArgs, dataRoles); for (int i = 0; i < featuresConcatTransforms.Length; i++) { @@ -282,7 +282,7 @@ private static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHo /// public static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHostEnvironment env, IDataView data, AutoInference.DependencyMap dependencyMapping, TransformInference.SuggestedTransform[] 
selectedTransforms, - TransformInference.SuggestedTransform[] allTransforms) + TransformInference.SuggestedTransform[] allTransforms, RoleMappedData dataRoles) { int level = 1; int atomicGroupLimit = 0; @@ -292,7 +292,7 @@ public static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHos atomicGroupLimit = allTransforms.Max(t => t.AtomicGroupId) + 1; } var excludedColumnIndices = GetExcludedColumnIndices(selectedTransforms, data, dependencyMapping); - return GetFinalFeatureConcat(env, data, excludedColumnIndices, level, atomicGroupLimit); + return GetFinalFeatureConcat(env, data, excludedColumnIndices, level, atomicGroupLimit, dataRoles); } public static IDataView ApplyTransformSet(IHostEnvironment env, IDataView data, TransformInference.SuggestedTransform[] transforms) diff --git a/src/Microsoft.ML.PipelineInference/InferenceUtils.cs b/src/Microsoft.ML.PipelineInference/InferenceUtils.cs index 8f31a792eb..311e98e75d 100644 --- a/src/Microsoft.ML.PipelineInference/InferenceUtils.cs +++ b/src/Microsoft.ML.PipelineInference/InferenceUtils.cs @@ -83,7 +83,7 @@ public static Type InferPredictorCategoryType(IDataView data, PurposeInference.C label.ItemKind == DataKind.TX || data.Schema.GetColumnType(label.ColumnIndex).IsKey) { - if (columns.Any(col => col.Purpose == ColumnPurpose.GroupId)) + if (columns.Any(col => col.Purpose == ColumnPurpose.Group)) return typeof(SignatureRankerTrainer); else return typeof(SignatureMultiClassClassifierTrainer); @@ -177,7 +177,7 @@ public enum ColumnPurpose CategoricalFeature = 4, TextFeature = 5, Weight = 6, - GroupId = 7, + Group = 7, ImagePath = 8 } } diff --git a/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs b/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs index 84603fc017..5fe46ec61e 100644 --- a/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs +++ b/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs @@ -21,7 +21,7 @@ namespace 
Microsoft.ML.Runtime.PipelineInference /// public interface IPipelineOptimizer { - PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates); + PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates, RoleMappedData dataRoles); void SetSpace(TransformInference.SuggestedTransform[] availableTransforms, RecipeInference.SuggestedRecipe.SuggestedLearner[] availableLearners, @@ -44,6 +44,7 @@ public abstract class PipelineOptimizerBase : IPipelineOptimizer protected IDataView OriginalData; protected IDataView FullyTransformedData; protected AutoInference.DependencyMap DependencyMapping; + protected RoleMappedData DataRoles; protected readonly IHostEnvironment Env; protected readonly IHost Host; protected readonly Dictionary TransformsMaskValidity; @@ -60,7 +61,7 @@ protected PipelineOptimizerBase(IHostEnvironment env, IHost host) ProbUtils = new SweeperProbabilityUtils(host); } - public abstract PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates); + public abstract PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates, RoleMappedData dataRoles); public virtual void SetSpace(TransformInference.SuggestedTransform[] availableTransforms, RecipeInference.SuggestedRecipe.SuggestedLearner[] availableLearners, diff --git a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs index 06c260a054..be51c1e695 100644 --- a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs +++ b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs @@ -36,6 +36,30 @@ public sealed class Arguments [Argument(ArgumentType.AtMostOnce, HelpText = "Output datasets from previous iteration of sweep.", SortOrder = 7, Hide = true)] public IDataView[] CandidateOutputs; + + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as Role 'Label'", SortOrder = 8)] + public string[] LabelColumns; + 
+ [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as Role 'Group'", SortOrder = 9)] + public string[] GroupColumns; + + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as Role 'Weight'", SortOrder = 10)] + public string[] WeightColumns; + + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as Role 'Name'", SortOrder = 11)] + public string[] NameColumns; + + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as Role 'NumericFeature'", SortOrder = 12)] + public string[] NumericFeatureColumns; + + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as Role 'CategoricalFeature'", SortOrder = 13)] + public string[] CategoricalFeatureColumns; + + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as Role 'TextFeature'", SortOrder = 14)] + public string[] TextFeatureColumns; + + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as Role 'ImagePath'", SortOrder = 15)] + public string[] ImagePathColumns; } public sealed class Output @@ -88,6 +112,77 @@ public static Output ExtractSweepResult(IHostEnvironment env, ResultInput input) return new Output { Results = outputView, State = autoMlState }; } + private static RoleMappedData GetDataRoles(IHostEnvironment env, Arguments input) + { + var roles = new List>(); + + if (input.LabelColumns != null) + { + env.Check(input.LabelColumns.Length == 1, "LabelColumns expected one column name to be specified."); + roles.Add(RoleMappedSchema.ColumnRole.Label.Bind(input.LabelColumns[0])); + } + + if (input.GroupColumns != null) + { + env.Check(input.GroupColumns.Length == 1, "GroupColumns expected one column name to be specified."); + roles.Add(RoleMappedSchema.ColumnRole.Group.Bind(input.GroupColumns[0])); + } + + if (input.WeightColumns != null) + { + env.Check(input.WeightColumns.Length == 1, "WeightColumns expected one column name to be specified."); + 
roles.Add(RoleMappedSchema.ColumnRole.Weight.Bind(input.WeightColumns[0])); + } + + if (input.NameColumns != null) + { + env.Check(input.NameColumns.Length == 1, "NameColumns expected one column name to be specified."); + roles.Add(RoleMappedSchema.ColumnRole.Name.Bind(input.NameColumns[0])); + } + + if (input.NumericFeatureColumns != null) + { + var numericFeature = new RoleMappedSchema.ColumnRole(ColumnPurpose.NumericFeature.ToString()); + foreach (var colName in input.NumericFeatureColumns) + { + var item = numericFeature.Bind(colName); + roles.Add(item); + } + } + + if (input.CategoricalFeatureColumns != null) + { + var categoricalFeature = new RoleMappedSchema.ColumnRole(ColumnPurpose.CategoricalFeature.ToString()); + foreach (var colName in input.CategoricalFeatureColumns) + { + var item = categoricalFeature.Bind(colName); + roles.Add(item); + } + } + + if (input.TextFeatureColumns != null) + { + var textFeature = new RoleMappedSchema.ColumnRole(ColumnPurpose.TextFeature.ToString()); + foreach (var colName in input.TextFeatureColumns) + { + var item = textFeature.Bind(colName); + roles.Add(item); + } + } + + if (input.ImagePathColumns != null) + { + var imagePath = new RoleMappedSchema.ColumnRole(ColumnPurpose.ImagePath.ToString()); + foreach (var colName in input.ImagePathColumns) + { + var item = imagePath.Bind(colName); + roles.Add(item); + } + } + + return new RoleMappedData(input.TrainingData, roles); + } + [TlcModule.EntryPoint(Desc = "AutoML pipeline sweeping optimzation macro.", Name = "Models.PipelineSweeper")] public static CommonOutputs.MacroOutput PipelineSweep( IHostEnvironment env, @@ -98,6 +193,9 @@ public static CommonOutputs.MacroOutput PipelineSweep( "Must have a valid AutoML State, or pass arguments to create one."); env.Check(input.BatchSize > 0, "Batch size must be > 0."); + // Get the user-defined column roles (if any) + var dataRoles = GetDataRoles(env, input); + // If no current state, create object and set data. 
if (input.State == null) { @@ -133,7 +231,7 @@ public static CommonOutputs.MacroOutput PipelineSweep( // Make sure search space is defined. If not, infer, // with default number of transform levels. if (!autoMlState.IsSearchSpaceDefined()) - autoMlState.InferSearchSpace(numTransformLevels: 1); + autoMlState.InferSearchSpace(numTransformLevels: 1, dataRoles); // Extract performance summaries and assign to previous candidate pipelines. foreach (var pipeline in autoMlState.BatchCandidates) diff --git a/src/Microsoft.ML.PipelineInference/PurposeInference.cs b/src/Microsoft.ML.PipelineInference/PurposeInference.cs index 7858f1d12b..b870b392f3 100644 --- a/src/Microsoft.ML.PipelineInference/PurposeInference.cs +++ b/src/Microsoft.ML.PipelineInference/PurposeInference.cs @@ -147,9 +147,9 @@ public void Apply(IChannel ch, IntermediateColumn[] columns) else if (Regex.IsMatch(column.ColumnName, @"^m_rating$", RegexOptions.IgnoreCase)) column.SuggestedPurpose = ColumnPurpose.Label; else if (Regex.IsMatch(column.ColumnName, @"^m_queryid$", RegexOptions.IgnoreCase)) - column.SuggestedPurpose = ColumnPurpose.GroupId; - else if (Regex.IsMatch(column.ColumnName, @"groupid", RegexOptions.IgnoreCase)) - column.SuggestedPurpose = ColumnPurpose.GroupId; + column.SuggestedPurpose = ColumnPurpose.Group; + else if (Regex.IsMatch(column.ColumnName, @"group", RegexOptions.IgnoreCase)) + column.SuggestedPurpose = ColumnPurpose.Group; else if (Regex.IsMatch(column.ColumnName, @"^m_\w+id$", RegexOptions.IgnoreCase)) column.SuggestedPurpose = ColumnPurpose.Name; else if (Regex.IsMatch(column.ColumnName, @"^id$", RegexOptions.IgnoreCase)) @@ -318,8 +318,10 @@ private static IEnumerable GetExperts() /// The data to use for inference. /// Indices of columns that we're interested in. /// Additional arguments to inference. + /// (Optional) User defined Role mappings for data. /// The result includes the array of auto-detected column purposes. 
-        public static InferenceResult InferPurposes(IHostEnvironment env, IDataView data, IEnumerable columnIndices, Arguments args)
+        public static InferenceResult InferPurposes(IHostEnvironment env, IDataView data, IEnumerable columnIndices, Arguments args,
+            RoleMappedData dataRoles = null)
         {
             Contracts.CheckValue(env, nameof(env));
             var host = env.Register("InferPurposes");
@@ -330,14 +332,40 @@ public static InferenceResult InferPurposes(IHostEnvironment env, IDataView data
             using (var ch = host.Start("InferPurposes"))
             {
                 var takenData = data.Take(args.MaxRowsToRead);
-                var cols = columnIndices.Select(x => new IntermediateColumn(takenData, x)).ToArray();
+                var cols = columnIndices.Select(x => new IntermediateColumn(takenData, x)).ToList();
                 data = takenData;
 
+                if (dataRoles != null)
+                {
+                    // Seed the suggested purposes from the user-supplied roles before the experts run.
+                    foreach (var item in dataRoles.Schema.GetColumnRoles())
+                    {
+                        // A role kind with no ColumnPurpose counterpart must not fall back to
+                        // the enum's default value; leave that column to the experts instead.
+                        if (!Enum.TryParse(item.Key.Value, out ColumnPurpose purpose) ||
+                            !Enum.IsDefined(typeof(ColumnPurpose), purpose))
+                        {
+                            ch.Warning($"Role '{item.Key.Value}' does not match any column purpose; ignoring it.");
+                            continue;
+                        }
+
+                        // The named column may be absent from the inspected column set.
+                        var col = cols.Find(x => x.ColumnName == item.Value.Name);
+                        if (col == null)
+                        {
+                            ch.Warning($"Column '{item.Value.Name}' with role '{item.Key.Value}' was not found; ignoring it.");
+                            continue;
+                        }
+
+                        col.SuggestedPurpose = purpose;
+                    }
+                }
+
                 foreach (var expert in GetExperts())
                 {
                     using (var expertChannel = host.Start(expert.GetType().ToString()))
                     {
-                        expert.Apply(expertChannel, cols);
+                        expert.Apply(expertChannel, cols.ToArray());
                         expertChannel.Done();
                     }
                 }
diff --git a/src/Microsoft.ML.PipelineInference/TransformInference.cs b/src/Microsoft.ML.PipelineInference/TransformInference.cs
index 4e02320451..d0475f8637 100644
--- a/src/Microsoft.ML.PipelineInference/TransformInference.cs
+++ b/src/Microsoft.ML.PipelineInference/TransformInference.cs
@@ -417,7 +417,7 @@ public sealed class GroupIdHashRename : TransformInferenceExpertBase
             {
                 public override IEnumerable Apply(IntermediateColumn[] columns, Arguments inferenceArgs, IChannel ch)
                 {
-                    var firstGroupColId = Array.FindIndex(columns, x => x.Purpose == ColumnPurpose.GroupId);
+                    var firstGroupColId = Array.FindIndex(columns, x => x.Purpose == ColumnPurpose.Group);
                     if (firstGroupColId < 0)
                         yield break;
 
@@ -1559,7 +1559,7 @@ public static
InferenceResult InferTransforms(IHostEnvironment env, IDataView da } } - public static SuggestedTransform[] InferTransforms(IHostEnvironment env, IDataView data, Arguments args) + public static SuggestedTransform[] InferTransforms(IHostEnvironment env, IDataView data, Arguments args, RoleMappedData dataRoles) { Contracts.CheckValue(env, nameof(env)); var h = env.Register("InferTransforms"); @@ -1576,7 +1576,7 @@ public static SuggestedTransform[] InferTransforms(IHostEnvironment env, IDataVi // Infer column purposes from data sample. var piArgs = new PurposeInference.Arguments { MaxRowsToRead = MaxRowsToRead }; var columnIndices = Enumerable.Range(0, dataSample.Schema.ColumnCount); - var piResult = PurposeInference.InferPurposes(env, dataSample, columnIndices, piArgs); + var piResult = PurposeInference.InferPurposes(env, dataSample, columnIndices, piArgs, dataRoles); var purposes = piResult.Columns; // Infer transforms @@ -1595,7 +1595,7 @@ public static SuggestedTransform[] InferTransforms(IHostEnvironment env, IDataVi .Contains(t.AtomicGroupId)).ToArray(); } - public static SuggestedTransform[] InferConcatNumericFeatures(IHostEnvironment env, IDataView data, Arguments args) + public static SuggestedTransform[] InferConcatNumericFeatures(IHostEnvironment env, IDataView data, Arguments args, RoleMappedData dataRoles) { Contracts.CheckValue(env, nameof(env)); var h = env.Register("InferConcatNumericFeatures"); @@ -1608,7 +1608,7 @@ public static SuggestedTransform[] InferConcatNumericFeatures(IHostEnvironment e // Infer column purposes from data sample. 
var piArgs = new PurposeInference.Arguments { MaxRowsToRead = MaxRowsToRead }; var columnIndices = Enumerable.Range(0, data.Schema.ColumnCount); - var piResult = PurposeInference.InferPurposes(env, data, columnIndices, piArgs); + var piResult = PurposeInference.InferPurposes(env, data, columnIndices, piArgs, dataRoles); var purposes = piResult.Columns; var cols = purposes.Where(x => !data.Schema.IsHidden(x.ColumnIndex) diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index b190750ca1..83723638e1 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -3420,6 +3420,46 @@ public sealed partial class PipelineSweeper /// public ArrayVar CandidateOutputs { get; set; } = new ArrayVar(); + /// + /// Column(s) to use as Role 'Label' + /// + public string[] LabelColumns { get; set; } + + /// + /// Column(s) to use as Role 'Group' + /// + public string[] GroupColumns { get; set; } + + /// + /// Column(s) to use as Role 'Weight' + /// + public string[] WeightColumns { get; set; } + + /// + /// Column(s) to use as Role 'Name' + /// + public string[] NameColumns { get; set; } + + /// + /// Column(s) to use as Role 'NumericFeature' + /// + public string[] NumericFeatureColumns { get; set; } + + /// + /// Column(s) to use as Role 'CategoricalFeature' + /// + public string[] CategoricalFeatureColumns { get; set; } + + /// + /// Column(s) to use as Role 'TextFeature' + /// + public string[] TextFeatureColumns { get; set; } + + /// + /// Column(s) to use as Role 'ImagePath' + /// + public string[] ImagePathColumns { get; set; } + public sealed class Output { diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index ed80555eb1..f7d73f54b4 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -2751,6 +2751,102 @@ "SortOrder": 7.0, "IsNullable": false, "Default": null + }, + 
{ + "Name": "LabelColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column(s) to use as Role 'Label'", + "Required": false, + "SortOrder": 8.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "GroupColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column(s) to use as Role 'Group'", + "Required": false, + "SortOrder": 9.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "WeightColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column(s) to use as Role 'Weight'", + "Required": false, + "SortOrder": 10.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "NameColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column(s) to use as Role 'Name'", + "Required": false, + "SortOrder": 11.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "NumericFeatureColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column(s) to use as Role 'NumericFeature'", + "Required": false, + "SortOrder": 12.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "CategoricalFeatureColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column(s) to use as Role 'CategoricalFeature'", + "Required": false, + "SortOrder": 13.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "TextFeatureColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column(s) to use as Role 'TextFeature'", + "Required": false, + "SortOrder": 14.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "ImagePathColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column(s) to use as Role 'ImagePath'", + "Required": false, + "SortOrder": 15.0, + "IsNullable": false, + "Default": null } ], "Outputs": [ diff --git a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs index 
77697eea9f..4d8ec880d1 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs @@ -272,6 +272,96 @@ public void EntryPointPipelineSweep() Assert.True(rows.All(r => r.TrainingMetricValue > 0.1)); } + [Fact] + [TestCategory("EntryPoints")] + public void EntryPointPipelineSweepRoles() + { + // Get datasets + var pathData = GetDataPath("adult.train"); + var pathDataTest = GetDataPath("adult.test"); + const int numOfSampleRows = 100; + int numIterations = 2; + const string schema = + "sep=, col=age:R4:0 col=workclass:TX:1 col=fnlwgt:R4:2 col=education:TX:3 col=education_num:R4:4 col=marital_status:TX:5 col=occupation:TX:6 " + + "col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=Features:R4:10-12 col=native_country:TX:13 col=IsOver50K_:R4:14 header=+"; + var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); +#pragma warning disable 0618 + var datasetTrain = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); + var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); + var datasetTest = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); +#pragma warning restore 0618 + + // Define entrypoint graph + string inputGraph = @" + { + 'Nodes': [ + { + 'Name': 'Models.PipelineSweeper', + 'Inputs': { + 'TrainingData': '$TrainingData', + 'TestingData': '$TestingData', + 'LabelColumns': ['IsOver50K_'], + 'WeightColumns': ['education_num'], + 'NameColumns': ['education'], + 'TextFeatureColumns': ['workclass', 'marital_status', 'occupation'], + 'StateArguments': { + 'Name': 'AutoMlState', + 'Settings': { + 'Metric': 'Auc', + 'Engine': { + 'Name': 'Defaults' + }, + 'TerminatorArgs': { + 'Name': 'IterationLimited', + 'Settings': { + 'FinalHistoryLength': 2 + } + }, + 'TrainerKind': 
'SignatureBinaryClassifierTrainer', + 'RequestedLearners' : [ + 'LogisticRegressionBinaryClassifier', + 'FastTreeBinaryClassifier' + ] + } + }, + 'BatchSize': 1 + }, + 'Outputs': { + 'State': '$StateOut', + 'Results': '$ResultsOut' + } + }, + ] + }"; + + JObject graphJson = JObject.Parse(inputGraph); + var catalog = ModuleCatalog.CreateInstance(Env); + var runner = new GraphRunner(Env, catalog, graphJson[FieldNames.Nodes] as JArray); + runner.SetInput("TrainingData", datasetTrain); + runner.SetInput("TestingData", datasetTest); + runner.RunAll(); + + var autoMlState = runner.GetOutput("StateOut"); + Assert.NotNull(autoMlState); + var allPipelines = autoMlState.GetAllEvaluatedPipelines(); + var bestPipeline = autoMlState.GetBestPipeline(); + Assert.Equal(allPipelines.Length, numIterations); + + var trainAuc = bestPipeline.PerformanceSummary.TrainingMetricValue; + var testAuc = bestPipeline.PerformanceSummary.MetricValue; + Assert.True((0.94 < trainAuc) && (trainAuc < 0.95)); + Assert.True((0.83 < testAuc) && (testAuc < 0.84)); + + var results = runner.GetOutput("ResultsOut"); + Assert.NotNull(results); + var rows = PipelinePattern.ExtractResults(Env, results, + "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); + Assert.True(rows.Length == numIterations); + Assert.True(rows.All(r => r.TrainingMetricValue > 0.1)); + } + [Fact] public void TestRocketPipelineEngine() {