From 47894d1e31c09e5e5305a0066d85b1637f6a9e31 Mon Sep 17 00:00:00 2001 From: Eric Erhardt Date: Tue, 4 Sep 2018 18:38:52 -0500 Subject: [PATCH] Remove SubComponent usage from ML.PipelineInference. Working towards #585 --- .../ExperimentsGenerator.cs | 14 ++- .../TransformInference.cs | 91 +++++++++++-------- 2 files changed, 63 insertions(+), 42 deletions(-) diff --git a/src/Microsoft.ML.PipelineInference/ExperimentsGenerator.cs b/src/Microsoft.ML.PipelineInference/ExperimentsGenerator.cs index c9621029d8..053b86a40b 100644 --- a/src/Microsoft.ML.PipelineInference/ExperimentsGenerator.cs +++ b/src/Microsoft.ML.PipelineInference/ExperimentsGenerator.cs @@ -101,7 +101,7 @@ public string ToStringRep(string sweeperType) public static List GenerateCandidates(IHostEnvironment env, string dataFile, string schemaDefinitionFile) { var patterns = new List(); - string loaderSettings = ""; + string loaderSettings; Type predictorType; TransformInference.InferenceResult inferenceResult; @@ -112,11 +112,17 @@ public static List GenerateCandidates(IHostEnvironment env, string dataFi // Exclude the hidden learners, and the metalinear learners. var trainers = ComponentCatalog.GetAllDerivedClasses(typeof(ITrainer), predictorType).Where(cls => !cls.IsHidden); - var loaderSubComponent = new SubComponent("TextLoader", loaderSettings); - string loader = $" loader={loaderSubComponent}"; + if (!string.IsNullOrEmpty(loaderSettings)) + { + StringBuilder sb = new StringBuilder(); + CmdQuoter.QuoteValue(loaderSettings, sb, true); + loaderSettings = sb.ToString(); + } + + string loader = $" loader=TextLoader{loaderSettings}"; // REVIEW: there are more learners than recipes atm. - // Flip looping through recipes, than through learners if the cardinality changes. + // Flip looping through recipes, then through learners if the cardinality changes. foreach (ComponentCatalog.LoadableClassInfo cl in trainers) { string learnerSettings; diff --git a/src/Microsoft.ML.PipelineInference/TransformInference.cs b/src/Microsoft.ML.PipelineInference/TransformInference.cs index b636c0d058..be84a681ac 100644 --- a/src/Microsoft.ML.PipelineInference/TransformInference.cs +++ b/src/Microsoft.ML.PipelineInference/TransformInference.cs @@ -49,7 +49,7 @@ public Arguments() public struct SuggestedTransform : IEquatable { public readonly string Description; - public readonly SubComponent Transform; + public readonly TransformString Transform; // Indicates the type of the transform. This is used by the recipe to leave/take transform. public readonly Type ExpertType; public TransformPipelineNode PipelineNode; @@ -61,7 +61,7 @@ public struct SuggestedTransform : IEquatable public bool AlwaysInclude { get; set; } public SuggestedTransform(string description, - SubComponent transform, Type expertType, + TransformString transform, Type expertType, TransformPipelineNode pipelineNode = null, int atomicGroupId = -1, ColumnRoutingStructure routingStructure = null, bool alwaysInclude = false) { @@ -92,6 +92,34 @@ public SuggestedTransform Clone() public override string ToString() => ExpertType.Name; } + public struct TransformString : IEquatable + { + public readonly string Kind; + public readonly string Settings; + + public TransformString(string kind, string settings) + { + Kind = kind ?? ""; + Settings = settings ?? ""; + } + + public bool Equals(TransformString other) + { + return Kind == other.Kind && + Settings == other.Settings; + } + + public override string ToString() + { + if (Settings.Length == 0) + return Kind; + + StringBuilder sb = new StringBuilder(); + CmdQuoter.QuoteValue(Settings, sb, true); + return Kind + sb.ToString(); + } + } + public struct InferenceResult { public readonly SuggestedTransform[] SuggestedTransforms; @@ -338,8 +366,7 @@ public override IEnumerable Apply(IntermediateColumn[] colum col.GetUniqueValueCounts(out var unique, out var _, out var _); ch.Info("Label column '{0}' is text. Suggested auto-labeling.", col.ColumnName); - var args = new SubComponent("AutoLabel", - new[] { columnArgument.ToString() }); + var args = new TransformString("AutoLabel", columnArgument.ToString()); string dest = DefaultColumnNames.Label; string source = columnNameQuoted.ToString(); @@ -382,8 +409,7 @@ public override IEnumerable Apply(IntermediateColumn[] colum { string dest = DefaultColumnNames.Label; string source = columnNameQuoted.ToString(); - var args = new SubComponent("Copy", - new[] { columnArgument.ToString() }); + var args = new TransformString("Copy", columnArgument.ToString()); var epInput = new ML.Transforms.ColumnCopier { Column = new[] @@ -443,8 +469,7 @@ public override IEnumerable Apply(IntermediateColumn[] colum { ch.Info("Group Id column '{0}' is text. Suggested hashing.", col.ColumnName); // REVIEW: we could potentially apply HashJoin to vectors of text. - var args = new SubComponent("Hash", - new[] { columnArgument.ToString() }); + var args = new TransformString("Hash", columnArgument.ToString()); string dest = DefaultColumnNames.GroupId; string source = columnNameQuoted.ToString(); var epInput = new ML.Transforms.CategoricalHashOneHotVectorizer @@ -476,8 +501,7 @@ public override IEnumerable Apply(IntermediateColumn[] colum else if (col.ColumnName != DefaultColumnNames.GroupId) { ch.Warning("Group Id column '{0}' is not text. Couldn't determine correct transformation."); - var args = new SubComponent("Copy", - new[] { columnArgument.ToString() }); + var args = new TransformString("Copy", columnArgument.ToString()); string dest = DefaultColumnNames.GroupId; string source = columnNameQuoted.ToString(); @@ -628,8 +652,7 @@ public override IEnumerable Apply(IntermediateColumn[] colum featureCols.AddRange(catColumns.Select(c => c.Name)); ch.Info("Suggested dictionary-based category encoding for categorical columns."); - var args = new SubComponent("Cat", - new[] { colSpecCat.ToString() }); + var args = new TransformString("Cat", colSpecCat.ToString()); yield return new SuggestedTransform("Convert categorical features to indicator vectors", args, GetType(), new TransformPipelineNode(epInput), -1, routingStructure); } @@ -646,8 +669,7 @@ public override IEnumerable Apply(IntermediateColumn[] colum featureCols.AddRange(catColumns.Select(c => c.Name)); ch.Info("Suggested hash-based category encoding for categorical columns."); - var args = new SubComponent("CatHash", - new[] { colSpecCatHash.ToString() }); + var args = new TransformString("CatHash", colSpecCatHash.ToString()); yield return new SuggestedTransform("Hash categorical features and convert to indicator vectors", args, GetType(), new TransformPipelineNode(epInput), -1, routingStructure); } @@ -718,8 +740,7 @@ public override IEnumerable Apply(IntermediateColumn[] colum if (columnArgument.Length > 0) { ch.Info("Suggested conversion to numeric for boolean features."); - var args = new SubComponent("Convert", - new[] { $"{columnArgument}type=R4" }); + var args = new TransformString("Convert", $"{columnArgument}type=R4"); var epInput = new ML.Transforms.ColumnTypeConverter { Column = epColumns.ToArray(), ResultType = ML.Data.DataKind.R4 }; ColumnRoutingStructure.AnnotatedName[] columnsSource = epColumns.Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = c.Name }).ToArray(); @@ -842,8 +863,11 @@ public static SuggestedTransform ConcatColumnsIntoOne(List columnNames, return new SuggestedTransform( $"Concatenate {columnsToConcat} columns into column {concatColumnName}", - new SubComponent("Concat", - new[] { arguments }), transformType, new TransformPipelineNode(epInput), -1, routingStructure); + new TransformString("Concat", arguments), + transformType, + new TransformPipelineNode(epInput), + -1, + routingStructure); } public static SuggestedTransform TextTransformUnigramTriChar(string srcColumn, string dstColumn, string arg, Type transformType) @@ -897,7 +921,7 @@ public static SuggestedTransform TextTransform(string srcColumn, string dstColum "Apply text-vectorize featurization(" + outputMsg + ") for column '{0}' and output to column '{1}'", srcColumn, dstColumn), - new SubComponent("Text", arg), + new TransformString("Text", arg), transformType, pipelineNode, -1, routingStructure); } } @@ -935,8 +959,7 @@ public override IEnumerable Apply(IntermediateColumn[] colum yield return InferenceHelpers.TextTransformUnigramTriChar(concatTextColumnName, featureTextColumn, " tokens=+", GetType()); //Get Tree Featurizer with FastTreeRegression. - var args = new SubComponent("TreeFeaturizationTransform", - new[] { "tr=FastTreeRegression feat=" + featureTextColumn }); + var args = new TransformString("TreeFeaturizationTransform", "tr=FastTreeRegression feat=" + featureTextColumn); // REVIEW: Once entrypoint defined for TreeFeaturizationTransform, add ep object. string treeFeaturizerOutputColumnName = "Leaves"; @@ -969,13 +992,11 @@ public override IEnumerable Apply(IntermediateColumn[] colum var routingStructureCr = new ColumnRoutingStructure(columnsSourceCr, columnsDestCr); yield return new SuggestedTransform("Concatenate-Rename Leaves column generated by tree featurizer to " + featuresTreeFeatColumn, - new SubComponent("Concat", - $"col={featuresTreeFeatColumn}:{treeFeaturizerOutputColumnName}"), + new TransformString("Concat", $"col={featuresTreeFeatColumn}:{treeFeaturizerOutputColumnName}"), GetType(), new TransformPipelineNode(epInput), -1, routingStructureCr); //Get TrainScore with KMeansPlusPlus. - args = new SubComponent("TrainScore", - new[] { "tr=KMeansPlusPlus feat=" + featureTextColumn }); + args = new TransformString("TrainScore", "tr=KMeansPlusPlus feat=" + featureTextColumn); // REVIEW: Need entrypoint for TrainScore, then add entrypoint pipeline object string kMeansOutputColumnName = "Score"; @@ -1008,8 +1029,7 @@ public override IEnumerable Apply(IntermediateColumn[] colum var routingStructureCc = new ColumnRoutingStructure(columnsSourceCc, columnsDestCc); yield return new SuggestedTransform("Concatenate-Rename Score column generated by Train Score with KMeans to " + featuresKMeansColumn, - new SubComponent("Concat", - $"col={featuresKMeansColumn}:{kMeansOutputColumnName}"), + new TransformString("Concat", $"col={featuresKMeansColumn}:{kMeansOutputColumnName}"), GetType(), new TransformPipelineNode(epInput2), -1, routingStructureCc); tempColumnList.Add(featureTextColumn); @@ -1058,8 +1078,7 @@ public override IEnumerable Apply(IntermediateColumn[] colum yield return InferenceHelpers.TextTransformUnigramTriChar(concatTextColumnName, concatTextColumnTextFeature, string.Empty, GetType()); //Get Tree Featurizer with FastTreeRegression. - var args = new SubComponent("TreeFeaturizationTransform", - new[] { "tr=FastForestRegression{shuffleLabels+ nl=80} feat=" + concatTextColumnTextFeature }); + var args = new TransformString("TreeFeaturizationTransform", "tr=FastForestRegression{shuffleLabels+ nl=80} feat=" + concatTextColumnTextFeature); string treeFeaturizerOutputColName = "Leaves"; ColumnRoutingStructure.AnnotatedName[] columnsSource = { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = concatTextColumnTextFeature} }; @@ -1119,8 +1138,7 @@ public override IEnumerable Apply(IntermediateColumn[] colum string columnDestRenamed = $"{columnNameSafe}{columnDestSuffix}"; var columnSourceDest = quoted ? $"col={{name={columnDestRenamed} src={columnNameSafe}}}" : $"col={columnDestRenamed}:{columnNameSafe}"; - var args = new SubComponent("Text", - new[] { columnSourceDest }); + var args = new TransformString("Text", columnSourceDest); featureCols.Add(columnDestRenamed); var epInput = new ML.Transforms.TextFeaturizer @@ -1268,7 +1286,7 @@ public override IEnumerable Apply(IntermediateColumn[] colum if (found) { string name = columnNameQuoted.ToString(); - var args = new SubComponent("NAHandle", new[] { columnArgument.ToString() }); + var args = new TransformString("NAHandle", columnArgument.ToString()); var epInput = new ML.Transforms.MissingValueHandler { Column = new[] @@ -1348,8 +1366,7 @@ public override IEnumerable Apply(IntermediateColumn[] colum else arguments = $"col={DefaultColumnNames.Features}:{string.Join(",", colList)}"; - var args = new SubComponent("Concat", - new[] { arguments }); + var args = new TransformString("Concat", arguments); var epInput = new ML.Transforms.ColumnConcatenator { Column = new[] @@ -1438,8 +1455,7 @@ public override IEnumerable Apply(IntermediateColumn[] colum columnArgument.AppendFormat("{0}:{1}", DefaultColumnNames.Name, colSpec); columnNameQuoted.AppendFormat("{0}", colSpec); } - var args = new SubComponent("Copy", - new[] { columnArgument.ToString() }); + var args = new TransformString("Copy", columnArgument.ToString()); var epInput = new ML.Transforms.ColumnCopier { Column = new[] @@ -1485,8 +1501,7 @@ public override IEnumerable Apply(IntermediateColumn[] colum arguments = $"col={{ name={DefaultColumnNames.Name} {quoutedArgument} }}"; else arguments = $"col={DefaultColumnNames.Name}:{string.Join(",", colSpecTextOnly)}"; - var args = new SubComponent("Concat", - new[] { arguments }); + var args = new TransformString("Concat", arguments); var epInput = new ML.Transforms.ColumnConcatenator { Column = new[]