From ab70ee01f2ac7291670cbe4e3b92c97f77536896 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Thu, 10 Jan 2019 16:29:05 -0800 Subject: [PATCH 1/6] swappng the order or arguments on the constructors of the ConversionExtensionsCatalog. Internalizing the constructors. --- .../Properties/AssemblyInfo.cs | 6 ++++++ .../ConversionsExtensionsCatalog.cs | 20 +++++++++---------- src/Microsoft.ML.Data/Transforms/Hashing.cs | 8 ++++---- .../Transforms/KeyToVector.cs | 6 +++--- .../Transforms/TypeConverting.cs | 8 ++++---- .../Transforms/ValueToKeyMappingEstimator.cs | 8 ++++---- test/Microsoft.ML.Benchmarks/RffTransform.cs | 2 +- .../FeatureContributionTests.cs | 2 +- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 2 +- .../PermutationFeatureImportanceTests.cs | 4 ++-- test/Microsoft.ML.Tests/RangeFilterTests.cs | 2 +- .../CookbookSamplesDynamicApi.cs | 4 ++-- .../Estimators/DecomposableTrainAndPredict.cs | 2 +- .../Scenarios/Api/Estimators/Extensibility.cs | 2 +- .../Api/Estimators/Metacomponents.cs | 2 +- ...PlantClassificationWithStringLabelTests.cs | 2 +- .../Scenarios/TensorflowTests.cs | 2 +- .../TensorflowTests.cs | 2 +- .../TrainerEstimators/MetalinearEstimators.cs | 2 +- .../TrainerEstimators/TrainerEstimators.cs | 2 +- 20 files changed, 47 insertions(+), 41 deletions(-) diff --git a/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs b/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs index abfaa6d2f9..f655581577 100644 --- a/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs @@ -5,9 +5,11 @@ using System.Runtime.CompilerServices; using Microsoft.ML; +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Benchmarks" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.TestFramework" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Tests" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Core.Tests" + 
PublicKey.TestValue)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.CpuMath.PerformanceTests" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.InferenceTesting" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.OnnxTransformTest" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Predictor.Tests" + PublicKey.TestValue)] @@ -20,6 +22,10 @@ [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Api" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Ensemble" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.FastTree" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.DnnImageFeaturizer.AlexNet" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.DnnImageFeaturizer.ResNet101" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.DnnImageFeaturizer.ResNet18" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.DnnImageFeaturizer.ResNet50" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.HalLearners" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.KMeansClustering" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.LightGBM" + PublicKey.Value)] diff --git a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs index d2481fb4be..746e9e7d4d 100644 --- a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs @@ -21,13 +21,13 @@ public static class ConversionsExtensionsCatalog /// /// The transform's catalog. /// Name of the input column. - /// Name of the column to be transformed. If this is null '' will be used. + /// Name of the column to be transformed. 
/// Number of bits to hash into. Must be between 1 and 31, inclusive. /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, string inputColumn, string outputColumn = null, + public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, string outputColumn, string inputColumn, int hashBits = HashDefaults.HashBits, int invertHash = HashDefaults.InvertHash) => new HashingEstimator(CatalogUtils.GetEnvironment(catalog), inputColumn, outputColumn, hashBits, invertHash); @@ -44,11 +44,11 @@ public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms /// /// The transform's catalog. /// Name of the input column. - /// Name of the column to be transformed. If this is null '' will be used. + /// Name of the column to be transformed. /// Number of bits to hash into. Must be between 1 and 31, inclusive. - public static TypeConvertingEstimator ConvertType(this TransformsCatalog.ConversionTransforms catalog, string inputColumn, string outputColumn = null, + public static TypeConvertingEstimator ConvertType(this TransformsCatalog.ConversionTransforms catalog, string outputColumn, string inputColumn, DataKind outputKind = ConvertDefaults.DefaultOutputKind) - => new TypeConvertingEstimator(CatalogUtils.GetEnvironment(catalog), inputColumn, outputColumn, outputKind); + => new TypeConvertingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumn, inputColumn, outputKind); /// /// Changes column type of the input column. 
@@ -92,24 +92,24 @@ public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog. /// The name of the output column. /// Whether bagging is used for the conversion. public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.ConversionTransforms catalog, - string inputColumn, string outputColumn = null, bool bag = KeyToVectorMappingEstimator.Defaults.Bag) - => new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), inputColumn, outputColumn, bag); + string outputColumn, string inputColumn, bool bag = KeyToVectorMappingEstimator.Defaults.Bag) + => new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumn, inputColumn, bag); /// /// Converts value types into . /// /// The categorical transform's catalog. /// Name of the column to be transformed. - /// Name of the output column. If this is null '' will be used. + /// Name of the output column. /// Maximum number of keys to keep per column when auto-training. /// How items should be ordered when vectorized. If choosen they will be in the order encountered. /// If , items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog, + string outputColumn, string inputColumn, - string outputColumn = null, int maxNumTerms = ValueToKeyMappingEstimator.Defaults.MaxNumTerms, ValueToKeyMappingTransformer.SortOrder sort = ValueToKeyMappingEstimator.Defaults.Sort) - => new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), inputColumn, outputColumn, maxNumTerms, sort); + => new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumn, inputColumn, maxNumTerms, sort); /// /// Converts value types into loading the keys to use from . 
diff --git a/src/Microsoft.ML.Data/Transforms/Hashing.cs b/src/Microsoft.ML.Data/Transforms/Hashing.cs index 95c7e855b5..7e93061bf9 100644 --- a/src/Microsoft.ML.Data/Transforms/Hashing.cs +++ b/src/Microsoft.ML.Data/Transforms/Hashing.cs @@ -1216,15 +1216,15 @@ internal static bool IsColumnTypeValid(ColumnType type) /// /// Host Environment. /// Name of the column to be transformed. - /// Name of the output column. If this is null '' will be used. + /// Name of the output column. /// Number of bits to hash into. Must be between 1 and 31, inclusive. /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public HashingEstimator(IHostEnvironment env, string inputColumn, string outputColumn = null, + internal HashingEstimator(IHostEnvironment env, string inputColumn, string outputColumn, int hashBits = Defaults.HashBits, int invertHash = Defaults.InvertHash) - : this(env, new HashingTransformer.ColumnInfo(inputColumn, outputColumn ?? inputColumn, hashBits: hashBits, invertHash: invertHash)) + : this(env, new HashingTransformer.ColumnInfo(inputColumn, outputColumn, hashBits: hashBits, invertHash: invertHash)) { } @@ -1233,7 +1233,7 @@ public HashingEstimator(IHostEnvironment env, string inputColumn, string outputC /// /// Host Environment. /// Description of dataset columns and how to process them. 
- public HashingEstimator(IHostEnvironment env, params HashingTransformer.ColumnInfo[] columns) + internal HashingEstimator(IHostEnvironment env, params HashingTransformer.ColumnInfo[] columns) { Contracts.CheckValue(env, nameof(env)); _host = env.Register(nameof(HashingEstimator)); diff --git a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs index c918dc6c5c..9f58e6e82e 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs @@ -744,13 +744,13 @@ internal static class Defaults public const bool Bag = false; } - public KeyToVectorMappingEstimator(IHostEnvironment env, params KeyToVectorMappingTransformer.ColumnInfo[] columns) + internal KeyToVectorMappingEstimator(IHostEnvironment env, params KeyToVectorMappingTransformer.ColumnInfo[] columns) : this(env, new KeyToVectorMappingTransformer(env, columns)) { } - public KeyToVectorMappingEstimator(IHostEnvironment env, string inputColumn, string outputColumn = null, bool bag = Defaults.Bag) - : this(env, new KeyToVectorMappingTransformer(env, new KeyToVectorMappingTransformer.ColumnInfo(inputColumn, outputColumn ?? inputColumn, bag))) + internal KeyToVectorMappingEstimator(IHostEnvironment env, string outputColumn, string inputColumn, bool bag = Defaults.Bag) + : this(env, new KeyToVectorMappingTransformer(env, new KeyToVectorMappingTransformer.ColumnInfo(inputColumn, outputColumn, bag))) { } diff --git a/src/Microsoft.ML.Data/Transforms/TypeConverting.cs b/src/Microsoft.ML.Data/Transforms/TypeConverting.cs index 3b9a3b372b..06efae4e0b 100644 --- a/src/Microsoft.ML.Data/Transforms/TypeConverting.cs +++ b/src/Microsoft.ML.Data/Transforms/TypeConverting.cs @@ -538,17 +538,17 @@ internal sealed class Defaults /// Name of the input column. /// Name of the output column. /// The expected type of the converted column. 
- public TypeConvertingEstimator(IHostEnvironment env, - string inputColumn, string outputColumn = null, + internal TypeConvertingEstimator(IHostEnvironment env, + string outputColumn, string inputColumn, DataKind outputKind = Defaults.DefaultOutputKind) - : this(env, new TypeConvertingTransformer.ColumnInfo(inputColumn, outputColumn ?? inputColumn, outputKind)) + : this(env, new TypeConvertingTransformer.ColumnInfo(inputColumn, outputColumn, outputKind)) { } /// /// Create a that takes multiple pairs of columns. /// - public TypeConvertingEstimator(IHostEnvironment env, params TypeConvertingTransformer.ColumnInfo[] columns) : + internal TypeConvertingEstimator(IHostEnvironment env, params TypeConvertingTransformer.ColumnInfo[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(TypeConvertingEstimator)), new TypeConvertingTransformer(env, columns)) { } diff --git a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs index ea71b7a45b..d6aa366d43 100644 --- a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs +++ b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs @@ -28,16 +28,16 @@ public static class Defaults /// /// Host Environment. /// Name of the column to be transformed. - /// Name of the output column. If this is null '' will be used. + /// Name of the output column. /// Maximum number of keys to keep per column when auto-training. /// How items should be ordered when vectorized. If choosen they will be in the order encountered. /// If , items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). 
- public ValueToKeyMappingEstimator(IHostEnvironment env, string inputColumn, string outputColumn = null, int maxNumTerms = Defaults.MaxNumTerms, ValueToKeyMappingTransformer.SortOrder sort = Defaults.Sort) : - this(env, new [] { new ValueToKeyMappingTransformer.ColumnInfo(inputColumn, outputColumn ?? inputColumn, maxNumTerms, sort) }) + internal ValueToKeyMappingEstimator(IHostEnvironment env, string outputColumn, string inputColumn, int maxNumTerms = Defaults.MaxNumTerms, ValueToKeyMappingTransformer.SortOrder sort = Defaults.Sort) : + this(env, new [] { new ValueToKeyMappingTransformer.ColumnInfo(inputColumn, outputColumn, maxNumTerms, sort) }) { } - public ValueToKeyMappingEstimator(IHostEnvironment env, ValueToKeyMappingTransformer.ColumnInfo[] columns, + internal ValueToKeyMappingEstimator(IHostEnvironment env, ValueToKeyMappingTransformer.ColumnInfo[] columns, string file = null, string termsColumn = null, IComponentFactory loaderFactory = null) { diff --git a/test/Microsoft.ML.Benchmarks/RffTransform.cs b/test/Microsoft.ML.Benchmarks/RffTransform.cs index 7744addefb..fefabd3f16 100644 --- a/test/Microsoft.ML.Benchmarks/RffTransform.cs +++ b/test/Microsoft.ML.Benchmarks/RffTransform.cs @@ -46,7 +46,7 @@ public void CV_Multiclass_Digits_RffTransform_OVAAveragedPerceptron() var pipeline = mlContext.Transforms.Projection.CreateRandomFourierFeatures("Features", "FeaturesRFF") .AppendCacheCheckpoint(mlContext) .Append(mlContext.Transforms.Concatenate("Features", "FeaturesRFF")) - .Append(new ValueToKeyMappingEstimator(mlContext, "Label")) + .Append(new ValueToKeyMappingEstimator(mlContext, "Label", "Label")) .Append(mlContext.MulticlassClassification.Trainers.OneVersusAll(mlContext.BinaryClassification.Trainers.AveragedPerceptron(numIterations: 10))); var cvResults = mlContext.MulticlassClassification.CrossValidate(data, pipeline, numFolds: 5); diff --git a/test/Microsoft.ML.Tests/FeatureContributionTests.cs b/test/Microsoft.ML.Tests/FeatureContributionTests.cs 
index e41cf6954d..cfcd46f9f0 100644 --- a/test/Microsoft.ML.Tests/FeatureContributionTests.cs +++ b/test/Microsoft.ML.Tests/FeatureContributionTests.cs @@ -269,7 +269,7 @@ private IDataView GetSparseDataset(TaskType task = TaskType.Regression, int numb // Create a keytype for Ranking if (task == TaskType.Ranking) - return pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId")) + return pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId", "GroupId")) .Fit(srcDV).Transform(srcDV); return pipeline.Fit(srcDV).Transform(srcDV); diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 5133a58add..2e7df19194 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -371,7 +371,7 @@ public void MulticlassLogisticRegressionOnnxConversionTest() separatorChar: '\t'); var pipeline = mlContext.Transforms.Normalize("Features"). - Append(mlContext.Transforms.Conversion.MapValueToKey("Label")). + Append(mlContext.Transforms.Conversion.MapValueToKey("Label", "Label")). 
Append(mlContext.MulticlassClassification.Trainers.LogisticRegression(labelColumn: "Label", featureColumn: "Features", advancedSettings: settings => { diff --git a/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs b/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs index 118145b3b7..df1690aa3c 100644 --- a/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs +++ b/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs @@ -420,7 +420,7 @@ private IDataView GetDenseDataset(TaskType task = TaskType.Regression) // Create a keytype for Ranking if (task == TaskType.Ranking) - return pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId")) + return pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId", "GroupId")) .Fit(srcDV).Transform(srcDV); return pipeline.Fit(srcDV).Transform(srcDV); @@ -496,7 +496,7 @@ private IDataView GetSparseDataset(TaskType task = TaskType.Regression) // Create a keytype for Ranking if (task == TaskType.Ranking) - return pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId")) + return pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId", "GroupId")) .Fit(srcDV).Transform(srcDV); return pipeline.Fit(srcDV).Transform(srcDV); diff --git a/test/Microsoft.ML.Tests/RangeFilterTests.cs b/test/Microsoft.ML.Tests/RangeFilterTests.cs index 131bde9fa7..5c440b7d9f 100644 --- a/test/Microsoft.ML.Tests/RangeFilterTests.cs +++ b/test/Microsoft.ML.Tests/RangeFilterTests.cs @@ -28,7 +28,7 @@ public void RangeFilterTest() var cnt = data1.GetColumn(ML, "Floats").Count(); Assert.Equal(2L, cnt); - data = ML.Transforms.Conversion.Hash("Strings", "Key", hashBits: 20).Fit(data).Transform(data); + data = ML.Transforms.Conversion.Hash("Key", "Strings", hashBits: 20).Fit(data).Transform(data); var data2 = ML.Data.FilterByKeyColumnFraction(data, "Key", upperBound: 0.5); cnt = data2.GetColumn(ML, "Floats").Count(); Assert.Equal(1L, cnt); diff --git 
a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index 2d00f36957..286b758127 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -160,7 +160,7 @@ private ITransformer TrainOnIris(string irisDataPath) // Concatenate all the features together into one column 'Features'. mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") // Note that the label is text, so it needs to be converted to key. - .Append(mlContext.Transforms.Conversion.MapValueToKey("Label"), TransformerScope.TrainTest) + .Append(mlContext.Transforms.Conversion.MapValueToKey("Label", "Label"), TransformerScope.TrainTest) // Cache data in memory for steps after the cache check point stage. .AppendCacheCheckpoint(mlContext) // Use the multi-class SDCA model to predict the label using features. @@ -389,7 +389,7 @@ private void CrossValidationOn(string dataPath) // Concatenate all the features together into one column 'Features'. mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") // Note that the label is text, so it needs to be converted to key. - .Append(mlContext.Transforms.Conversion.MapValueToKey("Label"), TransformerScope.TrainTest) + .Append(mlContext.Transforms.Conversion.MapValueToKey("Label", "Label"), TransformerScope.TrainTest) // Cache data in memory so that SDCA trainer will be able to randomly access training examples without // reading data from disk multiple times. Data will be cached at its first use in any downstream step. // Notice that unused part in the data may not be cached. 
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs index 94f53e65f2..00f9281b75 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs @@ -30,7 +30,7 @@ void DecomposableTrainAndPredict() var data = ml.Data.ReadFromTextFile(dataPath, separatorChar: ','); var pipeline = new ColumnConcatenatingEstimator (ml, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") - .Append(new ValueToKeyMappingEstimator(ml, "Label"), TransformerScope.TrainTest) + .Append(new ValueToKeyMappingEstimator(ml, "Label", "Label"), TransformerScope.TrainTest) .Append(ml.MulticlassClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features",advancedSettings: s => { s.MaxIterations = 100; s.Shuffle = true; s.NumThreads = 1; })) .Append(new KeyToValueMappingEstimator(ml, "PredictedLabel")); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs index 84bd6691e9..105cceb885 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs @@ -39,7 +39,7 @@ void Extensibility() }; var pipeline = new ColumnConcatenatingEstimator (ml, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") .Append(new CustomMappingEstimator(ml, action, null), TransformerScope.TrainTest) - .Append(new ValueToKeyMappingEstimator(ml, "Label"), TransformerScope.TrainTest) + .Append(new ValueToKeyMappingEstimator(ml, "Label", "Label"), TransformerScope.TrainTest) .Append(ml.MulticlassClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features", advancedSettings: (s) => { s.MaxIterations = 100; s.Shuffle = true; s.NumThreads = 1; })) .Append(new 
KeyToValueMappingEstimator(ml, "PredictedLabel")); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs index 70b6b0bbb5..3921de5fd1 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs @@ -27,7 +27,7 @@ public void Metacomponents() var sdcaTrainer = ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features", advancedSettings: (s) => { s.MaxIterations = 100; s.Shuffle = true; s.NumThreads = 1; }); var pipeline = new ColumnConcatenatingEstimator (ml, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") - .Append(new ValueToKeyMappingEstimator(ml, "Label"), TransformerScope.TrainTest) + .Append(new ValueToKeyMappingEstimator(ml, "Label", "Label"), TransformerScope.TrainTest) .Append(new Ova(ml, sdcaTrainer)) .Append(new KeyToValueMappingEstimator(ml, "PredictedLabel")); diff --git a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs index ff38fbebe5..7fcc88397c 100644 --- a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs @@ -34,7 +34,7 @@ public void TrainAndPredictIrisModelWithStringLabelTest() // Create Estimator var pipe = mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") .Append(mlContext.Transforms.Normalize("Features")) - .Append(mlContext.Transforms.Conversion.MapValueToKey("IrisPlantType", "Label"), TransformerScope.TrainTest) + .Append(mlContext.Transforms.Conversion.MapValueToKey("Label","IrisPlantType"), TransformerScope.TrainTest) .AppendCacheCheckpoint(mlContext) 
.Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features", advancedSettings: s => s.NumThreads = 1)) .Append(mlContext.Transforms.Conversion.MapKeyToValue(("PredictedLabel", "Plant"))); diff --git a/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs b/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs index 430ae1c28e..f1e42ca284 100644 --- a/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs @@ -39,7 +39,7 @@ public void TensorFlowTransforCifarEndToEndTest() .Append(new ImagePixelExtractingEstimator(mlContext, "ImageCropped", "Input", interleave: true)) .Append(new TensorFlowEstimator(mlContext, model_location, new[] { "Input" }, new[] { "Output" })) .Append(new ColumnConcatenatingEstimator(mlContext, "Features", "Output")) - .Append(new ValueToKeyMappingEstimator(mlContext, "Label")) + .Append(new ValueToKeyMappingEstimator(mlContext, "Label", "Label")) .AppendCacheCheckpoint(mlContext) .Append(new SdcaMultiClassTrainer(mlContext)); diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index 441fcc4fc1..18a53becf1 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -444,7 +444,7 @@ public void TensorFlowTransformMNISTLRTrainingTest() ReTrain = true })) .Append(mlContext.Transforms.Concatenate("Features", "Prediction")) - .Append(mlContext.Transforms.Conversion.MapValueToKey("Label", "KeyLabel", maxNumTerms: 10)) + .Append(mlContext.Transforms.Conversion.MapValueToKey("KeyLabel", "Label", maxNumTerms: 10)) .Append(mlContext.MulticlassClassification.Trainers.LightGbm("KeyLabel", "Features")); var trainedModel = pipe.Fit(trainData); diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs 
b/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs index 8371e3e415..164b48068a 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs @@ -76,7 +76,7 @@ public void MetacomponentsFeaturesRenamed() var sdcaTrainer = new SdcaBinaryTrainer(Env, "Label", "Vars", advancedSettings: (s) => { s.MaxIterations = 100; s.Shuffle = true; s.NumThreads = 1; }); var pipeline = new ColumnConcatenatingEstimator(Env, "Vars", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") - .Append(new ValueToKeyMappingEstimator(Env, "Label"), TransformerScope.TrainTest) + .Append(new ValueToKeyMappingEstimator(Env, "Label", "Label"), TransformerScope.TrainTest) .Append(new Ova(Env, sdcaTrainer)) .Append(new KeyToValueMappingEstimator(Env, "PredictedLabel")); diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TrainerEstimators.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TrainerEstimators.cs index 85af744ac7..ae02d6fc39 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TrainerEstimators.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TrainerEstimators.cs @@ -193,7 +193,7 @@ private TextLoader.Arguments GetIrisLoaderArgs() } }).Read(GetDataPath(IrisDataPath)); - var pipeline = new ValueToKeyMappingEstimator(Env, "Label"); + var pipeline = new ValueToKeyMappingEstimator(Env, "Label", "Label"); return (pipeline, data); } From 07fba205f56aecc681694fcd274bc56c22bc9d62 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Sun, 13 Jan 2019 00:22:56 -0800 Subject: [PATCH 2/6] Addressing PR comments --- .../Dynamic/KeyToValue_Term.cs | 20 ++++----- .../Commands/CrossValidationCommand.cs | 2 +- src/Microsoft.ML.Data/TrainContext.cs | 2 +- .../ConversionsExtensionsCatalog.cs | 41 +++++++------------ src/Microsoft.ML.Data/Transforms/Hashing.cs | 19 +++++---- .../Transforms/KeyToVector.cs | 17 ++++---- .../Transforms/TypeConverting.cs | 17 ++++---- 
.../Transforms/ValueToKeyMappingEstimator.cs | 4 +- .../TransformsStatic.cs | 4 +- src/Microsoft.ML.Transforms/OneHotEncoding.cs | 2 +- .../OneHotHashEncoding.cs | 4 +- .../Text/WordHashBagProducingTransform.cs | 4 +- test/Microsoft.ML.Benchmarks/HashBench.cs | 2 +- .../Transformers/ConvertTests.cs | 36 ++++++++-------- .../Transformers/HashTests.cs | 34 +++++++-------- .../Transformers/KeyToVectorEstimatorTests.cs | 28 ++++++------- 16 files changed, 116 insertions(+), 120 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs index 6d3deb9f84..db09018ffe 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs @@ -12,11 +12,11 @@ public static void KeyToValue_Term() { // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, // as well as the source of randomness. - var ml = new MLContext(); + var mlContext = new MLContext(); // Get a small dataset as an IEnumerable. IEnumerable data = SamplesUtils.DatasetUtils.GetTopicsData(); - var trainData = ml.CreateStreamingDataView(data); + var trainData = mlContext.CreateStreamingDataView(data); // Preview of one of the columns of the the topics data. // The Review column contains the keys associated with a particular body of text. @@ -31,16 +31,16 @@ public static void KeyToValue_Term() // making use of default settings. string defaultColumnName = "DefaultKeys"; // REVIEW create through the catalog extension - var default_pipeline = new WordTokenizingEstimator(ml, "Review") - .Append(new ValueToKeyMappingEstimator(ml, "Review", defaultColumnName)); + var default_pipeline = mlContext.Transforms.Text.TokenizeWords("Review") + .Append(mlContext.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review")); // Another pipeline, that customizes the advanced settings of the TermEstimator. 
// We can change the maxNumTerm to limit how many keys will get generated out of the set of words, // and condition the order in which they get evaluated by changing sort from the default Occurence (order in which they get encountered) // to value/alphabetically. string customizedColumnName = "CustomizedKeys"; - var customized_pipeline = new WordTokenizingEstimator(ml, "Review") - .Append(new ValueToKeyMappingEstimator(ml, "Review", customizedColumnName, maxNumTerms: 10, sort: ValueToKeyMappingTransformer.SortOrder.Value)); + var customized_pipeline = mlContext.Transforms.Text.TokenizeWords("Review") + .Append(mlContext.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maxNumTerms: 10, sort: ValueToKeyMappingTransformer.SortOrder.Value)); // The transformed data. var transformedData_default = default_pipeline.Fit(trainData).Transform(trainData); @@ -61,7 +61,7 @@ public static void KeyToValue_Term() }; // Preview of the DefaultKeys column obtained after processing the input. - var defaultColumn = transformedData_default.GetColumn>(ml, defaultColumnName); + var defaultColumn = transformedData_default.GetColumn>(mlContext, defaultColumnName); printHelper(defaultColumnName, defaultColumn); // DefaultKeys column obtained post-transformation. @@ -72,7 +72,7 @@ public static void KeyToValue_Term() // 9 10 11 12 13 6 // Previewing the CustomizedKeys column obtained after processing the input. - var customizedColumn = transformedData_customized.GetColumn>(ml, customizedColumnName); + var customizedColumn = transformedData_customized.GetColumn>(mlContext, customizedColumnName); printHelper(customizedColumnName, customizedColumn); // CustomizedKeys column obtained post-transformation. @@ -84,11 +84,11 @@ public static void KeyToValue_Term() // Retrieve the original values, by appending the KeyToValue etimator to the existing pipelines // to convert the keys back to the strings. 
- var pipeline = default_pipeline.Append(new KeyToValueMappingEstimator(ml, defaultColumnName)); + var pipeline = default_pipeline.Append(new KeyToValueMappingEstimator(mlContext, defaultColumnName)); transformedData_default = pipeline.Fit(trainData).Transform(trainData); // Preview of the DefaultColumnName column obtained. - var originalColumnBack = transformedData_default.GetColumn>>(ml, defaultColumnName); + var originalColumnBack = transformedData_default.GetColumn>>(mlContext, defaultColumnName); foreach (var row in originalColumnBack) { diff --git a/src/Microsoft.ML.Data/Commands/CrossValidationCommand.cs b/src/Microsoft.ML.Data/Commands/CrossValidationCommand.cs index 42c5a3fe99..6411002e2e 100644 --- a/src/Microsoft.ML.Data/Commands/CrossValidationCommand.cs +++ b/src/Microsoft.ML.Data/Commands/CrossValidationCommand.cs @@ -330,7 +330,7 @@ private string GetSplitColumn(IChannel ch, IDataView input, ref IDataView output int inc = 0; while (input.Schema.TryGetColumnIndex(stratificationColumn, out tmp)) stratificationColumn = string.Format("{0}_{1:000}", origStratCol, ++inc); - output = new HashingEstimator(Host, origStratCol, stratificationColumn, 30).Fit(input).Transform(input); + output = new HashingEstimator(Host, stratificationColumn, origStratCol, 30).Fit(input).Transform(input); } } diff --git a/src/Microsoft.ML.Data/TrainContext.cs b/src/Microsoft.ML.Data/TrainContext.cs index b0deb9b24b..18e71bf640 100644 --- a/src/Microsoft.ML.Data/TrainContext.cs +++ b/src/Microsoft.ML.Data/TrainContext.cs @@ -151,7 +151,7 @@ private void EnsureStratificationColumn(ref IDataView data, ref string stratific // Generate a new column with the hashed stratification column. 
while (data.Schema.TryGetColumnIndex(stratificationColumn, out tmp)) stratificationColumn = string.Format("{0}_{1:000}", origStratCol, ++inc); - data = new HashingEstimator(Host, origStratCol, stratificationColumn, 30).Fit(data).Transform(data); + data = new HashingEstimator(Host, stratificationColumn, origStratCol, 30).Fit(data).Transform(data); } } } diff --git a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs index 746e9e7d4d..a1b41d1d38 100644 --- a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs @@ -20,16 +20,17 @@ public static class ConversionsExtensionsCatalog /// Hashes the values in the input column. /// /// The transform's catalog. - /// Name of the input column. /// Name of the column to be transformed. + /// Name of the input column. If set to , the value of the + /// will be used as input. /// Number of bits to hash into. Must be between 1 and 31, inclusive. /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. 
- public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, string outputColumn, string inputColumn, + public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, string outputColumn, string inputColumn = null, int hashBits = HashDefaults.HashBits, int invertHash = HashDefaults.InvertHash) - => new HashingEstimator(CatalogUtils.GetEnvironment(catalog), inputColumn, outputColumn, hashBits, invertHash); + => new HashingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumn, inputColumn ?? outputColumn, hashBits, invertHash); /// /// Hashes the values in the input column. @@ -43,10 +44,11 @@ public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms /// Changes column type of the input column. /// /// The transform's catalog. - /// Name of the input column. /// Name of the column to be transformed. + /// Name of the input column. If set to , the value of the + /// will be used as input. /// Number of bits to hash into. Must be between 1 and 31, inclusive. - public static TypeConvertingEstimator ConvertType(this TransformsCatalog.ConversionTransforms catalog, string outputColumn, string inputColumn, + public static TypeConvertingEstimator ConvertType(this TransformsCatalog.ConversionTransforms catalog, string outputColumn, string inputColumn = null, DataKind outputKind = ConvertDefaults.DefaultOutputKind) => new TypeConvertingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumn, inputColumn, outputKind); @@ -88,44 +90,31 @@ public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog. /// Convert the key types back to their original vectors. /// /// The categorical transform's catalog. - /// The name of the input column. + /// The name of the input column. If set to , the value of the + /// will be used as input. /// The name of the output column. /// Whether bagging is used for the conversion. 
public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.ConversionTransforms catalog, - string outputColumn, string inputColumn, bool bag = KeyToVectorMappingEstimator.Defaults.Bag) - => new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumn, inputColumn, bag); + string outputColumn, string inputColumn = null, bool bag = KeyToVectorMappingEstimator.Defaults.Bag) + => new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumn, inputColumn ?? outputColumn, bag); /// /// Converts value types into . /// /// The categorical transform's catalog. - /// Name of the column to be transformed. + /// Name of the column to be transformed. If set to , the value of the + /// will be used as input. /// Name of the output column. /// Maximum number of keys to keep per column when auto-training. /// How items should be ordered when vectorized. If choosen they will be in the order encountered. /// If , items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog, string outputColumn, - string inputColumn, + string inputColumn = null, int maxNumTerms = ValueToKeyMappingEstimator.Defaults.MaxNumTerms, ValueToKeyMappingTransformer.SortOrder sort = ValueToKeyMappingEstimator.Defaults.Sort) => new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumn, inputColumn, maxNumTerms, sort); - /// - /// Converts value types into loading the keys to use from . - /// - /// The categorical transform's catalog. - /// The data columns to map to keys. - /// The path of the file containing the terms. 
- /// - /// - public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog, - ValueToKeyMappingTransformer.ColumnInfo[] columns, - string file = null, - string termsColumn = null, - IComponentFactory loaderFactory = null) - => new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), columns, file, termsColumn, loaderFactory); - /// /// Maps specified keys to specified values /// @@ -141,7 +130,7 @@ public static ValueMappingEstimator ValueMap keys, IEnumerable values, - params (string source, string name)[] columns) + params (string inputColumn, string outputColumn)[] columns) => new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), keys, values, columns); } } diff --git a/src/Microsoft.ML.Data/Transforms/Hashing.cs b/src/Microsoft.ML.Data/Transforms/Hashing.cs index 7e93061bf9..402c8fb3f3 100644 --- a/src/Microsoft.ML.Data/Transforms/Hashing.cs +++ b/src/Microsoft.ML.Data/Transforms/Hashing.cs @@ -135,8 +135,8 @@ public sealed class ColumnInfo /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public ColumnInfo(string input, - string output = null, + public ColumnInfo(string output, + string input = null, int hashBits = HashingEstimator.Defaults.HashBits, uint seed = HashingEstimator.Defaults.Seed, bool ordered = HashingEstimator.Defaults.Ordered, @@ -146,7 +146,7 @@ public ColumnInfo(string input, throw Contracts.ExceptParam(nameof(invertHash), "Value too small, must be -1 or larger"); if (invertHash != 0 && hashBits >= 31) throw Contracts.ExceptParam(nameof(hashBits), $"Cannot support invertHash for a {0} bit hash. 
30 is the maximum possible.", hashBits); - Contracts.CheckNonWhiteSpace(input, nameof(input)); + Contracts.CheckNonWhiteSpace(output, nameof(output)); Input = input; Output = output ?? input; HashBits = hashBits; @@ -157,6 +157,8 @@ public ColumnInfo(string input, internal ColumnInfo(string input, string output, ModelLoadContext ctx) { + Contracts.CheckNonWhiteSpace(output, nameof(output)); + Input = input; Output = output; // *** Binary format *** @@ -386,8 +388,8 @@ private static IDataTransform Create(IHostEnvironment env, Arguments args, IData { var item = args.Column[i]; var kind = item.InvertHash ?? args.InvertHash; - cols[i] = new ColumnInfo(item.Source ?? item.Name, - item.Name, + cols[i] = new ColumnInfo(item.Name, + item.Source ?? item.Name, item.HashBits ?? args.HashBits, item.Seed ?? args.Seed, item.Ordered ?? args.Ordered, @@ -1215,16 +1217,17 @@ internal static bool IsColumnTypeValid(ColumnType type) /// Initializes a new instance of . /// /// Host Environment. - /// Name of the column to be transformed. + /// Name of the column to be transformed. + /// If set to the value specified for the will be used. /// Name of the output column. /// Number of bits to hash into. Must be between 1 and 31, inclusive. /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. 
- internal HashingEstimator(IHostEnvironment env, string inputColumn, string outputColumn, + internal HashingEstimator(IHostEnvironment env, string outputColumn, string inputColumn = null, int hashBits = Defaults.HashBits, int invertHash = Defaults.InvertHash) - : this(env, new HashingTransformer.ColumnInfo(inputColumn, outputColumn, hashBits: hashBits, invertHash: invertHash)) + : this(env, new HashingTransformer.ColumnInfo(outputColumn, inputColumn ?? outputColumn, hashBits: hashBits, invertHash: invertHash)) { } diff --git a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs index 9f58e6e82e..8375a86dc8 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs @@ -100,11 +100,11 @@ public sealed class ColumnInfo /// Name of input column. /// Name of the column resulting from the transformation of . Null means is replaced. /// Whether to combine multiple indicator vectors into a single bag vector instead of concatenating them. This is only relevant when the input column is a vector. - public ColumnInfo(string input, string output = null, bool bag = KeyToVectorMappingEstimator.Defaults.Bag) + public ColumnInfo(string output, string input = null, bool bag = KeyToVectorMappingEstimator.Defaults.Bag) { - Contracts.CheckNonWhiteSpace(input, nameof(input)); - Input = input; - Output = output ?? input; + Contracts.CheckNonWhiteSpace(output, nameof(output)); + Input = input ?? output; + Output = output; Bag = bag; } } @@ -202,7 +202,7 @@ private KeyToVectorMappingTransformer(IHost host, ModelLoadContext ctx) _columns = new ColumnInfo[columnsLength]; for (int i = 0; i < columnsLength; i++) - _columns[i] = new ColumnInfo(ColumnPairs[i].input, ColumnPairs[i].output, bags[i]); + _columns[i] = new ColumnInfo(ColumnPairs[i].output, ColumnPairs[i].input, bags[i]); } // Factory method for SignatureDataTransform. 
@@ -218,8 +218,9 @@ private static IDataTransform Create(IHostEnvironment env, Arguments args, IData { var item = args.Column[i]; - cols[i] = new ColumnInfo(item.Source ?? item.Name, + cols[i] = new ColumnInfo( item.Name, + item.Source ?? item.Name, item.Bag ?? args.Bag); }; return new KeyToVectorMappingTransformer(env, cols).MakeDataTransform(input); @@ -749,8 +750,8 @@ internal KeyToVectorMappingEstimator(IHostEnvironment env, params KeyToVectorMap { } - internal KeyToVectorMappingEstimator(IHostEnvironment env, string outputColumn, string inputColumn, bool bag = Defaults.Bag) - : this(env, new KeyToVectorMappingTransformer(env, new KeyToVectorMappingTransformer.ColumnInfo(inputColumn, outputColumn, bag))) + internal KeyToVectorMappingEstimator(IHostEnvironment env, string outputColumn, string inputColumn = null, bool bag = Defaults.Bag) + : this(env, new KeyToVectorMappingTransformer(env, new KeyToVectorMappingTransformer.ColumnInfo(outputColumn, inputColumn ?? outputColumn, bag))) { } diff --git a/src/Microsoft.ML.Data/Transforms/TypeConverting.cs b/src/Microsoft.ML.Data/Transforms/TypeConverting.cs index 06efae4e0b..6b8a149cf4 100644 --- a/src/Microsoft.ML.Data/Transforms/TypeConverting.cs +++ b/src/Microsoft.ML.Data/Transforms/TypeConverting.cs @@ -185,8 +185,10 @@ public sealed class ColumnInfo /// Name of output column. /// The expected kind of the converted column. /// New key range, if we work with key type. - public ColumnInfo(string input, string output, DataKind outputKind, KeyRange outputKeyRange = null) + public ColumnInfo(string output, string input, DataKind outputKind, KeyRange outputKeyRange = null) { + Contracts.CheckNonWhiteSpace(output, nameof(output)); + Input = input; Output = output; OutputKind = outputKind; @@ -211,7 +213,7 @@ private static (string input, string output)[] GetColumnPairs(ColumnInfo[] colum /// The expected type of the converted column. /// New key range if we work with key type. 
public TypeConvertingTransformer(IHostEnvironment env, string inputColumn, string outputColumn, DataKind outputKind, KeyRange outputKeyRange = null) - : this(env, new ColumnInfo(inputColumn, outputColumn, outputKind, outputKeyRange)) + : this(env, new ColumnInfo(outputColumn, inputColumn, outputKind, outputKeyRange)) { } @@ -296,7 +298,7 @@ private TypeConvertingTransformer(IHost host, ModelLoadContext ctx) range.Max = count; range.Contiguous = ctx.Reader.ReadBoolByte(); } - _columns[i] = new ColumnInfo(ColumnPairs[i].input, ColumnPairs[i].output, kind, range); + _columns[i] = new ColumnInfo(ColumnPairs[i].output, ColumnPairs[i].input, kind, range); } } @@ -344,7 +346,7 @@ internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDat { kind = tempResultType.Value; } - cols[i] = new ColumnInfo(item.Source ?? item.Name, item.Name, kind, range); + cols[i] = new ColumnInfo(item.Name, item.Source ?? item.Name, kind, range); }; return new TypeConvertingTransformer(env, cols).MakeDataTransform(input); } @@ -535,13 +537,14 @@ internal sealed class Defaults /// Convinence constructor for simple one column case. /// /// Host Environment. - /// Name of the input column. + /// Name of the input column. If set to , the value of the + /// will be used as input. /// Name of the output column. /// The expected type of the converted column. internal TypeConvertingEstimator(IHostEnvironment env, - string outputColumn, string inputColumn, + string outputColumn, string inputColumn = null, DataKind outputKind = Defaults.DefaultOutputKind) - : this(env, new TypeConvertingTransformer.ColumnInfo(inputColumn, outputColumn, outputKind)) + : this(env, new TypeConvertingTransformer.ColumnInfo(outputColumn, inputColumn ?? 
outputColumn, outputKind)) { } diff --git a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs index d6aa366d43..0be5b3d66a 100644 --- a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs +++ b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs @@ -32,8 +32,8 @@ public static class Defaults /// Maximum number of keys to keep per column when auto-training. /// How items should be ordered when vectorized. If choosen they will be in the order encountered. /// If , items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). - internal ValueToKeyMappingEstimator(IHostEnvironment env, string outputColumn, string inputColumn, int maxNumTerms = Defaults.MaxNumTerms, ValueToKeyMappingTransformer.SortOrder sort = Defaults.Sort) : - this(env, new [] { new ValueToKeyMappingTransformer.ColumnInfo(inputColumn, outputColumn, maxNumTerms, sort) }) + internal ValueToKeyMappingEstimator(IHostEnvironment env, string outputColumn, string inputColumn = null, int maxNumTerms = Defaults.MaxNumTerms, ValueToKeyMappingTransformer.SortOrder sort = Defaults.Sort) : + this(env, new [] { new ValueToKeyMappingTransformer.ColumnInfo(inputColumn ?? 
outputColumn, outputColumn, maxNumTerms, sort) }) { } diff --git a/src/Microsoft.ML.StaticPipe/TransformsStatic.cs b/src/Microsoft.ML.StaticPipe/TransformsStatic.cs index 9cd4091fda..3b40dc2410 100644 --- a/src/Microsoft.ML.StaticPipe/TransformsStatic.cs +++ b/src/Microsoft.ML.StaticPipe/TransformsStatic.cs @@ -582,7 +582,7 @@ public override IEstimator Reconcile(IHostEnvironment env, for (int i = 0; i < toOutput.Length; ++i) { var col = (IColInput)toOutput[i]; - infos[i] = new KeyToVectorMappingTransformer.ColumnInfo(inputNames[col.Input], outputNames[toOutput[i]], col.Bag); + infos[i] = new KeyToVectorMappingTransformer.ColumnInfo(outputNames[toOutput[i]], inputNames[col.Input], col.Bag); } return new KeyToVectorMappingEstimator(env, infos); } @@ -937,7 +937,7 @@ public override IEstimator Reconcile(IHostEnvironment env, Pipelin for (int i = 0; i < toOutput.Length; ++i) { var tcol = (IConvertCol)toOutput[i]; - infos[i] = new TypeConvertingTransformer.ColumnInfo(inputNames[tcol.Input], outputNames[toOutput[i]], tcol.Kind); + infos[i] = new TypeConvertingTransformer.ColumnInfo(outputNames[toOutput[i]], inputNames[tcol.Input], tcol.Kind); } return new TypeConvertingEstimator(env, infos); } diff --git a/src/Microsoft.ML.Transforms/OneHotEncoding.cs b/src/Microsoft.ML.Transforms/OneHotEncoding.cs index faf682da4c..74d99856b0 100644 --- a/src/Microsoft.ML.Transforms/OneHotEncoding.cs +++ b/src/Microsoft.ML.Transforms/OneHotEncoding.cs @@ -255,7 +255,7 @@ public OneHotEncodingEstimator(IHostEnvironment env, ColumnInfo[] columns, if (binaryCols.Count > 0) toBinVector = new KeyToBinaryVectorMappingEstimator(_host, binaryCols.Select(x => new KeyToBinaryVectorMappingTransformer.ColumnInfo(x.input, x.output)).ToArray()); if (cols.Count > 0) - toVector = new KeyToVectorMappingEstimator(_host, cols.Select(x => new KeyToVectorMappingTransformer.ColumnInfo(x.input, x.output, x.bag)).ToArray()); + toVector = new KeyToVectorMappingEstimator(_host, cols.Select(x => new
KeyToVectorMappingTransformer.ColumnInfo(x.output, x.input, x.bag)).ToArray()); if (toBinVector != null && toVector != null) _toSomething = toVector.Append(toBinVector); diff --git a/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs b/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs index 05478790d9..dbb080a363 100644 --- a/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs +++ b/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs @@ -234,7 +234,7 @@ public ColumnInfo(string input, string output, bool ordered = Defaults.Ordered, int invertHash = Defaults.InvertHash) { - HashInfo = new HashingTransformer.ColumnInfo(input, output, hashBits, seed, ordered, invertHash); + HashInfo = new HashingTransformer.ColumnInfo(output, input, hashBits, seed, ordered, invertHash); OutputKind = outputKind; } } @@ -300,7 +300,7 @@ public OneHotHashEncodingEstimator(IHostEnvironment env, params ColumnInfo[] col if (binaryCols.Count > 0) toBinVector = new KeyToBinaryVectorMappingEstimator(_host, binaryCols.Select(x => new KeyToBinaryVectorMappingTransformer.ColumnInfo(x.input, x.output)).ToArray()); if (cols.Count > 0) - toVector = new KeyToVectorMappingEstimator(_host, cols.Select(x => new KeyToVectorMappingTransformer.ColumnInfo(x.input, x.output, x.bag)).ToArray()); + toVector = new KeyToVectorMappingEstimator(_host, cols.Select(x => new KeyToVectorMappingTransformer.ColumnInfo(x.output, x.input, x.bag)).ToArray()); if (toBinVector != null && toVector != null) _toSomething = toVector.Append(toBinVector); diff --git a/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs b/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs index e911dbbafe..c644dd02a6 100644 --- a/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs @@ -359,8 +359,8 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV }); } - hashColumns.Add(new 
HashingTransformer.ColumnInfo(termLoaderArgs == null ? column.Source[isrc] : tmpName, - tmpName, 30, column.Seed ?? args.Seed, false, column.InvertHash ?? args.InvertHash)); + hashColumns.Add(new HashingTransformer.ColumnInfo(tmpName, termLoaderArgs == null ? column.Source[isrc] : tmpName, + 30, column.Seed ?? args.Seed, false, column.InvertHash ?? args.InvertHash)); } ngramHashColumns[iinfo] = diff --git a/test/Microsoft.ML.Benchmarks/HashBench.cs b/test/Microsoft.ML.Benchmarks/HashBench.cs index ca7f70baca..5a9a6f031e 100644 --- a/test/Microsoft.ML.Benchmarks/HashBench.cs +++ b/test/Microsoft.ML.Benchmarks/HashBench.cs @@ -72,7 +72,7 @@ private void InitMap(T val, ColumnType type, int hashBits = 20, ValueGetter dst = val; _inRow = RowImpl.Create(type, getter); // One million features is a nice, typical number. - var info = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: hashBits); + var info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: hashBits); var xf = new HashingTransformer(_env, new[] { info }); var mapper = xf.GetRowToRowMapper(_inRow.Schema); var column = mapper.OutputSchema["Bar"]; diff --git a/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs b/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs index e2a23374b4..eb2a44d31d 100644 --- a/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs @@ -73,8 +73,8 @@ public void TestConvertWorkout() var data = new[] { new TestClass() { A = 1, B = new int[2] { 1,4 } }, new TestClass() { A = 2, B = new int[2] { 3,4 } }}; var dataView = ComponentCreation.CreateDataView(Env, data); - var pipe = new TypeConvertingEstimator(Env, columns: new[] {new TypeConvertingTransformer.ColumnInfo("A", "ConvA", DataKind.R4), - new TypeConvertingTransformer.ColumnInfo("B", "ConvB", DataKind.R4)}); + var pipe = new TypeConvertingEstimator(Env, columns: new[] {new TypeConvertingTransformer.ColumnInfo("ConvA", "A", DataKind.R4), + new 
TypeConvertingTransformer.ColumnInfo("ConvB", "B", DataKind.R4)}); TestEstimatorCore(pipe, dataView); var allTypesData = new[] @@ -113,18 +113,18 @@ public void TestConvertWorkout() var allTypesDataView = ComponentCreation.CreateDataView(Env, allTypesData); var allTypesPipe = new TypeConvertingEstimator(Env, columns: new[] { - new TypeConvertingTransformer.ColumnInfo("AA", "ConvA", DataKind.R4), - new TypeConvertingTransformer.ColumnInfo("AB", "ConvB", DataKind.R4), - new TypeConvertingTransformer.ColumnInfo("AC", "ConvC", DataKind.R4), - new TypeConvertingTransformer.ColumnInfo("AD", "ConvD", DataKind.R4), - new TypeConvertingTransformer.ColumnInfo("AE", "ConvE", DataKind.R4), - new TypeConvertingTransformer.ColumnInfo("AF", "ConvF", DataKind.R4), - new TypeConvertingTransformer.ColumnInfo("AG", "ConvG", DataKind.R4), - new TypeConvertingTransformer.ColumnInfo("AH", "ConvH", DataKind.R4), - new TypeConvertingTransformer.ColumnInfo("AK", "ConvK", DataKind.R4), - new TypeConvertingTransformer.ColumnInfo("AL", "ConvL", DataKind.R4), - new TypeConvertingTransformer.ColumnInfo("AM", "ConvM", DataKind.R4), - new TypeConvertingTransformer.ColumnInfo("AN", "ConvN", DataKind.R4)} + new TypeConvertingTransformer.ColumnInfo("ConvA", "AA", DataKind.R4), + new TypeConvertingTransformer.ColumnInfo("ConvB", "AB", DataKind.R4), + new TypeConvertingTransformer.ColumnInfo("ConvC", "AC", DataKind.R4), + new TypeConvertingTransformer.ColumnInfo("ConvD", "AD", DataKind.R4), + new TypeConvertingTransformer.ColumnInfo("ConvE", "AE", DataKind.R4), + new TypeConvertingTransformer.ColumnInfo("ConvF", "AF", DataKind.R4), + new TypeConvertingTransformer.ColumnInfo("ConvG", "AG", DataKind.R4), + new TypeConvertingTransformer.ColumnInfo("ConvH", "AH", DataKind.R4), + new TypeConvertingTransformer.ColumnInfo("ConvK", "AK", DataKind.R4), + new TypeConvertingTransformer.ColumnInfo("ConvL", "AL", DataKind.R4), + new TypeConvertingTransformer.ColumnInfo("ConvM", "AM", DataKind.R4), + new 
TypeConvertingTransformer.ColumnInfo("ConvN", "AN", DataKind.R4)} ); TestEstimatorCore(allTypesPipe, allTypesDataView); @@ -153,8 +153,8 @@ public void TestOldSavingAndLoading() var data = new[] { new TestClass() { A = 1, B = new int[2] { 1,4 } }, new TestClass() { A = 2, B = new int[2] { 3,4 } }}; var dataView = ComponentCreation.CreateDataView(Env, data); - var pipe = new TypeConvertingEstimator(Env, columns: new[] {new TypeConvertingTransformer.ColumnInfo("A", "ConvA", DataKind.R8), - new TypeConvertingTransformer.ColumnInfo("B", "ConvB", DataKind.R8)}); + var pipe = new TypeConvertingEstimator(Env, columns: new[] {new TypeConvertingTransformer.ColumnInfo("ConvA", "A", DataKind.R8), + new TypeConvertingTransformer.ColumnInfo("ConvB", "B", DataKind.R8)}); var result = pipe.Fit(dataView).Transform(dataView); var resultRoles = new RoleMappedData(result); @@ -175,8 +175,8 @@ public void TestMetadata() new OneHotEncodingEstimator.ColumnInfo("A", "CatA", OneHotEncodingTransformer.OutputKind.Ind), new OneHotEncodingEstimator.ColumnInfo("B", "CatB", OneHotEncodingTransformer.OutputKind.Key) }).Append(new TypeConvertingEstimator(Env, new[] { - new TypeConvertingTransformer.ColumnInfo("CatA", "ConvA", DataKind.R8), - new TypeConvertingTransformer.ColumnInfo("CatB", "ConvB", DataKind.U2) + new TypeConvertingTransformer.ColumnInfo("ConvA", "CatA", DataKind.R8), + new TypeConvertingTransformer.ColumnInfo("ConvB", "CatB", DataKind.U2) })); var dataView = ComponentCreation.CreateDataView(Env, data); dataView = pipe.Fit(dataView).Transform(dataView); diff --git a/test/Microsoft.ML.Tests/Transformers/HashTests.cs b/test/Microsoft.ML.Tests/Transformers/HashTests.cs index 6104b92fca..04a6a15af1 100644 --- a/test/Microsoft.ML.Tests/Transformers/HashTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/HashTests.cs @@ -46,10 +46,10 @@ public void HashWorkout() var dataView = ComponentCreation.CreateDataView(Env, data); var pipe = new HashingEstimator(Env, new[]{ - new 
HashingTransformer.ColumnInfo("A", "HashA", hashBits:4, invertHash:-1), - new HashingTransformer.ColumnInfo("B", "HashB", hashBits:3, ordered:true), - new HashingTransformer.ColumnInfo("C", "HashC", seed:42), - new HashingTransformer.ColumnInfo("A", "HashD"), + new HashingTransformer.ColumnInfo("HashA", "A", hashBits:4, invertHash:-1), + new HashingTransformer.ColumnInfo("HashB", "B", hashBits:3, ordered:true), + new HashingTransformer.ColumnInfo("HashC", "C", seed:42), + new HashingTransformer.ColumnInfo("HashD", "A"), }); TestEstimatorCore(pipe, dataView); @@ -68,9 +68,9 @@ public void TestMetadata() var dataView = ComponentCreation.CreateDataView(Env, data); var pipe = new HashingEstimator(Env, new[] { - new HashingTransformer.ColumnInfo("A", "HashA", invertHash:1, hashBits:10), - new HashingTransformer.ColumnInfo("A", "HashAUnlim", invertHash:-1, hashBits:10), - new HashingTransformer.ColumnInfo("A", "HashAUnlimOrdered", invertHash:-1, hashBits:10, ordered:true) + new HashingTransformer.ColumnInfo("HashA", "A", invertHash:1, hashBits:10), + new HashingTransformer.ColumnInfo("HashAUnlim", "A", invertHash:-1, hashBits:10), + new HashingTransformer.ColumnInfo("HashAUnlimOrdered", "A", invertHash:-1, hashBits:10, ordered:true) }); var result = pipe.Fit(dataView).Transform(dataView); ValidateMetadata(result); @@ -108,10 +108,10 @@ public void TestOldSavingAndLoading() var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } }; var dataView = ComponentCreation.CreateDataView(Env, data); var pipe = new HashingEstimator(Env, new[]{ - new HashingTransformer.ColumnInfo("A", "HashA", hashBits:4, invertHash:-1), - new HashingTransformer.ColumnInfo("B", "HashB", hashBits:3, ordered:true), - new HashingTransformer.ColumnInfo("C", "HashC", seed:42), - new HashingTransformer.ColumnInfo("A", "HashD"), + new HashingTransformer.ColumnInfo("HashA", "A", hashBits:4, invertHash:-1), + new HashingTransformer.ColumnInfo("HashB", "B", 
hashBits:3, ordered:true), + new HashingTransformer.ColumnInfo("HashC", "C", seed:42), + new HashingTransformer.ColumnInfo("HashD", "A"), }); var result = pipe.Fit(dataView).Transform(dataView); var resultRoles = new RoleMappedData(result); @@ -132,7 +132,7 @@ private void HashTestCore(T val, PrimitiveType type, uint expected, uint expe var inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata()); // First do an unordered hash. - var info = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits); + var info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits); var xf = new HashingTransformer(Env, new[] { info }); var mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out int outCol); @@ -144,7 +144,7 @@ private void HashTestCore(T val, PrimitiveType type, uint expected, uint expe Assert.Equal(expected, result); // Next do an ordered hash. - info = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: true); + info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); @@ -162,7 +162,7 @@ private void HashTestCore(T val, PrimitiveType type, uint expected, uint expe builder.Add("Foo", new VectorType(type, vecLen), (ref VBuffer dst) => denseVec.CopyTo(ref dst)); inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata()); - info = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: false); + info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: false); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); @@ -177,7 +177,7 @@ private void HashTestCore(T val, PrimitiveType type, uint expected, uint expe Assert.All(vecResult.DenseValues(), v => Assert.Equal(expected, v)); 
// Now do ordered with the dense vector. - info = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: true); + info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); @@ -196,7 +196,7 @@ private void HashTestCore(T val, PrimitiveType type, uint expected, uint expe builder.Add("Foo", new VectorType(type, vecLen), (ref VBuffer dst) => sparseVec.CopyTo(ref dst)); inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata()); - info = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: false); + info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: false); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); @@ -209,7 +209,7 @@ private void HashTestCore(T val, PrimitiveType type, uint expected, uint expe Assert.Equal(expected, vecResult.GetItemOrDefault(3)); Assert.Equal(expected, vecResult.GetItemOrDefault(7)); - info = new HashingTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: true); + info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); diff --git a/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs b/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs index 967231f604..2a228129f3 100644 --- a/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs @@ -57,10 +57,10 @@ public void KeyToVectorWorkout() new ValueToKeyMappingTransformer.ColumnInfo("C", "TermC", textKeyValues:true) }).Fit(dataView).Transform(dataView); - 
var pipe = new KeyToVectorMappingEstimator(Env, new KeyToVectorMappingTransformer.ColumnInfo("TermA", "CatA", false), - new KeyToVectorMappingTransformer.ColumnInfo("TermB", "CatB", true), - new KeyToVectorMappingTransformer.ColumnInfo("TermC", "CatC", true), - new KeyToVectorMappingTransformer.ColumnInfo("TermC", "CatCNonBag", false)); + var pipe = new KeyToVectorMappingEstimator(Env, new KeyToVectorMappingTransformer.ColumnInfo("CatA", "TermA", false), + new KeyToVectorMappingTransformer.ColumnInfo("CatB", "TermB", true), + new KeyToVectorMappingTransformer.ColumnInfo("CatC", "TermC", true), + new KeyToVectorMappingTransformer.ColumnInfo("CatCNonBag", "TermC", false)); TestEstimatorCore(pipe, dataView); Done(); } @@ -121,14 +121,14 @@ public void TestMetadataPropagation() dataView = termTransformer.Transform(dataView); var pipe = new KeyToVectorMappingEstimator(Env, - new KeyToVectorMappingTransformer.ColumnInfo("TA", "CatA", true), - new KeyToVectorMappingTransformer.ColumnInfo("TB", "CatB", false), - new KeyToVectorMappingTransformer.ColumnInfo("TC", "CatC", false), - new KeyToVectorMappingTransformer.ColumnInfo("TD", "CatD", true), - new KeyToVectorMappingTransformer.ColumnInfo("TE", "CatE", false), - new KeyToVectorMappingTransformer.ColumnInfo("TF", "CatF", true), - new KeyToVectorMappingTransformer.ColumnInfo("TG", "CatG", true), - new KeyToVectorMappingTransformer.ColumnInfo("TH", "CatH", false) + new KeyToVectorMappingTransformer.ColumnInfo("CatA", "TA", true), + new KeyToVectorMappingTransformer.ColumnInfo("CatB", "TB", false), + new KeyToVectorMappingTransformer.ColumnInfo("CatC", "TC", false), + new KeyToVectorMappingTransformer.ColumnInfo("CatD", "TD", true), + new KeyToVectorMappingTransformer.ColumnInfo("CatE", "TE", false), + new KeyToVectorMappingTransformer.ColumnInfo("CatF", "TF", true), + new KeyToVectorMappingTransformer.ColumnInfo("CatG", "TG", true), + new KeyToVectorMappingTransformer.ColumnInfo("CatH", "TH", false) ); var result = 
pipe.Fit(dataView).Transform(dataView); @@ -221,8 +221,8 @@ public void TestOldSavingAndLoading() var transformer = est.Fit(dataView); dataView = transformer.Transform(dataView); var pipe = new KeyToVectorMappingEstimator(Env, - new KeyToVectorMappingTransformer.ColumnInfo("TermA", "CatA", false), - new KeyToVectorMappingTransformer.ColumnInfo("TermB", "CatB", true) + new KeyToVectorMappingTransformer.ColumnInfo("CatA", "TermA", false), + new KeyToVectorMappingTransformer.ColumnInfo("CatB", "TermB", true) ); var result = pipe.Fit(dataView).Transform(dataView); var resultRoles = new RoleMappedData(result); From 88ff2840516435cd9b9af6e639ec8ea7b0b4e35a Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Mon, 14 Jan 2019 00:05:54 -0800 Subject: [PATCH 3/6] Fixing the Ngram workout --- test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs index 432bbb8291..06743db102 100644 --- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs @@ -249,7 +249,7 @@ public void NgramWorkout() .Read(sentimentDataPath); var est = new WordTokenizingEstimator(Env, "text", "text") - .Append(new ValueToKeyMappingEstimator(Env, "text", "terms")) + .Append(new ValueToKeyMappingEstimator(Env, "terms", "text")) .Append(new NgramExtractingEstimator(Env, "terms", "ngrams")) .Append(new NgramHashingEstimator(Env, "terms", "ngramshash")); From 8a3cd833068871d7bcf55c2bc5c6897a7910b6da Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Mon, 14 Jan 2019 13:16:59 -0800 Subject: [PATCH 4/6] Fixes identified by tests --- src/Microsoft.ML.EntryPoints/FeatureCombiner.cs | 4 ++-- src/Microsoft.ML.StaticPipe/TransformsStatic.cs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs 
b/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs index 1f0fe94f1b..ed6f282b2f 100644 --- a/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs +++ b/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs @@ -173,7 +173,7 @@ private static IDataView ApplyConvert(List { var colName = GetUniqueName(); concatNames.Add(new KeyValuePair(col.Name, colName)); - Utils.Add(ref ktv, new KeyToVectorMappingTransformer.ColumnInfo(col.Name, colName)); + Utils.Add(ref ktv, new KeyToVectorMappingTransformer.ColumnInfo(colName, col.Name)); continue; } } @@ -184,7 +184,7 @@ private static IDataView ApplyConvert(List // This happens when the training is done on an XDF and the scoring is done on a data frame. var colName = GetUniqueName(); concatNames.Add(new KeyValuePair(col.Name, colName)); - Utils.Add(ref cvt, new TypeConvertingTransformer.ColumnInfo(col.Name, colName, DataKind.R4)); + Utils.Add(ref cvt, new TypeConvertingTransformer.ColumnInfo(colName, col.Name, DataKind.R4)); continue; } } diff --git a/src/Microsoft.ML.StaticPipe/TransformsStatic.cs b/src/Microsoft.ML.StaticPipe/TransformsStatic.cs index 3b40dc2410..aed8dc609a 100644 --- a/src/Microsoft.ML.StaticPipe/TransformsStatic.cs +++ b/src/Microsoft.ML.StaticPipe/TransformsStatic.cs @@ -937,7 +937,7 @@ public override IEstimator Reconcile(IHostEnvironment env, Pipelin for (int i = 0; i < toOutput.Length; ++i) { var tcol = (IConvertCol)toOutput[i]; - infos[i] = new TypeConvertingTransformer.ColumnInfo(outputNames[tcol.Input], inputNames[toOutput[i]], tcol.Kind); + infos[i] = new TypeConvertingTransformer.ColumnInfo(outputNames[toOutput[i]], inputNames[tcol.Input], tcol.Kind); } return new TypeConvertingEstimator(env, infos); } From e27b45340a58a9625e9d9e838053647275fa52b4 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Tue, 15 Jan 2019 10:29:55 -0800 Subject: [PATCH 5/6] renamed outputColumn -> name and inputColumn -> source --- .../ConversionsExtensionsCatalog.cs | 40 +++++++-------- src/Microsoft.ML.Data/Transforms/Hashing.cs | 45 
++++++++--------- .../Transforms/KeyToVector.cs | 28 +++++------ .../Transforms/TypeConverting.cs | 50 +++++++++---------- .../Transforms/ValueToKeyMappingEstimator.cs | 8 +-- .../FeatureCombiner.cs | 6 +-- .../OneHotHashEncoding.cs | 6 +-- .../FeatureContributionTests.cs | 2 +- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 2 +- .../PermutationFeatureImportanceTests.cs | 4 +- .../CookbookSamplesDynamicApi.cs | 4 +- 11 files changed, 97 insertions(+), 98 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs index a1b41d1d38..974cf1c995 100644 --- a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs @@ -20,17 +20,17 @@ public static class ConversionsExtensionsCatalog /// Hashes the values in the input column. /// /// The transform's catalog. - /// Name of the column to be transformed. - /// Name of the input column. If set to , the value of the + /// Name of the column to be transformed. + /// Name of the input column. If set to , the value of the /// will be used as input. /// Number of bits to hash into. Must be between 1 and 31, inclusive. /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. 
- public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, string outputColumn, string inputColumn = null, + public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, string name, string source = null, int hashBits = HashDefaults.HashBits, int invertHash = HashDefaults.InvertHash) - => new HashingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumn, inputColumn ?? outputColumn, hashBits, invertHash); + => new HashingEstimator(CatalogUtils.GetEnvironment(catalog), name, source ?? name, hashBits, invertHash); /// /// Hashes the values in the input column. @@ -44,13 +44,13 @@ public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms /// Changes column type of the input column. /// /// The transform's catalog. - /// Name of the column to be transformed. - /// Name of the input column. If set to , the value of the + /// Name of the column to be transformed. If set to , the value of the /// will be used as input. + /// Name of the new column produced. /// Number of bits to hash into. Must be between 1 and 31, inclusive. - public static TypeConvertingEstimator ConvertType(this TransformsCatalog.ConversionTransforms catalog, string outputColumn, string inputColumn = null, + public static TypeConvertingEstimator ConvertType(this TransformsCatalog.ConversionTransforms catalog, string name, string source = null, DataKind outputKind = ConvertDefaults.DefaultOutputKind) - => new TypeConvertingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumn, inputColumn, outputKind); + => new TypeConvertingEstimator(CatalogUtils.GetEnvironment(catalog), name, source, outputKind); /// /// Changes column type of the input column. @@ -64,9 +64,9 @@ public static TypeConvertingEstimator ConvertType(this TransformsCatalog.Convers /// Convert the key types back to their original values. /// /// The categorical transform's catalog. - /// Name of the input column. 
- public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, string inputColumn) - => new KeyToValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), inputColumn); + /// Name of the input column. + public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.ConversionTransforms catalog, string source) + => new KeyToValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), source); /// /// Convert the key types (name of the column specified in the first item of the tuple) back to their original values @@ -90,30 +90,30 @@ public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog. /// Convert the key types back to their original vectors. /// /// The categorical transform's catalog. - /// The name of the input column. If set to , the value of the + /// Name of the column to be transformed. If set to , the value of the /// will be used as input. - /// The name of the output column. + /// Name of the new column produced. /// Whether bagging is used for the conversion. public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.ConversionTransforms catalog, - string outputColumn, string inputColumn = null, bool bag = KeyToVectorMappingEstimator.Defaults.Bag) - => new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumn, inputColumn ?? outputColumn, bag); + string name, string source = null, bool bag = KeyToVectorMappingEstimator.Defaults.Bag) + => new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), name, source ?? name, bag); /// /// Converts value types into . /// /// The categorical transform's catalog. - /// Name of the column to be transformed. If set to , the value of the + /// Name of the column to be transformed. If set to , the value of the /// will be used as input. - /// Name of the output column. + /// Name of the new column produced. /// Maximum number of keys to keep per column when auto-training. 
/// How items should be ordered when vectorized. If choosen they will be in the order encountered. /// If , items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog, - string outputColumn, - string inputColumn = null, + string name, + string source = null, int maxNumTerms = ValueToKeyMappingEstimator.Defaults.MaxNumTerms, ValueToKeyMappingTransformer.SortOrder sort = ValueToKeyMappingEstimator.Defaults.Sort) - => new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumn, inputColumn, maxNumTerms, sort); + => new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), name, source, maxNumTerms, sort); /// /// Maps specified keys to specified values diff --git a/src/Microsoft.ML.Data/Transforms/Hashing.cs b/src/Microsoft.ML.Data/Transforms/Hashing.cs index 402c8fb3f3..305db81f96 100644 --- a/src/Microsoft.ML.Data/Transforms/Hashing.cs +++ b/src/Microsoft.ML.Data/Transforms/Hashing.cs @@ -116,8 +116,8 @@ public bool TryUnparse(StringBuilder sb) public sealed class ColumnInfo { - public readonly string Input; - public readonly string Output; + public readonly string Source; + public readonly string Name; public readonly int HashBits; public readonly uint Seed; public readonly bool Ordered; @@ -126,8 +126,8 @@ public sealed class ColumnInfo /// /// Describes how the transformer handles one column pair. /// - /// Name of input column. - /// Name of the column resulting from the transformation of . Null means is replaced. + /// Name of column to work on. + /// Name of the column resulting from the transformation of . Null means is replaced. /// Number of bits to hash into. Must be between 1 and 31, inclusive. /// Hashing seed. /// Whether the position of each term should be included in the hash. 
@@ -135,8 +135,8 @@ public sealed class ColumnInfo /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public ColumnInfo(string output, - string input = null, + public ColumnInfo(string name, + string source = null, int hashBits = HashingEstimator.Defaults.HashBits, uint seed = HashingEstimator.Defaults.Seed, bool ordered = HashingEstimator.Defaults.Ordered, @@ -146,21 +146,21 @@ public ColumnInfo(string output, throw Contracts.ExceptParam(nameof(invertHash), "Value too small, must be -1 or larger"); if (invertHash != 0 && hashBits >= 31) throw Contracts.ExceptParam(nameof(hashBits), $"Cannot support invertHash for a {0} bit hash. 30 is the maximum possible.", hashBits); - Contracts.CheckNonWhiteSpace(output, nameof(output)); - Input = input; - Output = output ?? input; + Contracts.CheckNonWhiteSpace(name, nameof(name)); + Source = source; + Name = name ?? 
source; HashBits = hashBits; Seed = seed; Ordered = ordered; InvertHash = invertHash; } - internal ColumnInfo(string input, string output, ModelLoadContext ctx) + internal ColumnInfo(string source, string name, ModelLoadContext ctx) { - Contracts.CheckNonWhiteSpace(output, nameof(output)); + Contracts.CheckNonWhiteSpace(name, nameof(name)); - Input = input; - Output = output; + Source = source; + Name = name; // *** Binary format *** // int: HashBits // uint: HashSeed @@ -215,16 +215,16 @@ protected override void CheckInputColumn(Schema inputSchema, int col, int srcCol throw Host.ExceptParam(nameof(inputSchema), HashingEstimator.ExpectedColumnType); } - private static (string input, string output)[] GetColumnPairs(ColumnInfo[] columns) + private static (string source, string name)[] GetColumnPairs(ColumnInfo[] columns) { Contracts.CheckNonEmpty(columns, nameof(columns)); - return columns.Select(x => (x.Input, x.Output)).ToArray(); + return columns.Select(x => (x.Source, x.Name)).ToArray(); } private ColumnType GetOutputType(Schema inputSchema, ColumnInfo column) { var keyCount = column.HashBits < 31 ? 
1 << column.HashBits : 0; - inputSchema.TryGetColumnIndex(column.Input, out int srcCol); + inputSchema.TryGetColumnIndex(column.Source, out int srcCol); var itemType = new KeyType(DataKind.U4, 0, keyCount, keyCount > 0); var srcType = inputSchema[srcCol].Type; if (srcType is VectorType vectorType) @@ -317,7 +317,7 @@ private Delegate GetGetterCore(Row input, int iinfo, out Action disposer) Host.AssertValue(input); Host.Assert(0 <= iinfo && iinfo < _columns.Length); disposer = null; - input.Schema.TryGetColumnIndex(_columns[iinfo].Input, out int srcCol); + input.Schema.TryGetColumnIndex(_columns[iinfo].Source, out int srcCol); var srcType = input.Schema[srcCol].Type; if (!(srcType is VectorType vectorType)) return ComposeGetterOne(input, iinfo, srcCol, srcType); @@ -932,7 +932,7 @@ private InvertHashHelper(Row row, ColumnInfo ex) { Contracts.AssertValue(row); Row = row; - row.Schema.TryGetColumnIndex(ex.Input, out int srcCol); + row.Schema.TryGetColumnIndex(ex.Source, out int srcCol); _srcCol = srcCol; _srcType = row.Schema[srcCol].Type; _ex = ex; @@ -951,8 +951,7 @@ private InvertHashHelper(Row row, ColumnInfo ex) /// A hash getter, built on top of . 
public static InvertHashHelper Create(Row row, ColumnInfo ex, int invertHashMaxCount, Delegate dstGetter) { - row.Schema.TryGetColumnIndex(ex.Input, out int srcCol); - + row.Schema.TryGetColumnIndex(ex.Source, out int srcCol); ColumnType typeSrc = row.Schema[srcCol].Type; VectorType vectorTypeSrc = typeSrc as VectorType; @@ -1251,8 +1250,8 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) var result = inputSchema.ToDictionary(x => x.Name); foreach (var colInfo in _columns) { - if (!inputSchema.TryFindColumn(colInfo.Input, out var col)) - throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Input); + if (!inputSchema.TryFindColumn(colInfo.Source, out var col)) + throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Source); if (!IsColumnTypeValid(col.ItemType)) throw _host.ExceptParam(nameof(inputSchema), ExpectedColumnType); var metadata = new List(); @@ -1260,7 +1259,7 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) metadata.Add(slotMeta); if (colInfo.InvertHash != 0) metadata.Add(new SchemaShape.Column(MetadataUtils.Kinds.KeyValues, SchemaShape.Column.VectorKind.Vector, TextType.Instance, false)); - result[colInfo.Output] = new SchemaShape.Column(colInfo.Output, col.ItemType is VectorType ? SchemaShape.Column.VectorKind.Vector : SchemaShape.Column.VectorKind.Scalar, NumberType.U4, true, new SchemaShape(metadata)); + result[colInfo.Name] = new SchemaShape.Column(colInfo.Name, col.ItemType is VectorType ? 
SchemaShape.Column.VectorKind.Vector : SchemaShape.Column.VectorKind.Scalar, NumberType.U4, true, new SchemaShape(metadata)); } return new SchemaShape(result.Values); } diff --git a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs index 8375a86dc8..38b8a0033c 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs @@ -90,21 +90,21 @@ public sealed class Arguments /// public sealed class ColumnInfo { - public readonly string Input; - public readonly string Output; + public readonly string Source; + public readonly string Name; public readonly bool Bag; /// /// Describes how the transformer handles one column pair. /// - /// Name of input column. - /// Name of the column resulting from the transformation of . Null means is replaced. + /// Name of column to use. + /// Name of the column resulting from the transformation of . Null means is replaced. /// Whether to combine multiple indicator vectors into a single bag vector instead of concatenating them. This is only relevant when the input column is a vector. - public ColumnInfo(string output, string input = null, bool bag = KeyToVectorMappingEstimator.Defaults.Bag) + public ColumnInfo(string name, string source = null, bool bag = KeyToVectorMappingEstimator.Defaults.Bag) { - Contracts.CheckNonWhiteSpace(output, nameof(output)); - Input = input ?? output; - Output = output; + Contracts.CheckNonWhiteSpace(name, nameof(name)); + Source = source ?? 
name; + Name = name; Bag = bag; } } @@ -114,10 +114,10 @@ public ColumnInfo(string output, string input = null, bool bag = KeyToVectorMapp public IReadOnlyCollection Columns => _columns.AsReadOnly(); private readonly ColumnInfo[] _columns; - private static (string input, string output)[] GetColumnPairs(ColumnInfo[] columns) + private static (string source, string name)[] GetColumnPairs(ColumnInfo[] columns) { Contracts.CheckValue(columns, nameof(columns)); - return columns.Select(x => (x.Input, x.Output)).ToArray(); + return columns.Select(x => (x.Source, x.Name)).ToArray(); } private string TestIsKey(ColumnType type) @@ -766,10 +766,10 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema) var result = inputSchema.ToDictionary(x => x.Name); foreach (var colInfo in Transformer.Columns) { - if (!inputSchema.TryFindColumn(colInfo.Input, out var col)) - throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Input); + if (!inputSchema.TryFindColumn(colInfo.Source, out var col)) + throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Source); if ((col.ItemType.GetItemType().RawKind == default) || !(col.ItemType is VectorType || col.ItemType is PrimitiveType)) - throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Input); + throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Source); var metadata = new List(); if (col.Metadata.TryFindColumn(MetadataUtils.Kinds.KeyValues, out var keyMeta)) @@ -780,7 +780,7 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema) if (!colInfo.Bag || (col.Kind == SchemaShape.Column.VectorKind.Scalar)) metadata.Add(new SchemaShape.Column(MetadataUtils.Kinds.IsNormalized, SchemaShape.Column.VectorKind.Scalar, BoolType.Instance, false)); - result[colInfo.Output] = new SchemaShape.Column(colInfo.Output, SchemaShape.Column.VectorKind.Vector, NumberType.R4, false, new SchemaShape(metadata)); + result[colInfo.Name] = new 
SchemaShape.Column(colInfo.Name, SchemaShape.Column.VectorKind.Vector, NumberType.R4, false, new SchemaShape(metadata)); } return new SchemaShape(result.Values); diff --git a/src/Microsoft.ML.Data/Transforms/TypeConverting.cs b/src/Microsoft.ML.Data/Transforms/TypeConverting.cs index 6b8a149cf4..3b905b8014 100644 --- a/src/Microsoft.ML.Data/Transforms/TypeConverting.cs +++ b/src/Microsoft.ML.Data/Transforms/TypeConverting.cs @@ -173,24 +173,24 @@ private static VersionInfo GetVersionInfo() /// public sealed class ColumnInfo { - public readonly string Input; - public readonly string Output; + public readonly string Source; + public readonly string Name; public readonly DataKind OutputKind; public readonly KeyRange OutputKeyRange; /// /// Describes how the transformer handles one column pair. /// - /// Name of input column. - /// Name of output column. + /// Name of input column. + /// Name of output column. /// The expected kind of the converted column. /// New key range, if we work with key type. 
- public ColumnInfo(string output, string input, DataKind outputKind, KeyRange outputKeyRange = null) + public ColumnInfo(string name, string source, DataKind outputKind, KeyRange outputKeyRange = null) { - Contracts.CheckNonWhiteSpace(output, nameof(output)); + Contracts.CheckNonWhiteSpace(name, nameof(name)); - Input = input; - Output = output; + Source = source; + Name = name; OutputKind = outputKind; OutputKeyRange = outputKeyRange; } @@ -198,22 +198,22 @@ public ColumnInfo(string output, string input, DataKind outputKind, KeyRange out private readonly ColumnInfo[] _columns; - private static (string input, string output)[] GetColumnPairs(ColumnInfo[] columns) + private static (string source, string name)[] GetColumnPairs(ColumnInfo[] columns) { Contracts.CheckNonEmpty(columns, nameof(columns)); - return columns.Select(x => (x.Input, x.Output)).ToArray(); + return columns.Select(x => (x.Source, x.Name)).ToArray(); } /// /// Convinence constructor for simple one column case. /// /// Host Environment. - /// Name of the output column. - /// Name of the column to be transformed. If this is null '' will be used. + /// Name of the column produced. + /// Name of the column to be transformed. If this is null '' will be used. /// The expected type of the converted column. /// New key range if we work with key type. 
- public TypeConvertingTransformer(IHostEnvironment env, string inputColumn, string outputColumn, DataKind outputKind, KeyRange outputKeyRange = null) - : this(env, new ColumnInfo(outputColumn, inputColumn, outputKind, outputKeyRange)) + public TypeConvertingTransformer(IHostEnvironment env, string source, string name, DataKind outputKind, KeyRange outputKeyRange = null) + : this(env, new ColumnInfo(name, source, outputKind, outputKeyRange)) { } @@ -414,7 +414,7 @@ public Mapper(TypeConvertingTransformer parent, Schema inputSchema) { throw Host.ExceptParam(nameof(inputSchema), "source column '{0}' with item type '{1}' is not compatible with destination type '{2}'", - _parent._columns[i].Input, srcCol.Type, itemType); + _parent._columns[i].Source, srcCol.Type, itemType); } } } @@ -469,7 +469,7 @@ protected override Schema.DetachedColumn[] GetOutputColumnsCore() ValueGetter getter = (ref bool dst) => dst = true; builder.Add(MetadataUtils.Kinds.IsNormalized, BoolType.Instance, getter); } - result[i] = new Schema.DetachedColumn(_parent._columns[i].Output, _types[i], builder.GetMetadata()); + result[i] = new Schema.DetachedColumn(_parent._columns[i].Name, _types[i], builder.GetMetadata()); } return result; } @@ -490,17 +490,17 @@ public void SaveAsOnnx(OnnxContext ctx) for (int iinfo = 0; iinfo < _parent._columns.Length; ++iinfo) { - string sourceColumnName = _parent._columns[iinfo].Input; + string sourceColumnName = _parent._columns[iinfo].Source; if (!ctx.ContainsColumn(sourceColumnName)) { - ctx.RemoveColumn(_parent._columns[iinfo].Output, false); + ctx.RemoveColumn(_parent._columns[iinfo].Name, false); continue; } if (!SaveAsOnnxCore(ctx, iinfo, ctx.GetVariableName(sourceColumnName), - ctx.AddIntermediateVariable(_types[iinfo], _parent._columns[iinfo].Output))) + ctx.AddIntermediateVariable(_types[iinfo], _parent._columns[iinfo].Name))) { - ctx.RemoveColumn(_parent._columns[iinfo].Output, true); + ctx.RemoveColumn(_parent._columns[iinfo].Name, true); } } } @@ 
-562,12 +562,12 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema) var result = inputSchema.ToDictionary(x => x.Name); foreach (var colInfo in Transformer.Columns) { - if (!inputSchema.TryFindColumn(colInfo.Input, out var col)) - throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Input); + if (!inputSchema.TryFindColumn(colInfo.Source, out var col)) + throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", colInfo.Source); if (!TypeConvertingTransformer.GetNewType(Host, col.ItemType, colInfo.OutputKind, colInfo.OutputKeyRange, out PrimitiveType newType)) - throw Host.ExceptParam(nameof(inputSchema), $"Can't convert {colInfo.Input} into {newType.ToString()}"); + throw Host.ExceptParam(nameof(inputSchema), $"Can't convert {colInfo.Source} into {newType.ToString()}"); if (!Data.Conversion.Conversions.Instance.TryGetStandardConversion(col.ItemType, newType, out Delegate del, out bool identity)) - throw Host.ExceptParam(nameof(inputSchema), $"Don't know how to convert {colInfo.Input} into {newType.ToString()}"); + throw Host.ExceptParam(nameof(inputSchema), $"Don't know how to convert {colInfo.Source} into {newType.ToString()}"); var metadata = new List(); if (col.ItemType is BoolType && newType is NumberType) metadata.Add(new SchemaShape.Column(MetadataUtils.Kinds.IsNormalized, SchemaShape.Column.VectorKind.Scalar, BoolType.Instance, false)); @@ -580,7 +580,7 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema) if (col.Metadata.TryFindColumn(MetadataUtils.Kinds.IsNormalized, out var normMeta)) if (col.ItemType is NumberType && newType is NumberType) metadata.Add(new SchemaShape.Column(MetadataUtils.Kinds.KeyValues, SchemaShape.Column.VectorKind.Vector, normMeta.ItemType, false)); - result[colInfo.Output] = new SchemaShape.Column(colInfo.Output, col.Kind, newType, false, col.Metadata); + result[colInfo.Name] = new SchemaShape.Column(colInfo.Name, col.Kind, newType, false, col.Metadata); } return new 
SchemaShape(result.Values); } diff --git a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs index 0be5b3d66a..8d2305757e 100644 --- a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs +++ b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs @@ -27,13 +27,13 @@ public static class Defaults /// Initializes a new instance of . /// /// Host Environment. - /// Name of the column to be transformed. - /// Name of the output column. + /// Name of the column to be transformed. + /// Name of the column produced. /// Maximum number of keys to keep per column when auto-training. /// How items should be ordered when vectorized. If choosen they will be in the order encountered. /// If , items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). - internal ValueToKeyMappingEstimator(IHostEnvironment env, string outputColumn, string inputColumn = null, int maxNumTerms = Defaults.MaxNumTerms, ValueToKeyMappingTransformer.SortOrder sort = Defaults.Sort) : - this(env, new [] { new ValueToKeyMappingTransformer.ColumnInfo(inputColumn ?? outputColumn, outputColumn, maxNumTerms, sort) }) + internal ValueToKeyMappingEstimator(IHostEnvironment env, string name, string source = null, int maxNumTerms = Defaults.MaxNumTerms, ValueToKeyMappingTransformer.SortOrder sort = Defaults.Sort) : + this(env, new [] { new ValueToKeyMappingTransformer.ColumnInfo(source ?? 
name, name, maxNumTerms, sort) }) { } diff --git a/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs b/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs index ed6f282b2f..91b65e2779 100644 --- a/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs +++ b/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs @@ -94,19 +94,19 @@ private static IDataView ApplyKeyToVec(List (x.Input, x.Output)).ToArray()) + viewTrain = new KeyToValueMappingTransformer(host, ktv.Select(x => (x.Source, x.Name)).ToArray()) .Transform(viewTrain); viewTrain = ValueToKeyMappingTransformer.Create(host, new ValueToKeyMappingTransformer.Arguments() { Column = ktv - .Select(c => new ValueToKeyMappingTransformer.Column() { Name = c.Output, Source = c.Output, Terms = GetTerms(viewTrain, c.Input) }) + .Select(c => new ValueToKeyMappingTransformer.Column() { Name = c.Name, Source = c.Name, Terms = GetTerms(viewTrain, c.Source) }) .ToArray(), TextKeyValues = true }, viewTrain); - viewTrain = new KeyToVectorMappingTransformer(host, ktv.Select(c => new KeyToVectorMappingTransformer.ColumnInfo(c.Output, c.Output)).ToArray()).Transform(viewTrain); + viewTrain = new KeyToVectorMappingTransformer(host, ktv.Select(c => new KeyToVectorMappingTransformer.ColumnInfo(c.Name, c.Name)).ToArray()).Transform(viewTrain); } return viewTrain; } diff --git a/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs b/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs index dbb080a363..d4bbb764d4 100644 --- a/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs +++ b/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs @@ -285,13 +285,13 @@ public OneHotHashEncodingEstimator(IHostEnvironment env, params ColumnInfo[] col case OneHotEncodingTransformer.OutputKind.Bin: if ((column.HashInfo.InvertHash) != 0) ch.Warning("Invert hashing is being used with binary encoding."); - binaryCols.Add((column.HashInfo.Output, column.HashInfo.Output)); + binaryCols.Add((column.HashInfo.Name, column.HashInfo.Name)); break; case 
OneHotEncodingTransformer.OutputKind.Ind: - cols.Add((column.HashInfo.Output, column.HashInfo.Output, false)); + cols.Add((column.HashInfo.Name, column.HashInfo.Name, false)); break; case OneHotEncodingTransformer.OutputKind.Bag: - cols.Add((column.HashInfo.Output, column.HashInfo.Output, true)); + cols.Add((column.HashInfo.Name, column.HashInfo.Name, true)); break; } } diff --git a/test/Microsoft.ML.Tests/FeatureContributionTests.cs b/test/Microsoft.ML.Tests/FeatureContributionTests.cs index cfcd46f9f0..e41cf6954d 100644 --- a/test/Microsoft.ML.Tests/FeatureContributionTests.cs +++ b/test/Microsoft.ML.Tests/FeatureContributionTests.cs @@ -269,7 +269,7 @@ private IDataView GetSparseDataset(TaskType task = TaskType.Regression, int numb // Create a keytype for Ranking if (task == TaskType.Ranking) - return pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId", "GroupId")) + return pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId")) .Fit(srcDV).Transform(srcDV); return pipeline.Fit(srcDV).Transform(srcDV); diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 2e7df19194..5133a58add 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -371,7 +371,7 @@ public void MulticlassLogisticRegressionOnnxConversionTest() separatorChar: '\t'); var pipeline = mlContext.Transforms.Normalize("Features"). - Append(mlContext.Transforms.Conversion.MapValueToKey("Label", "Label")). + Append(mlContext.Transforms.Conversion.MapValueToKey("Label")). 
Append(mlContext.MulticlassClassification.Trainers.LogisticRegression(labelColumn: "Label", featureColumn: "Features", advancedSettings: settings => { diff --git a/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs b/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs index df1690aa3c..118145b3b7 100644 --- a/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs +++ b/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs @@ -420,7 +420,7 @@ private IDataView GetDenseDataset(TaskType task = TaskType.Regression) // Create a keytype for Ranking if (task == TaskType.Ranking) - return pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId", "GroupId")) + return pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId")) .Fit(srcDV).Transform(srcDV); return pipeline.Fit(srcDV).Transform(srcDV); @@ -496,7 +496,7 @@ private IDataView GetSparseDataset(TaskType task = TaskType.Regression) // Create a keytype for Ranking if (task == TaskType.Ranking) - return pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId", "GroupId")) + return pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId")) .Fit(srcDV).Transform(srcDV); return pipeline.Fit(srcDV).Transform(srcDV); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index 286b758127..2d00f36957 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -160,7 +160,7 @@ private ITransformer TrainOnIris(string irisDataPath) // Concatenate all the features together into one column 'Features'. mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") // Note that the label is text, so it needs to be converted to key. 
- .Append(mlContext.Transforms.Conversion.MapValueToKey("Label", "Label"), TransformerScope.TrainTest) + .Append(mlContext.Transforms.Conversion.MapValueToKey("Label"), TransformerScope.TrainTest) // Cache data in memory for steps after the cache check point stage. .AppendCacheCheckpoint(mlContext) // Use the multi-class SDCA model to predict the label using features. @@ -389,7 +389,7 @@ private void CrossValidationOn(string dataPath) // Concatenate all the features together into one column 'Features'. mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") // Note that the label is text, so it needs to be converted to key. - .Append(mlContext.Transforms.Conversion.MapValueToKey("Label", "Label"), TransformerScope.TrainTest) + .Append(mlContext.Transforms.Conversion.MapValueToKey("Label"), TransformerScope.TrainTest) // Cache data in memory so that SDCA trainer will be able to randomly access training examples without // reading data from disk multiple times. Data will be cached at its first use in any downstream step. // Notice that unused part in the data may not be cached. From 60ab6cd01e9180c00b6ecf57107c586a2d97a87e Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Wed, 16 Jan 2019 10:27:45 -0800 Subject: [PATCH 6/6] Addressing the PR comments. 
--- .../ConversionsExtensionsCatalog.cs | 20 ++++++++----------- src/Microsoft.ML.Data/Transforms/Hashing.cs | 17 ++++++++-------- .../Transforms/KeyToVector.cs | 17 +++++++++++----- .../Transforms/TypeConverting.cs | 17 ++++++++-------- .../Transforms/ValueToKeyMappingEstimator.cs | 4 ++-- test/Microsoft.ML.Benchmarks/RffTransform.cs | 2 +- .../UnitTests/TestEntryPoints.cs | 2 +- .../Estimators/DecomposableTrainAndPredict.cs | 2 +- .../Scenarios/Api/Estimators/Extensibility.cs | 2 +- .../Api/Estimators/Metacomponents.cs | 2 +- .../Scenarios/TensorflowTests.cs | 2 +- .../TrainerEstimators/MetalinearEstimators.cs | 2 +- .../TrainerEstimators/TrainerEstimators.cs | 2 +- 13 files changed, 46 insertions(+), 45 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs index 974cf1c995..ec6afaa74a 100644 --- a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs @@ -20,9 +20,8 @@ public static class ConversionsExtensionsCatalog /// Hashes the values in the input column. /// /// The transform's catalog. - /// Name of the column to be transformed. - /// Name of the input column. If set to , the value of the - /// will be used as input. + /// Name of the column resulting from the transformation of . + /// Name of column to transform. If set to , the value of the will be used as source. /// Number of bits to hash into. Must be between 1 and 31, inclusive. /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. @@ -44,9 +43,8 @@ public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms /// Changes column type of the input column. /// /// The transform's catalog. 
- /// Name of the column to be transformed. If set to , the value of the - /// will be used as input. - /// Name of the new column produced. + /// Name of the column resulting from the transformation of . + /// Name of column to transform. If set to , the value of the will be used as source. /// Number of bits to hash into. Must be between 1 and 31, inclusive. public static TypeConvertingEstimator ConvertType(this TransformsCatalog.ConversionTransforms catalog, string name, string source = null, DataKind outputKind = ConvertDefaults.DefaultOutputKind) @@ -90,9 +88,8 @@ public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog. /// Convert the key types back to their original vectors. /// /// The categorical transform's catalog. - /// Name of the column to be transformed. If set to , the value of the - /// will be used as input. - /// Name of the new column produced. + /// Name of the column resulting from the transformation of . + /// Name of column to transform. If set to , the value of the will be used as source. /// Whether bagging is used for the conversion. public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.ConversionTransforms catalog, string name, string source = null, bool bag = KeyToVectorMappingEstimator.Defaults.Bag) @@ -102,9 +99,8 @@ public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog. /// Converts value types into . /// /// The categorical transform's catalog. - /// Name of the column to be transformed. If set to , the value of the - /// will be used as input. - /// Name of the new column produced. + /// Name of the column resulting from the transformation of . + /// Name of column to transform. If set to , the value of the will be used as source. /// Maximum number of keys to keep per column when auto-training. /// How items should be ordered when vectorized. If choosen they will be in the order encountered. 
/// If , items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). diff --git a/src/Microsoft.ML.Data/Transforms/Hashing.cs b/src/Microsoft.ML.Data/Transforms/Hashing.cs index 305db81f96..88de9aa32c 100644 --- a/src/Microsoft.ML.Data/Transforms/Hashing.cs +++ b/src/Microsoft.ML.Data/Transforms/Hashing.cs @@ -116,8 +116,8 @@ public bool TryUnparse(StringBuilder sb) public sealed class ColumnInfo { - public readonly string Source; public readonly string Name; + public readonly string Source; public readonly int HashBits; public readonly uint Seed; public readonly bool Ordered; @@ -126,8 +126,8 @@ public sealed class ColumnInfo /// /// Describes how the transformer handles one column pair. /// - /// Name of column to work on. - /// Name of the column resulting from the transformation of . Null means is replaced. + /// Name of the column resulting from the transformation of . + /// Name of column to transform. If set to , the value of the will be used as source. /// Number of bits to hash into. Must be between 1 and 31, inclusive. /// Hashing seed. /// Whether the position of each term should be included in the hash. @@ -1216,17 +1216,16 @@ internal static bool IsColumnTypeValid(ColumnType type) /// Initializes a new instance of . /// /// Host Environment. - /// Name of the column to be transformed. - /// If set to the value specified for the will be used. - /// Name of the output column. + /// Name of the column resulting from the transformation of . + /// Name of column to transform. If set to , the value of the will be used as source. /// Number of bits to hash into. Must be between 1 and 31, inclusive. /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. 
/// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - internal HashingEstimator(IHostEnvironment env, string outputColumn, string inputColumn = null, + internal HashingEstimator(IHostEnvironment env, string name, string source = null, int hashBits = Defaults.HashBits, int invertHash = Defaults.InvertHash) - : this(env, new HashingTransformer.ColumnInfo(outputColumn, inputColumn ?? outputColumn, hashBits: hashBits, invertHash: invertHash)) + : this(env, new HashingTransformer.ColumnInfo(name, source ?? name, hashBits: hashBits, invertHash: invertHash)) { } @@ -1259,7 +1258,7 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) metadata.Add(slotMeta); if (colInfo.InvertHash != 0) metadata.Add(new SchemaShape.Column(MetadataUtils.Kinds.KeyValues, SchemaShape.Column.VectorKind.Vector, TextType.Instance, false)); - result[colInfo.Output] = new SchemaShape.Column(colInfo.Name, col.ItemType is VectorType ? SchemaShape.Column.VectorKind.Vector : SchemaShape.Column.VectorKind.Scalar, NumberType.U4, true, new SchemaShape(metadata)); + result[colInfo.Name] = new SchemaShape.Column(colInfo.Name, col.ItemType is VectorType ? SchemaShape.Column.VectorKind.Vector : SchemaShape.Column.VectorKind.Scalar, NumberType.U4, true, new SchemaShape(metadata)); } return new SchemaShape(result.Values); } diff --git a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs index 38b8a0033c..4f3ceaf279 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs @@ -90,15 +90,15 @@ public sealed class Arguments /// public sealed class ColumnInfo { - public readonly string Source; public readonly string Name; + public readonly string Source; public readonly bool Bag; /// /// Describes how the transformer handles one column pair. 
/// - /// Name of columnto use. - /// Name of the column resulting from the transformation of . Null means is replaced. + /// Name of the column resulting from the transformation of . + /// Name of column to transform. If set to , the value of the will be used as source. /// Whether to combine multiple indicator vectors into a single bag vector instead of concatenating them. This is only relevant when the input column is a vector. public ColumnInfo(string name, string source = null, bool bag = KeyToVectorMappingEstimator.Defaults.Bag) { @@ -750,8 +750,15 @@ internal KeyToVectorMappingEstimator(IHostEnvironment env, params KeyToVectorMap { } - internal KeyToVectorMappingEstimator(IHostEnvironment env, string outputColumn, string inputColumn = null, bool bag = Defaults.Bag) - : this(env, new KeyToVectorMappingTransformer(env, new KeyToVectorMappingTransformer.ColumnInfo(outputColumn, inputColumn ?? outputColumn, bag))) + /// + /// Convert the key types back to their original vectors. + /// + /// The environment to use. + /// Name of the column resulting from the transformation of . + /// Name of column to transform. If set to , the value of the will be used as source. + /// Whether bagging is used for the conversion. + internal KeyToVectorMappingEstimator(IHostEnvironment env, string name, string source = null, bool bag = Defaults.Bag) + : this(env, new KeyToVectorMappingTransformer(env, new KeyToVectorMappingTransformer.ColumnInfo(name, source ??
name, bag))) { } diff --git a/src/Microsoft.ML.Data/Transforms/TypeConverting.cs b/src/Microsoft.ML.Data/Transforms/TypeConverting.cs index 3b905b8014..a83bba0fff 100644 --- a/src/Microsoft.ML.Data/Transforms/TypeConverting.cs +++ b/src/Microsoft.ML.Data/Transforms/TypeConverting.cs @@ -173,16 +173,16 @@ private static VersionInfo GetVersionInfo() /// public sealed class ColumnInfo { - public readonly string Source; public readonly string Name; + public readonly string Source; public readonly DataKind OutputKind; public readonly KeyRange OutputKeyRange; /// /// Describes how the transformer handles one column pair. /// - /// Name of input column. - /// Name of output column. + /// Name of the column resulting from the transformation of . + /// Name of column to transform. /// The expected kind of the converted column. /// New key range, if we work with key type. public ColumnInfo(string name, string source, DataKind outputKind, KeyRange outputKeyRange = null) @@ -209,7 +209,7 @@ private static (string source, string name)[] GetColumnPairs(ColumnInfo[] column /// /// Host Environment. /// Name of the column produced. - /// Name of the column to be transformed. If this is null '' will be used. + /// Name of the column to transform. If this is null '' will be used. /// The expected type of the converted column. /// New key range if we work with key type. public TypeConvertingTransformer(IHostEnvironment env, string source, string name, DataKind outputKind, KeyRange outputKeyRange = null) @@ -537,14 +537,13 @@ internal sealed class Defaults /// Convinence constructor for simple one column case. /// /// Host Environment. - /// Name of the input column. If set to , the value of the - /// will be used as input. - /// Name of the output column. + /// Name of the column resulting from the transformation of . + /// Name of column to transform. If set to , the value of the will be used as source. /// The expected type of the converted column. 
internal TypeConvertingEstimator(IHostEnvironment env, - string outputColumn, string inputColumn = null, + string name, string source = null, DataKind outputKind = Defaults.DefaultOutputKind) - : this(env, new TypeConvertingTransformer.ColumnInfo(outputColumn, inputColumn ?? outputColumn, outputKind)) + : this(env, new TypeConvertingTransformer.ColumnInfo(name, source ?? name, outputKind)) { } diff --git a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs index 8d2305757e..193409b4b2 100644 --- a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs +++ b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs @@ -27,8 +27,8 @@ public static class Defaults /// Initializes a new instance of . /// /// Host Environment. - /// Name of the column to be transformed. - /// Name of the column produced. + /// Name of the column resulting from the transformation of . + /// Name of column to transform. If set to , the value of the will be used as source. /// Maximum number of keys to keep per column when auto-training. /// How items should be ordered when vectorized. If choosen they will be in the order encountered. /// If , items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). 
diff --git a/test/Microsoft.ML.Benchmarks/RffTransform.cs b/test/Microsoft.ML.Benchmarks/RffTransform.cs index fefabd3f16..7744addefb 100644 --- a/test/Microsoft.ML.Benchmarks/RffTransform.cs +++ b/test/Microsoft.ML.Benchmarks/RffTransform.cs @@ -46,7 +46,7 @@ public void CV_Multiclass_Digits_RffTransform_OVAAveragedPerceptron() var pipeline = mlContext.Transforms.Projection.CreateRandomFourierFeatures("Features", "FeaturesRFF") .AppendCacheCheckpoint(mlContext) .Append(mlContext.Transforms.Concatenate("Features", "FeaturesRFF")) - .Append(new ValueToKeyMappingEstimator(mlContext, "Label", "Label")) + .Append(new ValueToKeyMappingEstimator(mlContext, "Label")) .Append(mlContext.MulticlassClassification.Trainers.OneVersusAll(mlContext.BinaryClassification.Trainers.AveragedPerceptron(numIterations: 10))); var cvResults = mlContext.MulticlassClassification.CrossValidate(data, pipeline, numFolds: 5); diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index 4660a8d2a4..3c64969469 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -748,7 +748,7 @@ public void EntryPointPipelineEnsemble() }).Fit(data).Transform(data); data = new ColumnConcatenatingTransformer(Env, "Features", new[] { "Features1", "Features2" }).Transform(data); - data = new ValueToKeyMappingEstimator(Env, "Label", "Label", sort: ValueToKeyMappingTransformer.SortOrder.Value).Fit(data).Transform(data); + data = new ValueToKeyMappingEstimator(Env, "Label", sort: ValueToKeyMappingTransformer.SortOrder.Value).Fit(data).Transform(data); var lrInput = new LogisticRegression.Arguments { diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs index 00f9281b75..94f53e65f2 100644 --- 
a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs @@ -30,7 +30,7 @@ void DecomposableTrainAndPredict() var data = ml.Data.ReadFromTextFile(dataPath, separatorChar: ','); var pipeline = new ColumnConcatenatingEstimator (ml, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") - .Append(new ValueToKeyMappingEstimator(ml, "Label", "Label"), TransformerScope.TrainTest) + .Append(new ValueToKeyMappingEstimator(ml, "Label"), TransformerScope.TrainTest) .Append(ml.MulticlassClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features",advancedSettings: s => { s.MaxIterations = 100; s.Shuffle = true; s.NumThreads = 1; })) .Append(new KeyToValueMappingEstimator(ml, "PredictedLabel")); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs index 105cceb885..84bd6691e9 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs @@ -39,7 +39,7 @@ void Extensibility() }; var pipeline = new ColumnConcatenatingEstimator (ml, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") .Append(new CustomMappingEstimator(ml, action, null), TransformerScope.TrainTest) - .Append(new ValueToKeyMappingEstimator(ml, "Label", "Label"), TransformerScope.TrainTest) + .Append(new ValueToKeyMappingEstimator(ml, "Label"), TransformerScope.TrainTest) .Append(ml.MulticlassClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features", advancedSettings: (s) => { s.MaxIterations = 100; s.Shuffle = true; s.NumThreads = 1; })) .Append(new KeyToValueMappingEstimator(ml, "PredictedLabel")); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs index 
3921de5fd1..70b6b0bbb5 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs @@ -27,7 +27,7 @@ public void Metacomponents() var sdcaTrainer = ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features", advancedSettings: (s) => { s.MaxIterations = 100; s.Shuffle = true; s.NumThreads = 1; }); var pipeline = new ColumnConcatenatingEstimator (ml, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") - .Append(new ValueToKeyMappingEstimator(ml, "Label", "Label"), TransformerScope.TrainTest) + .Append(new ValueToKeyMappingEstimator(ml, "Label"), TransformerScope.TrainTest) .Append(new Ova(ml, sdcaTrainer)) .Append(new KeyToValueMappingEstimator(ml, "PredictedLabel")); diff --git a/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs b/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs index f1e42ca284..430ae1c28e 100644 --- a/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs @@ -39,7 +39,7 @@ public void TensorFlowTransforCifarEndToEndTest() .Append(new ImagePixelExtractingEstimator(mlContext, "ImageCropped", "Input", interleave: true)) .Append(new TensorFlowEstimator(mlContext, model_location, new[] { "Input" }, new[] { "Output" })) .Append(new ColumnConcatenatingEstimator(mlContext, "Features", "Output")) - .Append(new ValueToKeyMappingEstimator(mlContext, "Label", "Label")) + .Append(new ValueToKeyMappingEstimator(mlContext, "Label")) .AppendCacheCheckpoint(mlContext) .Append(new SdcaMultiClassTrainer(mlContext)); diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs b/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs index 164b48068a..8371e3e415 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs @@ -76,7 +76,7 @@ public void 
MetacomponentsFeaturesRenamed() var sdcaTrainer = new SdcaBinaryTrainer(Env, "Label", "Vars", advancedSettings: (s) => { s.MaxIterations = 100; s.Shuffle = true; s.NumThreads = 1; }); var pipeline = new ColumnConcatenatingEstimator(Env, "Vars", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") - .Append(new ValueToKeyMappingEstimator(Env, "Label", "Label"), TransformerScope.TrainTest) + .Append(new ValueToKeyMappingEstimator(Env, "Label"), TransformerScope.TrainTest) .Append(new Ova(Env, sdcaTrainer)) .Append(new KeyToValueMappingEstimator(Env, "PredictedLabel")); diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TrainerEstimators.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TrainerEstimators.cs index ae02d6fc39..85af744ac7 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TrainerEstimators.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TrainerEstimators.cs @@ -193,7 +193,7 @@ private TextLoader.Arguments GetIrisLoaderArgs() } }).Read(GetDataPath(IrisDataPath)); - var pipeline = new ValueToKeyMappingEstimator(Env, "Label", "Label"); + var pipeline = new ValueToKeyMappingEstimator(Env, "Label"); return (pipeline, data); }