diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMapping.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMapping.cs new file mode 100644 index 0000000000..9a0e7aabf3 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMapping.cs @@ -0,0 +1,70 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML; + +namespace Samples.Dynamic +{ + public static class CustomMapping + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable and convert it to an IDataView. + var samples = new List + { + new InputData { Age = 26 }, + new InputData { Age = 35 }, + new InputData { Age = 34 }, + new InputData { Age = 28 }, + }; + var data = mlContext.Data.LoadFromEnumerable(samples); + + // We define the custom mapping between input and output rows that will be applied by the transformation. + Action mapping = + (input, output) => output.IsUnderThirty = input.Age < 30; + + // Custom transformations can be used to transform data directly, or as part of a pipeline of estimators. + // Note: If contractName is null in the CustomMapping estimator, any pipeline of estimators containing it, + // cannot be saved and loaded back. + var pipeline = mlContext.Transforms.CustomMapping(mapping, contractName: null); + + // Now we can transform the data and look at the output to confirm the behavior of the estimator. + // This operation doesn't actually evaluate data until we read the data below. 
+ var transformer = pipeline.Fit(data); + var transformedData = transformer.Transform(data); + + var dataEnumerable = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: true); + Console.WriteLine("Age\t IsUnderThirty"); + foreach (var row in dataEnumerable) + Console.WriteLine($"{row.Age}\t {row.IsUnderThirty}"); + + // Expected output: + // Age IsUnderThirty + // 26 True + // 35 False + // 34 False + // 28 True + } + + // Defines only the column to be generated by the custom mapping transformation in addition to the columns already present. + private class CustomMappingOutput + { + public bool IsUnderThirty { get; set; } + } + + // Defines the schema of the input data. + private class InputData + { + public float Age { get; set; } + } + + // Defines the schema of the transformed data, which includes the new column IsUnderThirty. + private class TransformedData : InputData + { + public bool IsUnderThirty { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSample.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSample.cs deleted file mode 100644 index d4b5b5904e..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSample.cs +++ /dev/null @@ -1,80 +0,0 @@ -using System; -namespace Microsoft.ML.Samples.Dynamic -{ - public static class CustomMapping - { - public static void Example() - { - // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, - // as well as the source of randomness. - var mlContext = new MLContext(); - - // Get a small dataset as an IEnumerable and convert it to an IDataView. - var data = SamplesUtils.DatasetUtils.GetInfertData(); - var trainData = mlContext.Data.LoadFromEnumerable(data); - - // Preview of the data. - // Age RowNum Education ... - // 26 0 0-5yrs ... - // 42 1 0-5yrs ... - // 39 2 12+yrs ... - // 34 3 0-5yrs ... - // 35 4 6-11yrs ... 
- - // We define the custom mapping between input and output rows that will be applied by the transformation. - Action mapping = - (input, output) => output.IsUnderThirty = input.Age < 30; - - // Custom transformations can be used to transform data directly, or as part of a pipeline. Below we transform data directly. - var estimator = mlContext.Transforms.CustomMapping(mapping, null); - var transformedData = estimator.Fit(trainData).Transform(trainData); - - // Preview 5 lines of the transformed data. - transformedData = mlContext.Data.TakeRows(transformedData, 5); - var dataEnumerable = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: true); - Console.WriteLine("IsUnderThirty\t Age\t RowNum\t Education\t ..."); - foreach (var row in dataEnumerable) - Console.WriteLine($"{row.IsUnderThirty}\t {row.Age}\t {row.RowNum}\t {row.Education}\t ..."); - // Expected output: - // IsUnderThirty Age RowNum Education ... - // True 26 0 0-5yrs ... - // False 42 1 0-5yrs ... - // False 39 2 12+yrs ... - // False 34 3 0-5yrs ... - // False 35 4 6-11yrs ... - - - // Here instead we use it as part of a pipeline of estimators. - var pipeline = mlContext.Transforms.CustomMapping(mapping, null) - .Append(mlContext.Transforms.Concatenate(outputColumnName: "Features", inputColumnNames: new[] { "Parity", "Induced" })) - // It is useful to add a caching checkpoint before a trainer that does several passes over the data. - .AppendCacheCheckpoint(mlContext) - // We use binary FastTree to predict the label column that was generated by the custom mapping at the first step of the pipeline. - .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "IsUnderThirty")); - - // We can train the pipeline and use it to transform data. - transformedData = pipeline.Fit(trainData).Transform(trainData); - } - - // This defines only the column to be generated by the transformation in addition to the columns already present. 
- public class OutputRow - { - public bool IsUnderThirty { get; set; } - } - - // Represents the transformed infertility dataset. - public class SampleInfertDataTransformed - { - public bool IsUnderThirty { get; set; } - public float Age { get; set; } - public int RowNum { get; set; } - public string Education { get; set; } - public float Parity { get; set; } - public float Induced { get; set; } - public float Case { get; set; } - public float Spontaneous { get; set; } - public float Stratum { get; set; } - public float PooledStratum { get; set; } - } - } -} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSaveAndLoad.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSaveAndLoad.cs new file mode 100644 index 0000000000..8bc4439190 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSaveAndLoad.cs @@ -0,0 +1,87 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML; +using Microsoft.ML.Transforms; + +namespace Samples.Dynamic +{ + public static class CustomMappingSaveAndLoad + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable and convert it to an IDataView. + var samples = new List + { + new InputData { Age = 26 }, + new InputData { Age = 35 }, + new InputData { Age = 34 }, + new InputData { Age = 28 }, + }; + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Custom transformations can be used to transform data directly, or as part of a pipeline of estimators. 
+ var pipeline = mlContext.Transforms.CustomMapping(new IsUnderThirtyCustomAction().GetMapping(), contractName: "IsUnderThirty"); + var transformer = pipeline.Fit(data); + + // To save and load the CustomMapping estimator, the assembly in which the custom action is defined needs to be registered in the + // environment. The following registers the assembly where IsUnderThirtyCustomAction is defined. + mlContext.ComponentCatalog.RegisterAssembly(typeof(IsUnderThirtyCustomAction).Assembly); + + // Now the transform pipeline can be saved and loaded through the usual MLContext method. + mlContext.Model.Save(transformer, data.Schema, "customTransform.zip"); + var loadedTransform = mlContext.Model.Load("customTransform.zip", out var inputSchema); + + // Now we can transform the data and look at the output to confirm the behavior of the estimator. + // This operation doesn't actually evaluate data until we read the data below. + var transformedData = loadedTransform.Transform(data); + + var dataEnumerable = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: true); + Console.WriteLine("Age\tIsUnderThirty"); + foreach (var row in dataEnumerable) + Console.WriteLine($"{row.Age}\t {row.IsUnderThirty}"); + + // Expected output: + // Age IsUnderThirty + // 26 True + // 35 False + // 34 False + // 28 True + } + + // The custom action needs to implement the abstract class CustomMappingFactory, and needs to have attribute + // CustomMappingFactoryAttribute with argument equal to the contractName used to define the CustomMapping estimator + // which uses the action. + [CustomMappingFactoryAttribute("IsUnderThirty")] + private class IsUnderThirtyCustomAction : CustomMappingFactory + { + // We define the custom mapping between input and output rows that will be applied by the transformation. 
+ public static void CustomAction(InputData input, CustomMappingOutput output) + => output.IsUnderThirty = input.Age < 30; + + public override Action GetMapping() + => CustomAction; + } + + // Defines only the column to be generated by the custom mapping transformation in addition to the columns already present. + private class CustomMappingOutput + { + public bool IsUnderThirty { get; set; } + } + + // Defines the schema of the input data. + private class InputData + { + public float Age { get; set; } + } + + // Defines the schema of the transformed data, which includes the new column IsUnderThirty. + private class TransformedData : InputData + { + public bool IsUnderThirty { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs index 15d448deee..c33333a8ab 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs @@ -7,26 +7,24 @@ namespace Microsoft.ML.Samples.Dynamic { public static class IndicateMissingValues { - public static void Example() { // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, // as well as the source of randomness. var mlContext = new MLContext(); + // Get a small dataset as an IEnumerable and convert it to an IDataView. var samples = new List() { - new DataPoint(){ Label = 3, Features = new float[3] {1, 1, 0} }, - new DataPoint(){ Label = 32, Features = new float[3] {0, float.NaN, 1} }, - new DataPoint(){ Label = float.NaN, Features = new float[3] {-1, float.NaN, -3} }, + new DataPoint(){ Features = new float[3] {1, 1, 0} }, + new DataPoint(){ Features = new float[3] {0, float.NaN, 1} }, + new DataPoint(){ Features = new float[3] {-1, float.NaN, -3} }, }; - // Convert training data to IDataView, the general data type used in ML.NET. 
var data = mlContext.Data.LoadFromEnumerable(samples); - // IndicateMissingValues is used to create a boolean containing - // 'true' where the value in the input column is NaN. This value can be used - // to replace missing values with other values. - IEstimator pipeline = mlContext.Transforms.IndicateMissingValues("MissingIndicator", "Features"); + // IndicateMissingValues is used to create a boolean containing 'true' where the value in the + // input column is missing. For floats and doubles, missing values are represented as NaN. + var pipeline = mlContext.Transforms.IndicateMissingValues("MissingIndicator", "Features"); // Now we can transform the data and look at the output to confirm the behavior of the estimator. // This operation doesn't actually evaluate data until we read the data below. @@ -36,32 +34,18 @@ public static void Example() // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. var rowEnumerable = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: false); - // a small printing utility - Func vectorPrinter = (object[] vector) => - { - string preview = "["; - foreach (var slot in vector) - preview += $"{slot} "; - return preview += "]"; - - }; - // And finally, we can write out the rows of the dataset, looking at the columns of interest. 
foreach (var row in rowEnumerable) - { - Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast().ToArray())} MissingIndicator: {vectorPrinter(row.MissingIndicator.Cast().ToArray())}"); - } + Console.WriteLine($"Features: [{string.Join(", ", row.Features)}]\t MissingIndicator: [{string.Join(", ", row.MissingIndicator)}]"); // Expected output: - // - // Label: 3 Features: [1 1 0] MissingIndicator: [False False False] - // Label: 32 Features: [0 NaN 1] MissingIndicator: [False True False] - // Label: NaN Features: [-1 NaN -3 ] MissingIndicator: [False True False] + // Features: [1, 1, 0] MissingIndicator: [False, False, False] + // Features: [0, NaN, 1] MissingIndicator: [False, True, False] + // Features: [-1, NaN, -3] MissingIndicator: [False, True, False] } private class DataPoint { - public float Label { get; set; } [VectorType(3)] public float[] Features { get; set; } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs new file mode 100644 index 0000000000..830fb9d047 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs @@ -0,0 +1,68 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML; +using Microsoft.ML.Data; + +namespace Samples.Dynamic +{ + public static class IndicateMissingValuesMultiColumn + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable and convert it to an IDataView. 
+ var samples = new List() + { + new DataPoint(){ Features1 = new float[3] {1, 1, 0}, Features2 = new float[2] {1, 1} }, + new DataPoint(){ Features1 = new float[3] {0, float.NaN, 1}, Features2 = new float[2] {float.NaN, 1} }, + new DataPoint(){ Features1 = new float[3] {-1, float.NaN, -3}, Features2 = new float[2] {1, float.PositiveInfinity} }, + }; + var data = mlContext.Data.LoadFromEnumerable(samples); + + // IndicateMissingValues is used to create a boolean containing 'true' where the value in the + // input column is missing. For floats and doubles, missing values are NaN. + // We can use an array of InputOutputColumnPair to apply the MissingValueIndicatorEstimator + // to multiple columns in one pass over the data. + var pipeline = mlContext.Transforms.IndicateMissingValues(new[] { + new InputOutputColumnPair("MissingIndicator1", "Features1"), + new InputOutputColumnPair("MissingIndicator2", "Features2") + }); + + // Now we can transform the data and look at the output to confirm the behavior of the estimator. + // This operation doesn't actually evaluate data until we read the data below. + var transformer = pipeline.Fit(data); + var transformedData = transformer.Transform(data); + + // We can extract the newly created columns as an IEnumerable of SampleDataTransformed, the class we define below. + var rowEnumerable = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: false); + + // And finally, we can write out the rows of the dataset, looking at the columns of interest. 
+ foreach (var row in rowEnumerable) + Console.WriteLine($"Features1: [{string.Join(", ", row.Features1)}]\t MissingIndicator1: [{string.Join(", ", row.MissingIndicator1)}]\t " + + $"Features2: [{string.Join(", ", row.Features2)}]\t MissingIndicator2: [{string.Join(", ", row.MissingIndicator2)}]"); + + // Expected output: + // Features1: [1, 1, 0] MissingIndicator1: [False, False, False] Features2: [1, 1] MissingIndicator2: [False, False] + // Features1: [0, NaN, 1] MissingIndicator1: [False, True, False] Features2: [NaN, 1] MissingIndicator2: [True, False] + // Features1: [-1, NaN, -3] MissingIndicator1: [False, True, False] Features2: [1, ∞] MissingIndicator2: [False, False] + } + + private class DataPoint + { + [VectorType(3)] + public float[] Features1 { get; set; } + [VectorType(2)] + public float[] Features2 { get; set; } + } + + private sealed class SampleDataTransformed : DataPoint + { + public bool[] MissingIndicator1 { get; set; } + public bool[] MissingIndicator2 { get; set; } + + } + } +} \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs index 01fce1ad06..356451577e 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs @@ -13,29 +13,20 @@ public static void Example() // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, // as well as the source of randomness. var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable and convert it to an IDataView. 
var samples = new List() { - new DataPoint(){ Label = 3, Features = new float[3] {1, 1, 0} }, - new DataPoint(){ Label = 32, Features = new float[3] {0, float.NaN, 1} }, - new DataPoint(){ Label = 5, Features = new float[3] {-1, 2, -3} }, - new DataPoint(){ Label = 9, Features = new float[3] {-1, 6, -3} }, + new DataPoint(){ Features = new float[3] {float.PositiveInfinity, 1, 0} }, + new DataPoint(){ Features = new float[3] {0, float.NaN, 1} }, + new DataPoint(){ Features = new float[3] {-1, 2, -3} }, + new DataPoint(){ Features = new float[3] {-1, float.NaN, -3} }, }; - // Convert training data to IDataView, the general data type used in ML.NET. var data = mlContext.Data.LoadFromEnumerable(samples); - // ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode. - var meanPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", MissingValueReplacingEstimator.ReplacementMode.Mean); - - // Now we can transform the data and look at the output to confirm the behavior of the estimator. - // This operation doesn't actually evaluate data until we read the data below. - var meanTransformer = meanPipeline.Fit(data); - var meanTransformedData = meanTransformer.Transform(data); - - // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. - var meanRowEnumerable = mlContext.Data.CreateEnumerable(meanTransformedData, reuseRowObject: false); - - // ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode. - var defaultPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", MissingValueReplacingEstimator.ReplacementMode.DefaultValue); + // Here we use the default replacement mode, which replaces the value with the default value for its type. 
+ var defaultPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", + MissingValueReplacingEstimator.ReplacementMode.DefaultValue); // Now we can transform the data and look at the output to confirm the behavior of the estimator. // This operation doesn't actually evaluate data until we read the data below. @@ -45,49 +36,41 @@ public static void Example() // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. var defaultRowEnumerable = mlContext.Data.CreateEnumerable(defaultTransformedData, reuseRowObject: false); - // a small printing utility - Func vectorPrinter = (object[] vector) => - { - string preview = "["; - foreach (var slot in vector) - preview += $"{slot} "; - return preview += "]"; - - }; - // And finally, we can write out the rows of the dataset, looking at the columns of interest. - foreach (var row in meanRowEnumerable) - { - Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast().ToArray())} MissingReplaced: {vectorPrinter(row.MissingReplaced.Cast().ToArray())}"); - } + foreach (var row in defaultRowEnumerable) + Console.WriteLine($"Features: [{string.Join(", ", row.Features)}]\t MissingReplaced: [{string.Join(", ", row.MissingReplaced)}]"); // Expected output: - // Notice how the NaN of the Features column for the second row is replaced by the mean of (1, 2, 6) the values in that row - // - //Label: 3 Features: [1 1 0] MissingReplaced: [1 1 0] - //Label: 32 Features: [0 NaN 1] MissingReplaced: [0 3 1] - //Label: 5 Features: [-1 2 - 3] MissingReplaced: [-1 2 -3] - //Label: 9 Features: [-1 6 - 3] MissingReplaced: [-1 6 -3] + // Features: [∞, 1, 0] MissingReplaced: [∞, 1, 0] + // Features: [0, NaN, 1] MissingReplaced: [0, 0, 1] + // Features: [-1, 2, -3] MissingReplaced: [-1, 2, -3] + // Features: [-1, NaN, -3] MissingReplaced: [-1, 0, -3] + + // Here we use the mean replacement mode, which replaces the value with the mean of the 
values that were not missing. + var meanPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", + MissingValueReplacingEstimator.ReplacementMode.Mean); + + // Now we can transform the data and look at the output to confirm the behavior of the estimator. + // This operation doesn't actually evaluate data until we read the data below. + var meanTransformer = meanPipeline.Fit(data); + var meanTransformedData = meanTransformer.Transform(data); + + // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. + var meanRowEnumerable = mlContext.Data.CreateEnumerable(meanTransformedData, reuseRowObject: false); // And finally, we can write out the rows of the dataset, looking at the columns of interest. - foreach (var row in defaultRowEnumerable) - { - Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast().ToArray())} MissingReplaced: {vectorPrinter(row.MissingReplaced.Cast().ToArray())}"); - } + foreach (var row in meanRowEnumerable) + Console.WriteLine($"Features: [{string.Join(", ", row.Features)}]\t MissingReplaced: [{string.Join(", ", row.MissingReplaced)}]"); // Expected output: - // Notice how the NaN of the Features column for the second row is replaced by 0, the default value for floats. 
- // - //Label: 3 Features: [1 1 0] MissingReplaced: [1 1 0] - //Label: 32 Features: [0 NaN 1] MissingReplaced: [0 0 1] - //Label: 5 Features: [-1 2 - 3] MissingReplaced: [-1 2 - 3] - //Label: 9 Features: [-1 6 - 3] MissingReplaced: [-1 6 - 3] + // Features: [∞, 1, 0] MissingReplaced: [∞, 1, 0] + // Features: [0, NaN, 1] MissingReplaced: [0, 1.5, 1] + // Features: [-1, 2, -3] MissingReplaced: [-1, 2, -3] + // Features: [-1, NaN, -3] MissingReplaced: [-1, 1.5, -3] } private class DataPoint { - public float Label { get; set; } - [VectorType(3)] public float[] Features { get; set; } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs new file mode 100644 index 0000000000..aa5d1acf5b --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs @@ -0,0 +1,96 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Transforms; + +namespace Samples.Dynamic +{ + class ReplaceMissingValuesMultiColumn + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable and convert it to an IDataView. 
+ var samples = new List() + { + new DataPoint(){ Features1 = new float[3] {1, 1, 0}, Features2 = new float[2] {1, 1} }, + new DataPoint(){ Features1 = new float[3] {0, float.NaN, 1}, Features2 = new float[2] {0, 1} }, + new DataPoint(){ Features1 = new float[3] {-1, float.NaN, -3}, Features2 = new float[2] {-1, float.NaN} }, + new DataPoint(){ Features1 = new float[3] {-1, 6, -3}, Features2 = new float[2] {0, float.PositiveInfinity} }, + }; + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Here we use the default replacement mode, which replaces the value with the default value for its type. + var defaultPipeline = mlContext.Transforms.ReplaceMissingValues(new[] { + new InputOutputColumnPair("MissingReplaced1", "Features1"), + new InputOutputColumnPair("MissingReplaced2", "Features2") + }, + MissingValueReplacingEstimator.ReplacementMode.DefaultValue); + + // Now we can transform the data and look at the output to confirm the behavior of the estimator. + // This operation doesn't actually evaluate data until we read the data below. + var defaultTransformer = defaultPipeline.Fit(data); + var defaultTransformedData = defaultTransformer.Transform(data); + + // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. + var defaultRowEnumerable = mlContext.Data.CreateEnumerable(defaultTransformedData, reuseRowObject: false); + + // And finally, we can write out the rows of the dataset, looking at the columns of interest. 
+ foreach (var row in defaultRowEnumerable) + Console.WriteLine($"Features1: [{string.Join(", ", row.Features1)}]\t MissingReplaced1: [{string.Join(", ", row.MissingReplaced1)}]\t " + + $"Features2: [{ string.Join(", ", row.Features2)}]\t MissingReplaced2: [{string.Join(", ", row.MissingReplaced2)}]"); + + // Expected output: + // Features1: [1, 1, 0] MissingReplaced1: [1, 1, 0] Features2: [1, 1] MissingReplaced2: [1, 1] + // Features1: [0, NaN, 1] MissingReplaced1: [0, 0, 1] Features2: [0, 1] MissingReplaced2: [0, 1] + // Features1: [-1, NaN, -3] MissingReplaced1: [-1, 0, -3] Features2: [-1, NaN] MissingReplaced2: [-1, 0] + // Features1: [-1, 6, -3] MissingReplaced1: [-1, 6, -3] Features2: [0, ∞] MissingReplaced2: [0, ∞] + + // Here we use the mean replacement mode, which replaces the value with the mean of the values that were not missing. + var meanPipeline = mlContext.Transforms.ReplaceMissingValues(new[] { + new InputOutputColumnPair("MissingReplaced1", "Features1"), + new InputOutputColumnPair("MissingReplaced2", "Features2") + }, + MissingValueReplacingEstimator.ReplacementMode.Mean); + + // Now we can transform the data and look at the output to confirm the behavior of the estimator. + // This operation doesn't actually evaluate data until we read the data below. + var meanTransformer = meanPipeline.Fit(data); + var meanTransformedData = meanTransformer.Transform(data); + + // We can extract the newly created columns as an IEnumerable of SampleDataTransformed, the class we define below. + var meanRowEnumerable = mlContext.Data.CreateEnumerable(meanTransformedData, reuseRowObject: false); + + // And finally, we can write out the rows of the dataset, looking at the columns of interest. 
+ foreach (var row in meanRowEnumerable) + Console.WriteLine($"Features1: [{string.Join(", ", row.Features1)}]\t MissingReplaced1: [{string.Join(", ", row.MissingReplaced1)}]\t " + + $"Features2: [{ string.Join(", ", row.Features2)}]\t MissingReplaced2: [{string.Join(", ", row.MissingReplaced2)}]"); + + // Expected output: + // Features1: [1, 1, 0] MissingReplaced1: [1, 1, 0] Features2: [1, 1] MissingReplaced2: [1, 1] + // Features1: [0, NaN, 1] MissingReplaced1: [0, 3.5, 1] Features2: [0, 1] MissingReplaced2: [0, 1] + // Features1: [-1, NaN, -3] MissingReplaced1: [-1, 3.5, -3] Features2: [-1, NaN] MissingReplaced2: [-1, 1] + // Features1: [-1, 6, -3] MissingReplaced1: [-1, 6, -3] Features2: [0, ∞] MissingReplaced2: [0, ∞] + } + + private class DataPoint + { + [VectorType(3)] + public float[] Features1 { get; set; } + [VectorType(2)] + public float[] Features2 { get; set; } + } + + private sealed class SampleDataTransformed : DataPoint + { + [VectorType(3)] + public float[] MissingReplaced1 { get; set; } + [VectorType(2)] + public float[] MissingReplaced2 { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj index 9772e70704..2fee715827 100644 --- a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj +++ b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj @@ -1,8 +1,10 @@  - + netcoreapp2.1 Exe + false + false diff --git a/src/Microsoft.ML.Transforms/CustomMappingCatalog.cs b/src/Microsoft.ML.Transforms/CustomMappingCatalog.cs index a3f9320e30..0bab219ee4 100644 --- a/src/Microsoft.ML.Transforms/CustomMappingCatalog.cs +++ b/src/Microsoft.ML.Transforms/CustomMappingCatalog.cs @@ -26,7 +26,8 @@ public static class CustomMappingCatalog /// /// /// /// public static CustomMappingEstimator CustomMapping(this TransformsCatalog catalog, Action mapAction, string contractName, diff --git a/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs 
b/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs index 30685d8067..5ff3f5a47c 100644 --- a/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs @@ -24,7 +24,7 @@ public static class ExtensionsCatalog /// /// /// /// public static MissingValueIndicatorEstimator IndicateMissingValues(this TransformsCatalog catalog, @@ -38,6 +38,12 @@ public static MissingValueIndicatorEstimator IndicateMissingValues(this Transfor /// /// The transform extensions' catalog. /// Specifies the names of the columns on which to apply the transformation. + /// + /// + /// + /// public static MissingValueIndicatorEstimator IndicateMissingValues(this TransformsCatalog catalog, InputOutputColumnPair[] columns) { var env = CatalogUtils.GetEnvironment(catalog); @@ -63,7 +69,7 @@ public static MissingValueIndicatorEstimator IndicateMissingValues(this Transfor /// /// /// /// public static MissingValueReplacingEstimator ReplaceMissingValues(this TransformsCatalog catalog, @@ -83,6 +89,12 @@ public static MissingValueReplacingEstimator ReplaceMissingValues(this Transform /// If true, per-slot imputation of replacement is performed. /// Otherwise, replacement value is imputed for the entire vector column. This setting is ignored for scalars and variable vectors, /// where imputation is always for the entire column. + /// + /// + /// + /// public static MissingValueReplacingEstimator ReplaceMissingValues(this TransformsCatalog catalog, InputOutputColumnPair[] columns, MissingValueReplacingEstimator.ReplacementMode replacementMode = MissingValueReplacingEstimator.Defaults.Mode,