Skip to content

Samples for CustomMapping, IndicateMissingValues, ReplaceMissingValues #3275

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
using System;
using System.Collections.Generic;
using Microsoft.ML;

namespace Samples.Dynamic
{
/// <summary>
/// Sample showing how a custom mapping adds a computed column to an existing dataset.
/// </summary>
public static class CustomMapping
{
    /// <summary>
    /// Builds a four-row in-memory dataset of ages, applies a custom mapping that flags
    /// ages below 30, and writes each row's Age and IsUnderThirty values to the console.
    /// </summary>
    public static void Example()
    {
        // The MLContext is the starting point for ML.NET operations; it also provides
        // exception tracking, logging and the source of randomness.
        var mlContext = new MLContext();

        // Assemble the in-memory rows, then expose them as an IDataView.
        var people = new List<InputData>();
        foreach (var age in new float[] { 26, 35, 34, 28 })
        {
            people.Add(new InputData { Age = age });
        }
        var data = mlContext.Data.LoadFromEnumerable(people);

        // The row-to-row mapping that the transformation will apply: it reads Age from the
        // input row and writes the IsUnderThirty flag to the output row.
        Action<InputData, CustomMappingOutput> flagUnderThirty = (person, result) =>
        {
            result.IsUnderThirty = person.Age < 30;
        };

        // A custom mapping can be used on its own or as one step in a larger pipeline.
        // Note: with contractName set to null, a pipeline containing this estimator
        // cannot be saved and loaded back.
        var estimator = mlContext.Transforms.CustomMapping(flagUnderThirty, contractName: null);

        // Fit and apply the transformation. Nothing is actually computed until the
        // rows are read in the loop below.
        var transformedData = estimator.Fit(data).Transform(data);

        // Each row is printed immediately and never retained, so reusing the row object is fine here.
        var rows = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, reuseRowObject: true);
        Console.WriteLine("Age\t IsUnderThirty");
        foreach (var row in rows)
        {
            Console.WriteLine($"{row.Age}\t {row.IsUnderThirty}");
        }

        // Expected output:
        //  Age      IsUnderThirty
        //  26       True
        //  35       False
        //  34       False
        //  28       True
    }

    // Schema of the rows fed into the pipeline.
    private class InputData
    {
        public float Age { get; set; }
    }

    // Holds only the column produced by the custom mapping; the input columns are carried
    // over to the transformed data separately.
    private class CustomMappingOutput
    {
        public bool IsUnderThirty { get; set; }
    }

    // Schema of the rows read back after the transformation: the input columns plus
    // the newly generated IsUnderThirty column.
    private class TransformedData : InputData
    {
        public bool IsUnderThirty { get; set; }
    }
}
}

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
using System;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Transforms;

namespace Samples.Dynamic
{
/// <summary>
/// Sample showing how to define a custom mapping so that the fitted transformer can be
/// saved to disk and loaded back.
/// </summary>
public static class CustomMappingSaveAndLoad
{
    // Contract name that ties the CustomMapping estimator to its factory class. The value
    // passed to the estimator and the one on the factory attribute must be identical, so
    // both places reference this single constant.
    private const string ContractName = "IsUnderThirty";

    /// <summary>
    /// Fits a custom mapping over a four-row in-memory dataset, saves the fitted transformer
    /// to "customTransform.zip", loads it back, applies the loaded transformer to the data and
    /// writes each row's Age and IsUnderThirty values to the console.
    /// </summary>
    public static void Example()
    {
        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var mlContext = new MLContext();

        // Get a small dataset as an IEnumerable and convert it to an IDataView.
        var samples = new List<InputData>
        {
            new InputData { Age = 26 },
            new InputData { Age = 35 },
            new InputData { Age = 34 },
            new InputData { Age = 28 },
        };
        var data = mlContext.Data.LoadFromEnumerable(samples);

        // Custom transformations can be used to transform data directly, or as part of a pipeline of estimators.
        var pipeline = mlContext.Transforms.CustomMapping(new IsUnderThirtyCustomAction().GetMapping(), contractName: ContractName);
        var transformer = pipeline.Fit(data);

        // To save and load the CustomMapping estimator, the assembly in which the custom action is defined needs to be
        // registered in the environment. The following registers the assembly where IsUnderThirtyCustomAction is defined.
        mlContext.ComponentCatalog.RegisterAssembly(typeof(IsUnderThirtyCustomAction).Assembly);

        // Now the transform pipeline can be saved and loaded through the usual MLContext method.
        // The input schema returned by Load is not needed in this sample, so it is discarded.
        mlContext.Model.Save(transformer, data.Schema, "customTransform.zip");
        var loadedTransform = mlContext.Model.Load("customTransform.zip", out _);

        // Now we can transform the data and look at the output to confirm the behavior of the estimator.
        // This operation doesn't actually evaluate data until we read the data below.
        var transformedData = loadedTransform.Transform(data);

        // Each row is printed immediately and never retained, so reusing the row object is fine here.
        var dataEnumerable = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, reuseRowObject: true);

        // The header uses the same "\t " separator as the rows so the columns line up.
        Console.WriteLine("Age\t IsUnderThirty");
        foreach (var row in dataEnumerable)
        {
            Console.WriteLine($"{row.Age}\t {row.IsUnderThirty}");
        }

        // Expected output:
        //  Age      IsUnderThirty
        //  26       True
        //  35       False
        //  34       False
        //  28       True
    }

    // The custom action needs to implement the abstract class CustomMappingFactory, and needs to have attribute
    // CustomMappingFactoryAttribute with argument equal to the contractName used to define the CustomMapping estimator
    // which uses the action.
    [CustomMappingFactoryAttribute(ContractName)]
    private class IsUnderThirtyCustomAction : CustomMappingFactory<InputData, CustomMappingOutput>
    {
        // We define the custom mapping between input and output rows that will be applied by the transformation.
        public static void CustomAction(InputData input, CustomMappingOutput output)
            => output.IsUnderThirty = input.Age < 30;

        // Hands the mapping to the CustomMapping estimator as a delegate.
        public override Action<InputData, CustomMappingOutput> GetMapping()
            => CustomAction;
    }

    // Defines only the column to be generated by the custom mapping transformation in addition to the columns already present.
    private class CustomMappingOutput
    {
        public bool IsUnderThirty { get; set; }
    }

    // Defines the schema of the input data.
    private class InputData
    {
        public float Age { get; set; }
    }

    // Defines the schema of the transformed data, which includes the new column IsUnderThirty.
    private class TransformedData : InputData
    {
        public bool IsUnderThirty { get; set; }
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,26 +7,24 @@ namespace Microsoft.ML.Samples.Dynamic
{
public static class IndicateMissingValues
{

public static void Example()
{
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
var mlContext = new MLContext();

// Get a small dataset as an IEnumerable and convert it to an IDataView.
var samples = new List<DataPoint>()
{
new DataPoint(){ Label = 3, Features = new float[3] {1, 1, 0} },
new DataPoint(){ Label = 32, Features = new float[3] {0, float.NaN, 1} },
new DataPoint(){ Label = float.NaN, Features = new float[3] {-1, float.NaN, -3} },
new DataPoint(){ Features = new float[3] {1, 1, 0} },
new DataPoint(){ Features = new float[3] {0, float.NaN, 1} },
new DataPoint(){ Features = new float[3] {-1, float.NaN, -3} },
};
// Convert training data to IDataView, the general data type used in ML.NET.
var data = mlContext.Data.LoadFromEnumerable(samples);

// IndicateMissingValues is used to create a boolean containing
// 'true' where the value in the input column is NaN. This value can be used
// to replace missing values with other values.
IEstimator<ITransformer> pipeline = mlContext.Transforms.IndicateMissingValues("MissingIndicator", "Features");
// IndicateMissingValues is used to create a boolean containing 'true' where the value in the
// input column is missing. For floats and doubles, missing values are represented as NaN.
var pipeline = mlContext.Transforms.IndicateMissingValues("MissingIndicator", "Features");

// Now we can transform the data and look at the output to confirm the behavior of the estimator.
// This operation doesn't actually evaluate data until we read the data below.
Expand All @@ -36,32 +34,18 @@ public static void Example()
// We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below.
var rowEnumerable = mlContext.Data.CreateEnumerable<SampleDataTransformed>(transformedData, reuseRowObject: false);

// a small printing utility
Func<object[], string> vectorPrinter = (object[] vector) =>
{
string preview = "[";
foreach (var slot in vector)
preview += $"{slot} ";
return preview += "]";

};

// And finally, we can write out the rows of the dataset, looking at the columns of interest.
foreach (var row in rowEnumerable)
{
Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast<object>().ToArray())} MissingIndicator: {vectorPrinter(row.MissingIndicator.Cast<object>().ToArray())}");
}
Console.WriteLine($"Features: [{string.Join(", ", row.Features)}]\t MissingIndicator: [{string.Join(", ", row.MissingIndicator)}]");

// Expected output:
//
// Label: 3 Features: [1 1 0] MissingIndicator: [False False False]
// Label: 32 Features: [0 NaN 1] MissingIndicator: [False True False]
// Label: NaN Features: [-1 NaN -3 ] MissingIndicator: [False True False]
// Features: [1, 1, 0] MissingIndicator: [False, False, False]
// Features: [0, NaN, 1] MissingIndicator: [False, True, False]
// Features: [-1, NaN, -3] MissingIndicator: [False, True, False]
}

private class DataPoint
{
public float Label { get; set; }
[VectorType(3)]
public float[] Features { get; set; }
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
using System;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Data;

namespace Samples.Dynamic
{
/// <summary>
/// Sample showing how to flag missing values in several vector columns with a single estimator.
/// </summary>
public static class IndicateMissingValuesMultiColumn
{
    /// <summary>
    /// Builds a three-row in-memory dataset with two float vector columns, generates a boolean
    /// missing-value indicator column for each of them, and writes the original and indicator
    /// vectors of every row to the console.
    /// </summary>
    public static void Example()
    {
        // The MLContext is the starting point for ML.NET operations; it also provides
        // exception tracking, logging and the source of randomness.
        var mlContext = new MLContext();

        // In-memory rows; some slots are NaN and one is positive infinity.
        var points = new List<DataPoint>
        {
            new DataPoint
            {
                Features1 = new float[] { 1, 1, 0 },
                Features2 = new float[] { 1, 1 }
            },
            new DataPoint
            {
                Features1 = new float[] { 0, float.NaN, 1 },
                Features2 = new float[] { float.NaN, 1 }
            },
            new DataPoint
            {
                Features1 = new float[] { -1, float.NaN, -3 },
                Features2 = new float[] { 1, float.PositiveInfinity }
            },
        };
        var data = mlContext.Data.LoadFromEnumerable(points);

        // IndicateMissingValues produces a boolean that is 'true' wherever the input value is
        // missing; for floats and doubles a missing value is NaN. Passing several
        // InputOutputColumnPair entries handles all listed columns in one pass over the data.
        InputOutputColumnPair[] columnPairs =
        {
            new InputOutputColumnPair("MissingIndicator1", "Features1"),
            new InputOutputColumnPair("MissingIndicator2", "Features2"),
        };
        var estimator = mlContext.Transforms.IndicateMissingValues(columnPairs);

        // Fit and apply the transformation. Nothing is actually computed until the
        // rows are read in the loop below.
        var transformer = estimator.Fit(data);
        var transformedData = transformer.Transform(data);

        // Read the result back as SampleDataTransformed rows (defined below).
        var rows = mlContext.Data.CreateEnumerable<SampleDataTransformed>(transformedData, reuseRowObject: false);

        // Print the source vectors next to their indicator vectors.
        foreach (var row in rows)
        {
            var features1 = string.Join(", ", row.Features1);
            var indicator1 = string.Join(", ", row.MissingIndicator1);
            var features2 = string.Join(", ", row.Features2);
            var indicator2 = string.Join(", ", row.MissingIndicator2);

            Console.WriteLine($"Features1: [{features1}]\t MissingIndicator1: [{indicator1}]\t Features2: [{features2}]\t MissingIndicator2: [{indicator2}]");
        }

        // Expected output:
        //  Features1: [1, 1, 0]     MissingIndicator1: [False, False, False]    Features2: [1, 1]   MissingIndicator2: [False, False]
        //  Features1: [0, NaN, 1]   MissingIndicator1: [False, True, False]     Features2: [NaN, 1] MissingIndicator2: [True, False]
        //  Features1: [-1, NaN, -3] MissingIndicator1: [False, True, False]     Features2: [1, ∞]   MissingIndicator2: [False, False]
    }

    // Schema of the input rows: two fixed-size float vectors.
    private class DataPoint
    {
        [VectorType(3)]
        public float[] Features1 { get; set; }

        [VectorType(2)]
        public float[] Features2 { get; set; }
    }

    // Schema of the transformed rows: the input vectors plus one indicator vector per input column.
    private sealed class SampleDataTransformed : DataPoint
    {
        public bool[] MissingIndicator1 { get; set; }

        public bool[] MissingIndicator2 { get; set; }
    }
}
}
Loading