-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Add sample for IndicateMissingValues #2814
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
using System; | ||
using System.Linq; | ||
using Microsoft.ML.Data; | ||
|
||
namespace Microsoft.ML.Samples.Dynamic | ||
{ | ||
/// <summary> | ||
/// Sample showing how IndicateMissingValues flags the missing (NaN) entries of a column, and how | ||
/// that flag can then drive a custom replacement of the missing values. | ||
/// </summary> | ||
public static class IndicateMissingValues | ||
{ | ||
/// <summary> | ||
/// Loads the breast-cancer dataset, adds a boolean "MissingIndicator" column for the "Missing" column, | ||
/// builds a two-element "MissingReplaced" vector from both, and prints five of the transformed rows. | ||
/// </summary> | ||
public static void Example() | ||
{ | ||
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, | ||
// as well as the source of randomness. | ||
var mlContext = new MLContext(); | ||
|
||
// Create a TextLoader for the breast-cancer dataset, and load it into an IDataView. | ||
// Source column 0 is the label and column 6 is loaded on its own as "Missing"; the column | ||
// ranges 1-5 and 7-9 are loaded as "Features1" and "Features2". | ||
var loader = mlContext.Data.CreateTextLoader(new[] | ||
{ | ||
new TextLoader.Column("Label", DataKind.Single, 0), | ||
new TextLoader.Column("Features1", DataKind.Single, 1, 5), | ||
new TextLoader.Column("Features2", DataKind.Single, 7, 9), | ||
new TextLoader.Column("Missing", DataKind.Single, 6) | ||
}); | ||
var file = SamplesUtils.DatasetUtils.DownloadBreastCancerDataset(); | ||
var data = loader.Load(new MultiFileSource(file)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think we want to move forward with just a small in-memory IEnumerable for the data, and skip the TextLoader and the real datasets. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can find a sample with small in-memory dataset in this comments: #2726 (comment) In reply to: 261707265 [](ancestors = 261707265) |
||
|
||
// Preview of the data. Some rows have a missing value (shown as '?') in column 6. | ||
// | ||
// 0 5 1 1 1 2 1 3 1 1 | ||
// 0 5 4 4 5 7 10 3 2 1 | ||
// 0 3 1 1 1 2 2 3 1 1 | ||
// 0 6 8 8 1 3 4 3 7 1 | ||
// 0 4 1 1 3 2 1 3 1 1 | ||
// ... | ||
// 1 8 4 5 1 2 ? 7 3 1 | ||
|
||
// IndicateMissingValues is used to create a boolean column containing | ||
// 'true' where the value in the input column is NaN. This indicator can then be used | ||
// to replace missing values with other values. | ||
// In this example, we replace the missing value with the vector (1, -1), and non-missing | ||
// values with the vector (0, x) (where x is the value in the input column). | ||
IEstimator<ITransformer> pipeline = mlContext.Transforms.IndicateMissingValues("MissingIndicator", "Missing"); | ||
// The custom mapping reads Missing and MissingIndicator from each row (MissingValue) and writes | ||
// the two-element MissingReplaced array (ReplacedMissingValue). | ||
// NOTE(review): the trailing null argument is presumably the mapping's contract name - confirm. | ||
pipeline = pipeline.Append(mlContext.Transforms.CustomMapping<MissingValue, ReplacedMissingValue>( | ||
(m, r) => r.MissingReplaced = new float[2] { m.MissingIndicator ? 1 : 0, m.MissingIndicator ? -1 : m.Missing }, null)); | ||
// Combine both feature columns and the replaced-missing-value vector into a single "Features" column. | ||
pipeline = pipeline.Append(mlContext.Transforms.Concatenate("Features", "Features1", "Features2", "MissingReplaced")); | ||
|
||
// Now we can transform the data and look at the output to confirm the behavior of IndicateMissingValues. | ||
// Don't forget that this operation doesn't actually evaluate data until we read the data below. | ||
var model = pipeline.Fit(data); | ||
var transformedData = model.Transform(data); | ||
|
||
// We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. | ||
var rowEnumerable = mlContext.Data.CreateEnumerable<SampleDataTransformed>(transformedData, reuseRowObject: false); | ||
|
||
// And finally, we can write out the rows of the dataset, looking at the columns of interest. | ||
Console.WriteLine($"Missing, MissingIndicator, and MissingReplaced columns obtained post-transformation."); | ||
// Only five rows are printed (skip 20, take 5); per the expected output below, one of them has a missing value. | ||
foreach (var row in rowEnumerable.Skip(20).Take(5)) | ||
{ | ||
Console.WriteLine($"Missing: {row.Missing} MissingIndicator: {row.MissingIndicator} MissingReplaced: ({row.MissingReplaced[0]}, {row.MissingReplaced[1]})"); | ||
} | ||
|
||
// Expected output: | ||
// Missing, MissingIndicator, and MissingReplaced columns obtained post-transformation. | ||
// Missing: 10 MissingIndicator: False MissingReplaced: (0, 10) | ||
// Missing: 7 MissingIndicator: False MissingReplaced: (0, 7) | ||
// Missing: 1 MissingIndicator: False MissingReplaced: (0, 1) | ||
// Missing: NaN MissingIndicator: True MissingReplaced: (1, -1) | ||
// Missing: 1 MissingIndicator: False MissingReplaced: (0, 1) | ||
} | ||
|
||
/// <summary> | ||
/// Input type of the custom mapping in Example: the raw "Missing" value together with the | ||
/// "MissingIndicator" flag added by IndicateMissingValues. | ||
/// </summary> | ||
private class MissingValue | ||
{ | ||
public float Missing { get; set; } | ||
public bool MissingIndicator { get; set; } | ||
} | ||
|
||
/// <summary> | ||
/// Output type of the custom mapping in Example: the replacement vector written for each row. | ||
/// </summary> | ||
private class ReplacedMissingValue | ||
{ | ||
// Two elements, matching the float[2] built by the custom mapping in Example. | ||
[VectorType(2)] | ||
public float[] MissingReplaced { get; set; } | ||
} | ||
|
||
/// <summary> | ||
/// Row type used with CreateEnumerable in Example to read back the columns that are printed. | ||
/// </summary> | ||
private class SampleDataTransformed | ||
{ | ||
public float Missing { get; set; } | ||
public bool MissingIndicator { get; set; } | ||
public float[] MissingReplaced { get; set; } | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please link this sample from the extension method it documents.
See, for example:
https://github.com/dotnet/machinelearning/blob/master/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs#L83
Something like:
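/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[IndicateMissingValues](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/IndicateMissingValues.cs)]
/// ]]></format>
/// </example>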