Skip to content

[WIP] Transformers for AutoML #4157

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions Microsoft.ML.sln
Original file line number Diff line number Diff line change
@@ -273,6 +273,14 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.AutoML.Samples
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Samples.GPU", "docs\samples\Microsoft.ML.Samples.GPU\Microsoft.ML.Samples.GPU.csproj", "{3C8F910B-7F23-4D25-B521-6D5AC9570ADD}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Featurizers", "src\Microsoft.ML.Featurizers\Microsoft.ML.Featurizers.csproj", "{E2DD0721-5B0F-4606-8182-4C7EFB834518}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Microsoft.ML.Featurizers", "Microsoft.ML.Featurizers", "{1BA5C784-52E8-4A87-8525-26B2452F2882}"
ProjectSection(SolutionItems) = preProject
pkg\Microsoft.ML.Featurizers\Microsoft.ML.Featurizers.nupkgproj = pkg\Microsoft.ML.Featurizers\Microsoft.ML.Featurizers.nupkgproj
pkg\Microsoft.ML.Featurizers\Microsoft.ML.Featurizers.symbols.nupkgproj = pkg\Microsoft.ML.Featurizers\Microsoft.ML.Featurizers.symbols.nupkgproj
EndProjectSection
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.CodeGenerator", "src\Microsoft.ML.CodeGenerator\Microsoft.ML.CodeGenerator.csproj", "{56CB0850-7341-4D71-9AE4-9EFC472D93DD}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.CodeGenerator.Tests", "test\Microsoft.ML.CodeGenerator.Tests\Microsoft.ML.CodeGenerator.Tests.csproj", "{46CC5637-3DDF-4100-93FC-44BB87B2DB81}"
@@ -1690,6 +1698,30 @@ Global
{46CC5637-3DDF-4100-93FC-44BB87B2DB81}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU
{46CC5637-3DDF-4100-93FC-44BB87B2DB81}.Release-netfx|x64.ActiveCfg = Release-netfx|Any CPU
{46CC5637-3DDF-4100-93FC-44BB87B2DB81}.Release-netfx|x64.Build.0 = Release-netfx|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Debug|Any CPU.Build.0 = Debug|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Debug|x64.ActiveCfg = Debug|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Debug|x64.Build.0 = Debug|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Debug-netcoreapp3_0|Any CPU.ActiveCfg = Debug-netcoreapp3_0|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Debug-netcoreapp3_0|Any CPU.Build.0 = Debug-netcoreapp3_0|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Debug-netcoreapp3_0|x64.ActiveCfg = Debug-netcoreapp3_0|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Debug-netcoreapp3_0|x64.Build.0 = Debug-netcoreapp3_0|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Debug-netfx|Any CPU.ActiveCfg = Debug-netfx|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Debug-netfx|Any CPU.Build.0 = Debug-netfx|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Debug-netfx|x64.ActiveCfg = Debug-netfx|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Debug-netfx|x64.Build.0 = Debug-netfx|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Release|Any CPU.ActiveCfg = Release|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Release|Any CPU.Build.0 = Release|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Release|x64.ActiveCfg = Release|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Release|x64.Build.0 = Release|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Release-netcoreapp3_0|Any CPU.ActiveCfg = Release-netcoreapp3_0|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Release-netcoreapp3_0|Any CPU.Build.0 = Release-netcoreapp3_0|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Release-netcoreapp3_0|x64.ActiveCfg = Release-netcoreapp3_0|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Release-netcoreapp3_0|x64.Build.0 = Release-netcoreapp3_0|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Release-netfx|x64.ActiveCfg = Release-netfx|Any CPU
{E2DD0721-5B0F-4606-8182-4C7EFB834518}.Release-netfx|x64.Build.0 = Release-netfx|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -1779,6 +1811,8 @@ Global
{56CB0850-7341-4D71-9AE4-9EFC472D93DD} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{46CC5637-3DDF-4100-93FC-44BB87B2DB81} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
{3817A875-278C-4140-BF66-3C4A8CA55F0D} = {D3D38B03-B557-484D-8348-8BADEE4DF592}
{E2DD0721-5B0F-4606-8182-4C7EFB834518} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{1BA5C784-52E8-4A87-8525-26B2452F2882} = {D3D38B03-B557-484D-8348-8BADEE4DF592}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D}
4 changes: 2 additions & 2 deletions build/BranchInfo.props
Original file line number Diff line number Diff line change
@@ -30,12 +30,12 @@
<MajorVersion>1</MajorVersion>
<MinorVersion>4</MinorVersion>
<PatchVersion>0</PatchVersion>
<PreReleaseLabel>preview3</PreReleaseLabel>
<PreReleaseLabel>preview2</PreReleaseLabel>
</PropertyGroup>
<PropertyGroup Condition="'$(IsStableProject)' != 'true'">
<MajorVersion>0</MajorVersion>
<MinorVersion>16</MinorVersion>
<PatchVersion>0</PatchVersion>
<PreReleaseLabel>preview3</PreReleaseLabel>
<PreReleaseLabel>preview2</PreReleaseLabel>
</PropertyGroup>
</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
using System;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Featurizers;

namespace Samples.Dynamic
{
/// <summary>
/// Sample showing how the category imputer transform fills in the missing
/// (NaN) entries of a numeric column.
/// </summary>
public static class CategoryImputer
{
    // Shape of one input row: a single float feature, where NaN marks a missing value.
    private class InputData
    {
        public float Feature1;
    }

    // Shape of one row read back after the transform has been applied.
    private sealed class TransformedData
    {
        public float Feature1 { get; set; }
    }

    /// <summary>
    /// Builds a five-row in-memory dataset containing two NaN entries, runs the
    /// category imputer over the "Feature1" column, and prints every resulting
    /// value to the console.
    /// </summary>
    public static void Example()
    {
        // The ML context is the entry point for ML.NET operations. It can be used for
        // exception tracking and logging, as well as the source of randomness.
        var mlContext = new MLContext();

        // Small in-memory dataset; two of the five rows are missing their value.
        var samples = new List<InputData>
        {
            new InputData { Feature1 = 1f },
            new InputData { Feature1 = float.NaN },
            new InputData { Feature1 = 1f },
            new InputData { Feature1 = float.NaN },
            new InputData { Feature1 = 9f },
        };

        // Expose the list as an IDataView so the pipeline can consume it.
        var dataview = mlContext.Data.LoadFromEnumerable(samples);

        // Pipeline that fills in the missing values of the Feature1 column.
        var pipeline = mlContext.Transforms.CatagoryImputerTransformer("Feature1");

        // Fit on the data, then apply the fitted model to that same data.
        var model = pipeline.Fit(dataview);
        var transformedData = model.Transform(dataview);

        // Read the transformed rows back as strongly typed objects. The NaN values
        // should have been replaced with the most frequent value, 1.
        var featuresColumn = mlContext.Data.CreateEnumerable<TransformedData>(
            transformedData, reuseRowObject: false);

        // Print every transformed value, one per line.
        Console.WriteLine("Features column obtained post-transformation.");
        foreach (var row in featuresColumn)
        {
            Console.WriteLine(row.Feature1);
        }

        // Expected output:
        //  Features column obtained post-transformation.
        //  1
        //  1
        //  1
        //  1
        //  9
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
using System;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Featurizers;

namespace Samples.Dynamic
{
/// <summary>
/// Sample showing how the DateTimeTransformer featurizer expands one date
/// column into a set of individual date/time columns.
/// </summary>
public static class DateTimeTransformer
{
    // Shape of one input row: the date stored as a long.
    private class DateTimeInput
    {
        public long Date;
    }

    // Shape of one transformed row. Every generated column starts with "DTC"
    // because that is the prefix passed to the transformer in Example().
    private sealed class TransformedData
    {
        public long Date { get; set; }
        public int DTCYear { get; set; }
        public byte DTCMonth { get; set; }
        public byte DTCDay { get; set; }
        public byte DTCHour { get; set; }
        public byte DTCMinute { get; set; }
        public byte DTCSecond { get; set; }
        public byte DTCAmPm { get; set; }
        public byte DTCHour12 { get; set; }
        public byte DTCDayOfWeek { get; set; }
        public byte DTCDayOfQuarter { get; set; }
        public ushort DTCDayOfYear { get; set; }
        public ushort DTCWeekOfMonth { get; set; }
        public byte DTCQuarterOfYear { get; set; }
        public byte DTCHalfOfYear { get; set; }
        public byte DTCWeekIso { get; set; }
        public int DTCYearIso { get; set; }
        public string DTCMonthLabel { get; set; }
        public string DTCAmPmLabel { get; set; }
        public string DTCDayOfWeekLabel { get; set; }
        public string DTCHolidayName { get; set; }
        public byte DTCIsPaidTimeOff { get; set; }
    }

    /// <summary>
    /// Runs the DateTimeTransformer over a single-row dataset and prints the
    /// original date followed by each generated column, comma separated.
    /// </summary>
    public static void Example()
    {
        // The ML context is the entry point for ML.NET operations. It can be used for
        // exception tracking and logging, as well as the source of randomness.
        var mlContext = new MLContext();

        // Single-row dataset holding a future date: 2025 June 30.
        var samples = new[] { new DateTimeInput { Date = 1751241600 } };

        // Expose the array as an IDataView so the pipeline can consume it.
        var dataview = mlContext.Data.LoadFromEnumerable(samples);

        // Pipeline that splits the "Date" column into individual columns, each
        // named with the "DTC" prefix.
        var pipeline = mlContext.Transforms.DateTimeTransformer("Date", "DTC");

        // Fit on the data, then apply the fitted model to that same data.
        var model = pipeline.Fit(dataview);
        var transformedData = model.Transform(dataview);

        // Read the rows back as strongly typed objects. There should now be 21
        // additional columns, each carrying one piece of the date/time information.
        var featuresColumn = mlContext.Data.CreateEnumerable<TransformedData>(
            transformedData, reuseRowObject: false);

        // Print each row as one comma-separated line.
        Console.WriteLine("Features column obtained post-transformation.");
        foreach (var row in featuresColumn)
        {
            var line = string.Join(", ",
                row.Date, row.DTCYear, row.DTCMonth, row.DTCDay,
                row.DTCHour, row.DTCMinute, row.DTCSecond,
                row.DTCAmPm, row.DTCHour12,
                row.DTCDayOfWeek, row.DTCDayOfQuarter, row.DTCDayOfYear,
                row.DTCWeekOfMonth, row.DTCQuarterOfYear, row.DTCHalfOfYear,
                row.DTCWeekIso, row.DTCYearIso,
                row.DTCMonthLabel, row.DTCAmPmLabel, row.DTCDayOfWeekLabel,
                row.DTCHolidayName, row.DTCIsPaidTimeOff);
            Console.WriteLine(line);
        }

        // Expected output:
        //  Features column obtained post-transformation.
        //  1751241600, 2025, 6, 30, 0, 0, 0, 0, 0, 1, 91, 180, 4, 2, 1, 27, 2025, June, am, Monday, , 0
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
using System;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Featurizers;

namespace Samples.Dynamic
{
/// <summary>
/// Sample showing how to run the DateTimeTransformer featurizer while asking
/// it to drop some of the columns it would otherwise produce.
/// </summary>
public static class DateTimeTransformerDropColumns
{
    // Shape of one input row: the date stored as a long.
    private class DateTimeInput
    {
        public long Date;
    }

    /// <summary>
    /// Runs the DateTimeTransformer over a single-row dataset with five columns
    /// dropped, then prints the original date followed by each remaining
    /// generated column, comma separated.
    /// </summary>
    public static void Example()
    {
        // Create a new ML context, for ML.NET operations. It can be used for
        // exception tracking and logging, as well as the source of randomness.
        var mlContext = new MLContext();

        // Create a small dataset as an IEnumerable.
        // Future Date - 2025 June 30
        var samples = new[] { new DateTimeInput() { Date = 1751241600 } };

        // Convert training data to IDataView.
        var dataview = mlContext.Data.LoadFromEnumerable(samples);

        // A pipeline for splitting the time features into individual columns.
        // All the columns listed here will be dropped.
        var pipeline = mlContext.Transforms.DateTimeTransformer("Date", "DTC",
            DateTimeTransformerEstimator.ColumnsProduced.IsPaidTimeOff,
            DateTimeTransformerEstimator.ColumnsProduced.Day,
            DateTimeTransformerEstimator.ColumnsProduced.QuarterOfYear,
            DateTimeTransformerEstimator.ColumnsProduced.AmPm,
            DateTimeTransformerEstimator.ColumnsProduced.HolidayName);

        // The transformed data.
        var transformedData = pipeline.Fit(dataview).Transform(dataview);

        // Now let's take a look at what this did. We should have created 16 more columns with
        // the DateTime information split into its own columns (the 5 dropped ones are absent).
        var featuresColumn = mlContext.Data.CreateEnumerable<TransformedData>(
            transformedData, reuseRowObject: false);

        // And we can write out a few rows. Only the columns that were kept are printed,
        // in the same order as the properties of TransformedData.
        Console.WriteLine("Features column obtained post-transformation.");
        foreach (var featureRow in featuresColumn)
        {
            Console.WriteLine(string.Join(", ",
                featureRow.Date, featureRow.DTCYear, featureRow.DTCMonth,
                featureRow.DTCHour, featureRow.DTCMinute, featureRow.DTCSecond,
                featureRow.DTCHour12, featureRow.DTCDayOfWeek, featureRow.DTCDayOfQuarter,
                featureRow.DTCDayOfYear, featureRow.DTCWeekOfMonth, featureRow.DTCHalfOfYear,
                featureRow.DTCWeekIso, featureRow.DTCYearIso, featureRow.DTCMonthLabel,
                featureRow.DTCAmPmLabel, featureRow.DTCDayOfWeekLabel));
        }

        // Expected output (17 values: Date plus the 16 columns that were kept):
        //  Features column obtained post-transformation.
        //  1751241600, 2025, 6, 0, 0, 0, 0, 1, 91, 180, 4, 1, 27, 2025, June, am, Monday
    }

    // These columns start with DTC because that is the prefix we picked.
    // Day, AmPm, QuarterOfYear, HolidayName and IsPaidTimeOff are dropped by the
    // pipeline in Example(), so they have no property here.
    private sealed class TransformedData
    {
        public long Date { get; set; }
        public int DTCYear { get; set; }
        public byte DTCMonth { get; set; }
        public byte DTCHour { get; set; }
        public byte DTCMinute { get; set; }
        public byte DTCSecond { get; set; }
        public byte DTCHour12 { get; set; }
        public byte DTCDayOfWeek { get; set; }
        public byte DTCDayOfQuarter { get; set; }
        public ushort DTCDayOfYear { get; set; }
        public ushort DTCWeekOfMonth { get; set; }
        public byte DTCHalfOfYear { get; set; }
        public byte DTCWeekIso { get; set; }
        public int DTCYearIso { get; set; }
        public string DTCMonthLabel { get; set; }
        public string DTCAmPmLabel { get; set; }
        public string DTCDayOfWeekLabel { get; set; }
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
using System;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Featurizers;

namespace Samples.Dynamic
{
/// <summary>
/// Sample showing how the robust scaler transform centers and scales a
/// numeric column.
/// </summary>
public static class RobustScaler
{
    /// <summary>
    /// Builds a five-row in-memory dataset, runs the robust scaler over the
    /// "Feature1" column, and prints every resulting value to the console.
    /// </summary>
    public static void Example()
    {
        // Create a new ML context, for ML.NET operations. It can be used for
        // exception tracking and logging, as well as the source of randomness.
        var mlContext = new MLContext();

        // Create a small dataset as an IEnumerable.
        var samples = new List<InputData>()
        {
            new InputData(){ Feature1 = 1f },

            new InputData(){ Feature1 = 3f },

            new InputData(){ Feature1 = 5f },

            new InputData(){ Feature1 = 7f },

            new InputData(){ Feature1 = 9f },
        };

        // Convert training data to IDataView.
        var dataview = mlContext.Data.LoadFromEnumerable(samples);

        // A pipeline for centering and scaling the Feature1 column.
        var pipeline = mlContext.Transforms.RobustScalerTransformer("Feature1");

        // The transformed data.
        var transformedData = pipeline.Fit(dataview).Transform(dataview);

        // Now let's take a look at what this did. The values should be centered around 0 and scaled.
        // We can extract the newly created columns as an IEnumerable of TransformedData.
        var featuresColumn = mlContext.Data.CreateEnumerable<TransformedData>(
            transformedData, reuseRowObject: false);

        // And we can write out a few rows.
        Console.WriteLine("Features column obtained post-transformation.");
        foreach (var featureRow in featuresColumn)
        {
            Console.WriteLine(featureRow.Feature1);
        }

        // Expected output (Console.WriteLine formats a float with a leading zero;
        // the decimal separator shown assumes a culture that uses '.'):
        //  Features column obtained post-transformation.
        //  -1
        //  -0.5
        //  0
        //  0.5
        //  1
    }

    // Shape of one input row: a single float feature.
    private class InputData
    {
        public float Feature1;
    }

    // Shape of one row read back after the transform has been applied.
    private sealed class TransformedData
    {
        public float Feature1 { get; set; }
    }
}
}
Loading