Skip to content

Conversion of NAIndicatorTransform to estimator with related pigstensions #1217

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Oct 20, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
e0c85cd
started conversion work
artidoro Oct 5, 2018
9e23955
started conversion work
artidoro Oct 5, 2018
11b7dd9
Merge branch 'master' of https://github.com/dotnet/machinelearning in…
artidoro Oct 8, 2018
e0e7d3b
wrote a first version of the conversion, need to debug it and make su…
artidoro Oct 10, 2018
593f7d5
Merge branch 'master' of https://github.com/dotnet/machinelearning in…
artidoro Oct 10, 2018
9c1e758
finished debugging tests
artidoro Oct 10, 2018
5e20acd
Merge branch 'master' of https://github.com/dotnet/machinelearning in…
artidoro Oct 10, 2018
4062bed
cleanup
artidoro Oct 10, 2018
76a893a
fixing an issue
artidoro Oct 11, 2018
8f331fd
fixed review comments
artidoro Oct 16, 2018
fe94d30
fixed some review comments and added a test
artidoro Oct 16, 2018
122d48c
fixed entrypointcatalog test
artidoro Oct 16, 2018
ef8c999
Merge branch 'master' of https://github.com/dotnet/machinelearning in…
artidoro Oct 16, 2018
28e9570
propagated metadata
artidoro Oct 17, 2018
f720a75
fixed review comments
artidoro Oct 18, 2018
90784fb
Merge branch 'master' of https://github.com/dotnet/machinelearning in…
artidoro Oct 18, 2018
5d4b6bc
fixed review comments
artidoro Oct 18, 2018
dd49b4b
Merge branch 'master' into naindicator
artidoro Oct 19, 2018
0f4507f
removed unused object
artidoro Oct 19, 2018
25c9b9e
Merge branch 'naindicator' of https://github.com/artidoro/machinelear…
artidoro Oct 19, 2018
4a6a5d3
Merge branch 'master' of https://github.com/dotnet/machinelearning in…
artidoro Oct 19, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/Microsoft.ML.Transforms/NAHandleTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
using Microsoft.ML.Runtime.Data.Conversion;
using Microsoft.ML.Runtime.EntryPoints;
using Microsoft.ML.Runtime.Internal.Utilities;
using Microsoft.ML.Transforms;

[assembly: LoadableClass(NAHandleTransform.Summary, typeof(IDataTransform), typeof(NAHandleTransform), typeof(NAHandleTransform.Arguments), typeof(SignatureDataTransform),
NAHandleTransform.FriendlyName, "NAHandleTransform", NAHandleTransform.ShortName, "NA", DocName = "transform/NAHandle.md")]
Expand Down Expand Up @@ -212,7 +213,7 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV

// Create the indicator columns.
if (naIndicatorCols.Count > 0)
output = new NAIndicatorTransform(h, new NAIndicatorTransform.Arguments() { Column = naIndicatorCols.ToArray() }, input);
output = NAIndicatorTransform.Create(h, new NAIndicatorTransform.Arguments() { Column = naIndicatorCols.ToArray() }, input);

// Convert the indicator columns to the correct type so that they can be concatenated to the NAReplace outputs.
if (naConvCols.Count > 0)
Expand Down
3 changes: 2 additions & 1 deletion src/Microsoft.ML.Transforms/NAHandling.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.EntryPoints;
using Microsoft.ML.Transforms;

[assembly: EntryPointModule(typeof(NAHandling))]

Expand Down Expand Up @@ -71,7 +72,7 @@ public static CommonOutputs.TransformOutput Handle(IHostEnvironment env, NAHandl
public static CommonOutputs.TransformOutput Indicator(IHostEnvironment env, NAIndicatorTransform.Arguments input)
{
var h = EntryPointUtils.CheckArgsAndCreateHost(env, "NAIndicator", input);
var xf = new NAIndicatorTransform(h, input, input.Data);
var xf = new NAIndicatorTransform(h, input).Transform(input.Data);
return new CommonOutputs.TransformOutput()
{
Model = new TransformModel(h, xf, input.Data),
Expand Down
728 changes: 477 additions & 251 deletions src/Microsoft.ML.Transforms/NAIndicatorTransform.cs

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ Transforms.ManyHeterogeneousModelCombiner Combines a sequence of TransformModels
Transforms.MeanVarianceNormalizer Normalizes the data based on the computed mean and variance of the data. Microsoft.ML.Runtime.Data.Normalize MeanVar Microsoft.ML.Runtime.Data.NormalizeTransform+MeanVarArguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
Transforms.MinMaxNormalizer Normalizes the data based on the observed minimum and maximum values of the data. Microsoft.ML.Runtime.Data.Normalize MinMax Microsoft.ML.Runtime.Data.NormalizeTransform+MinMaxArguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
Transforms.MissingValueHandler Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). An indicator column can optionally be concatenated, if theinput column type is numeric. Microsoft.ML.Runtime.Data.NAHandling Handle Microsoft.ML.Runtime.Data.NAHandleTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
Transforms.MissingValueIndicator Create a boolean output column with the same number of slots as the input column, where the output value is true if the value in the input column is missing. Microsoft.ML.Runtime.Data.NAHandling Indicator Microsoft.ML.Runtime.Data.NAIndicatorTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
Transforms.MissingValueIndicator Create a boolean output column with the same number of slots as the input column, where the output value is true if the value in the input column is missing. Microsoft.ML.Runtime.Data.NAHandling Indicator Microsoft.ML.Transforms.NAIndicatorTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
Transforms.MissingValuesDropper Removes NAs from vector columns. Microsoft.ML.Runtime.Data.NAHandling Drop Microsoft.ML.Runtime.Data.NADropTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
Transforms.MissingValuesRowDropper Filters out rows that contain missing values. Microsoft.ML.Runtime.Data.NAHandling Filter Microsoft.ML.Runtime.Data.NAFilter+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
Transforms.MissingValueSubstitutor Create an output column of the same type and size of the input column, where missing values are replaced with either the default value or the mean/min/max value (for non-text columns only). Microsoft.ML.Runtime.Data.NAHandling Replace Microsoft.ML.Runtime.Data.NAReplaceTransform+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
Expand Down
17 changes: 17 additions & 0 deletions test/BaselineOutput/SingleDebug/NAIndicator/featurized.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=ScalarFloat:R4:0
#@ col=ScalarDouble:R8:1
#@ col=VectorFloat:R4:2-5
#@ col=VectorDoulbe:R8:6-9
#@ col=A:BL:10
#@ col=B:BL:11
#@ col=C:BL:12-15
#@ col=D:BL:16-19
#@ }
ScalarFloat ScalarDouble 18 8:A 9:B
5 5 5 1 1 1 5 1 1 1 10 0:0
5 5 5 4 4 5 5 4 4 5 10 0:0
3 3 3 1 1 1 3 1 1 1 10 0:0
6 6 6 8 8 1 6 8 8 1 10 0:0
17 changes: 17 additions & 0 deletions test/BaselineOutput/SingleRelease/NAIndicator/featurized.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=ScalarFloat:R4:0
#@ col=ScalarDouble:R8:1
#@ col=VectorFloat:R4:2-5
#@ col=VectorDoulbe:R8:6-9
#@ col=A:BL:10
#@ col=B:BL:11
#@ col=C:BL:12-15
#@ col=D:BL:16-19
#@ }
ScalarFloat ScalarDouble 18 8:A 9:B
5 5 5 1 1 1 5 1 1 1 10 0:0
5 5 5 4 4 5 5 4 4 5 10 0:0
3 3 3 1 1 1 3 1 1 1 10 0:0
6 6 6 8 8 1 6 8 8 1 10 0:0
46 changes: 45 additions & 1 deletion test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Data;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Data.IO;
using Microsoft.ML.Runtime.Internal.Utilities;
Expand Down Expand Up @@ -772,6 +773,50 @@ public void PrincipalComponentAnalysis()
Assert.True(type.IsVector && type.IsKnownSizeVector && type.ItemType.IsNumber);
}

/// <summary>
/// Exercises the statically-typed <c>IsMissingValue</c> extension on scalar and vector
/// float/double columns loaded from breast-cancer.txt, and asserts that none of the
/// inspected values in the first rows is reported as missing.
/// </summary>
[Fact]
public void NAIndicatorStatic()
{
    // Number of rows kept by the take filter and inspected below.
    const int rowCount = 4;
    // Number of slots inspected in each vector column (columns are loaded with LoadFloat/LoadDouble(1, 4)).
    const int slotCount = 4;

    var env = new ConsoleEnvironment(seed: 0);

    string dataPath = GetDataPath("breast-cancer.txt");
    var reader = TextLoader.CreateReader(env, ctx => (
        ScalarFloat: ctx.LoadFloat(1),
        ScalarDouble: ctx.LoadDouble(1),
        VectorFloat: ctx.LoadFloat(1, 4),
        VectorDouble: ctx.LoadDouble(1, 4)
    ));

    var data = reader.Read(new MultiFileSource(dataPath));

    var est = data.MakeNewEstimator().
        Append(row => (
            A: row.ScalarFloat.IsMissingValue(),
            B: row.ScalarDouble.IsMissingValue(),
            C: row.VectorFloat.IsMissingValue(),
            D: row.VectorDouble.IsMissingValue()
        ));

    IDataView newData = TakeFilter.Create(env, est.Fit(data).Transform(data).AsDynamic, rowCount);
    Assert.NotNull(newData);

    bool[] scalarFloatMissing = newData.GetColumn<bool>(env, "A").ToArray();
    bool[] scalarDoubleMissing = newData.GetColumn<bool>(env, "B").ToArray();
    bool[][] vectorFloatMissing = newData.GetColumn<bool[]>(env, "C").ToArray();
    bool[][] vectorDoubleMissing = newData.GetColumn<bool[]>(env, "D").ToArray();

    Assert.NotNull(scalarFloatMissing);
    Assert.NotNull(scalarDoubleMissing);
    Assert.NotNull(vectorFloatMissing);
    Assert.NotNull(vectorDoubleMissing);

    for (int i = 0; i < rowCount; i++)
    {
        // Assert each column separately so a failure identifies the offending column.
        Assert.False(scalarFloatMissing[i]);
        Assert.False(scalarDoubleMissing[i]);
        Assert.NotNull(vectorFloatMissing[i]);
        Assert.NotNull(vectorDoubleMissing[i]);
        for (int j = 0; j < slotCount; j++)
        {
            Assert.False(vectorFloatMissing[i][j]);
            Assert.False(vectorDoubleMissing[i][j]);
        }
    }
}

[Fact]
public void TextNormalizeStatic()
{
Expand Down Expand Up @@ -810,7 +855,6 @@ public void TextNormalizeStatic()
Assert.True(schema.TryGetColumnIndex("norm_NoNumbers", out int numbers));
type = schema.GetColumnType(numbers);
Assert.True(!type.IsVector && type.ItemType.IsText);

}
}
}
4 changes: 2 additions & 2 deletions test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ public partial class TrainerEstimators
public void TestEstimatorLogisticRegression()
{
(IEstimator<ITransformer> pipe, IDataView dataView) = GetBinaryClassificationPipeline();
pipe.Append(new LogisticRegression(Env, "Features", "Label"));
pipe = pipe.Append(new LogisticRegression(Env, "Features", "Label"));
TestEstimatorCore(pipe, dataView);
Done();
}
Expand All @@ -24,7 +24,7 @@ public void TestEstimatorLogisticRegression()
public void TestEstimatorMulticlassLogisticRegression()
{
(IEstimator<ITransformer> pipe, IDataView dataView) = GetMultiClassPipeline();
pipe.Append(new MulticlassLogisticRegression(Env, "Features", "Label"));
pipe = pipe.Append(new MulticlassLogisticRegression(Env, "Features", "Label"));
TestEstimatorCore(pipe, dataView);
Done();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ public partial class TrainerEstimators
public void TestEstimatorSymSgdClassificationTrainer()
{
(var pipe, var dataView) = GetBinaryClassificationPipeline();
pipe.Append(new SymSgdClassificationTrainer(Env, "Features", "Label"));
pipe = pipe.Append(new SymSgdClassificationTrainer(Env, "Features", "Label"));
TestEstimatorCore(pipe, dataView);
Done();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ public void KMeansEstimator()
public void TestEstimatorHogwildSGD()
{
(IEstimator<ITransformer> pipe, IDataView dataView) = GetBinaryClassificationPipeline();
pipe.Append(new StochasticGradientDescentClassificationTrainer(Env, "Features", "Label"));
pipe = pipe.Append(new StochasticGradientDescentClassificationTrainer(Env, "Features", "Label"));
Copy link
Contributor

@Zruty0 Zruty0 Oct 18, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pipe [](start = 19, length = 4)

nice catch :) #Resolved

TestEstimatorCore(pipe, dataView);
Done();
}
Expand All @@ -96,7 +96,7 @@ public void TestEstimatorHogwildSGD()
public void TestEstimatorMultiClassNaiveBayesTrainer()
{
(IEstimator<ITransformer> pipe, IDataView dataView) = GetMultiClassPipeline();
pipe.Append(new MultiClassNaiveBayesTrainer(Env, "Features", "Label"));
pipe = pipe.Append(new MultiClassNaiveBayesTrainer(Env, "Features", "Label"));
TestEstimatorCore(pipe, dataView);
Done();
}
Expand Down
148 changes: 148 additions & 0 deletions test/Microsoft.ML.Tests/Transformers/NAIndicatorTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Runtime.Api;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Data.IO;
using Microsoft.ML.Runtime.Model;
using Microsoft.ML.Runtime.RunTests;
using Microsoft.ML.Runtime.Tools;
using Microsoft.ML.Transforms;
using System;
using System.IO;
using Xunit;
using Xunit.Abstractions;

namespace Microsoft.ML.Tests.Transformers
{
public class NAIndicatorTests : TestDataPipeBase
{
// In-memory row type used by the tests in this class to build input data views:
// one scalar and one fixed-size (2-slot) vector column for each of float and double.
private class TestClass
{
// Scalar single-precision column.
public float A;
// Scalar double-precision column.
public double B;
// Single-precision vector column, declared with a fixed size of 2.
[VectorType(2)]
public float[] C;
// Double-precision vector column, declared with a fixed size of 2.
[VectorType(2)]
public double[] D;
}

/// <summary>
/// Creates the test fixture, forwarding the xUnit output helper to the base test class.
/// </summary>
/// <param name="output">The xUnit output sink passed through to <c>TestDataPipeBase</c>.</param>
public NAIndicatorTests(ITestOutputHelper output)
    : base(output)
{
}

/// <summary>
/// Runs the shared estimator workout (<c>TestEstimatorCore</c>) for an
/// <c>NAIndicatorEstimator</c> over scalar and vector float/double columns whose rows
/// contain ordinary values, NaN and both infinities.
/// </summary>
[Fact]
public void NAIndicatorWorkout()
{
    var rows = new[]
    {
        new TestClass
        {
            A = 1,
            B = 3,
            C = new float[2] { 1, 2 },
            D = new double[2] { 3, 4 }
        },
        new TestClass
        {
            A = float.NaN,
            B = double.NaN,
            C = new float[2] { float.NaN, float.NaN },
            D = new double[2] { double.NaN, double.NaN }
        },
        new TestClass
        {
            A = float.NegativeInfinity,
            B = double.NegativeInfinity,
            C = new float[2] { float.NegativeInfinity, float.NegativeInfinity },
            D = new double[2] { double.NegativeInfinity, double.NegativeInfinity }
        },
        new TestClass
        {
            A = float.PositiveInfinity,
            B = double.PositiveInfinity,
            C = new float[2] { float.PositiveInfinity, float.PositiveInfinity },
            D = new double[2] { double.PositiveInfinity, double.PositiveInfinity }
        },
        new TestClass
        {
            A = 2,
            B = 1,
            C = new float[2] { 3, 4 },
            D = new double[2] { 5, 6 }
        },
    };

    var dataView = ComponentCreation.CreateDataView(Env, rows);

    // One (input, output) pair per column of TestClass.
    var columnPairs = new (string input, string output)[]
    {
        ("A", "NAA"),
        ("B", "NAB"),
        ("C", "NAC"),
        ("D", "NAD"),
    };
    var estimator = new NAIndicatorEstimator(Env, columnPairs);

    TestEstimatorCore(estimator, dataView);
    Done();
}

/// <summary>
/// Checks that a MAML <c>showschema</c> command line that includes the NAIndicator
/// transform makes <c>Maml.Main</c> return 0.
/// </summary>
[Fact]
public void TestCommandLine()
{
    // xUnit's Assert.Equal takes (expected, actual); keeping that order makes the
    // failure message report the values the right way round.
    Assert.Equal(0, Maml.Main(new[] { @"showschema loader=Text{col=A:R4:0} xf=NAIndicator{col=B:A} in=f:\2.txt" }));
}

[Fact]
public void TestOldSavingAndLoading()
Copy link
Contributor

@Ivanidzo4ka Ivanidzo4ka Oct 16, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a small metadata test? #Closed

{
    // Fits an NAIndicatorEstimator, saves the transformed data's pipeline through the
    // TrainUtils.SaveModel path into a memory stream, and reloads the transforms from it.
    var data = new[]
    {
        new TestClass() { A = 1, B = 3, C = new float[2] { 1, 2 }, D = new double[2] { 3, 4 } },
        new TestClass() { A = float.NaN, B = double.NaN, C = new float[2] { float.NaN, float.NaN }, D = new double[2] { double.NaN, double.NaN } },
        new TestClass() { A = float.NegativeInfinity, B = double.NegativeInfinity, C = new float[2] { float.NegativeInfinity, float.NegativeInfinity }, D = new double[2] { double.NegativeInfinity, double.NegativeInfinity } },
        new TestClass() { A = float.PositiveInfinity, B = double.PositiveInfinity, C = new float[2] { float.PositiveInfinity, float.PositiveInfinity }, D = new double[2] { double.PositiveInfinity, double.PositiveInfinity } },
        new TestClass() { A = 2, B = 1, C = new float[2] { 3, 4 }, D = new double[2] { 5, 6 } },
    };

    var dataView = ComponentCreation.CreateDataView(Env, data);
    var pipe = new NAIndicatorEstimator(Env,
        new (string input, string output)[] { ("A", "NAA"), ("B", "NAB"), ("C", "NAC"), ("D", "NAD") });
    var result = pipe.Fit(dataView).Transform(dataView);
    var resultRoles = new RoleMappedData(result);
    using (var ms = new MemoryStream())
    {
        // Dispose the channel once saving is finished, the same way the "save" channel
        // is scoped in NAIndicatorFileOutput, instead of leaving it open.
        using (var ch = Env.Start("saving"))
            TrainUtils.SaveModel(Env, ch, ms, null, resultRoles);

        // Rewind so the loader reads the model from the beginning of the stream.
        ms.Position = 0;
        var loadedView = ModelFileUtils.LoadTransforms(Env, dataView, ms);
        Assert.NotNull(loadedView);
    }
}

/// <summary>
/// Runs the estimator workout on breast-cancer.txt (with a deliberately mismatched
/// data view as the invalid input), then writes the first four transformed rows to
/// NAIndicator/featurized.tsv and compares that file against the baseline.
/// </summary>
[Fact]
public void NAIndicatorFileOutput()
{
    string dataPath = GetDataPath("breast-cancer.txt");

    // Column names (including the "VectorDoulbe" spelling) are kept as-is because they
    // are written into the output file that is compared with the checked-in baseline.
    var reader = TextLoader.CreateReader(Env, ctx => (
        ScalarFloat: ctx.LoadFloat(1),
        ScalarDouble: ctx.LoadDouble(1),
        VectorFloat: ctx.LoadFloat(1, 4),
        VectorDoulbe: ctx.LoadDouble(1, 4)
    ));
    var data = reader.Read(new MultiFileSource(dataPath)).AsDynamic;

    // A view built from TestClass has columns A-D and none of the input columns the estimator asks for.
    var wrongCollection = new[]
    {
        new TestClass() { A = 1, B = 3, C = new float[2] { 1, 2 }, D = new double[2] { 3, 4 } }
    };
    var invalidData = ComponentCreation.CreateDataView(Env, wrongCollection);

    var columnPairs = new (string input, string output)[]
    {
        ("ScalarFloat", "A"),
        ("ScalarDouble", "B"),
        ("VectorFloat", "C"),
        ("VectorDoulbe", "D"),
    };
    var est = new NAIndicatorEstimator(Env, columnPairs);

    TestEstimatorCore(est, data, invalidInput: invalidData);

    var outputPath = GetOutputPath("NAIndicator", "featurized.tsv");
    using (var ch = Env.Start("save"))
    {
        var saverArgs = new TextSaver.Arguments { Silent = true };
        var saver = new TextSaver(Env, saverArgs);
        var transformed = est.Fit(data).Transform(data);
        IDataView savedData = TakeFilter.Create(Env, transformed, 4);
        using (var fs = File.Create(outputPath))
        {
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
        }
    }

    CheckEquality("NAIndicator", "featurized.tsv");
    Done();
}

/// <summary>
/// Verifies metadata on the output of <c>NAIndicatorEstimator</c> when it is applied
/// to a categorical column: the output column must be marked as normalized and must
/// carry the expected slot-names metadata.
/// </summary>
[Fact]
public void NAIndicatorMetadataTest()
{
    var data = new[]
    {
        new TestClass() { A = 1, B = 3, C = new float[2] { 1, 2 }, D = new double[2] { 3, 4 } },
        new TestClass() { A = float.NaN, B = double.NaN, C = new float[2] { float.NaN, float.NaN }, D = new double[2] { double.NaN, double.NaN } },
        new TestClass() { A = float.NegativeInfinity, B = double.NegativeInfinity, C = new float[2] { float.NegativeInfinity, float.NegativeInfinity }, D = new double[2] { double.NegativeInfinity, double.NegativeInfinity } },
        new TestClass() { A = float.PositiveInfinity, B = double.PositiveInfinity, C = new float[2] { float.PositiveInfinity, float.PositiveInfinity }, D = new double[2] { double.PositiveInfinity, double.PositiveInfinity } },
        new TestClass() { A = 2, B = 1, C = new float[2] { 3, 4 }, D = new double[2] { 5, 6 } },
    };

    var dataView = ComponentCreation.CreateDataView(Env, data);
    var pipe = new CategoricalEstimator(Env, new CategoricalEstimator.ColumnInfo("A", "CatA"));
    var newpipe = pipe.Append(new NAIndicatorEstimator(Env, new (string input, string output)[] { ("CatA", "NAA") }));
    var result = newpipe.Fit(dataView).Transform(dataView);
    Assert.True(result.Schema.TryGetColumnIndex("NAA", out var col));

    // Check that the column is normalized.
    Assert.True(result.Schema.IsNormalized(col));

    // Check that slot names metadata was correctly created.
    var type = result.Schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.SlotNames, col);
    Assert.NotNull(type);
    var slotNames = new VBuffer<ReadOnlyMemory<char>>();
    result.Schema.GetMetadata(MetadataUtils.Kinds.SlotNames, col, ref slotNames);

    // Expected names match the non-NaN values of column A in input order
    // (presumably NaN is not given a category slot -- confirm against CategoricalEstimator).
    string[] expectedSlotNames = { "1", "-Infinity", "Infinity", "2" };
    Assert.Equal(expectedSlotNames.Length, slotNames.Length);

    var name = new ReadOnlyMemory<char>();
    for (int i = 0; i < expectedSlotNames.Length; i++)
    {
        slotNames.GetItemOrDefault(i, ref name);
        Assert.Equal(expectedSlotNames[i], name.ToString());
    }
}
}
}