Skip to content

Fix ResultProcessor bug, LogisticRegression bug and missing value conversion bug #1236

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
Oct 20, 2018
Merged
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Data/Conversion.cs
Original file line number Diff line number Diff line change
@@ -1170,7 +1170,7 @@ private bool IsStdMissing(ref ReadOnlySpan<char> span)
public bool TryParseKey(ref TX src, U8 min, U8 max, out U8 dst)
{
var span = src.Span;
Contracts.Check(!IsStdMissing(ref span), "Missing text value cannot be converted to unsigned integer type.");
Contracts.Check(span.IsEmpty || !IsStdMissing(ref span), "Missing text value cannot be converted to unsigned integer type.");
Copy link
Contributor

@TomFinley TomFinley Oct 12, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`span.IsEmpty` (start = 28, length = 12)

Heh heh. Whoops! #Resolved

Contracts.Assert(min <= max);

// This simply ensures we don't have min == 0 and max == U8.MaxValue. This is illegal since
@@ -1530,7 +1530,7 @@ public bool TryParse(ref TX src, out BL dst)
{
var span = src.Span;

Contracts.Check(!IsStdMissing(ref span), "Missing text values cannot be converted to bool value.");
Contracts.Check(span.IsEmpty || !IsStdMissing(ref span), "Missing text value cannot be converted to bool type.");

char ch;
switch (src.Length)
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/Transforms/ConcatTransform.cs
Original file line number Diff line number Diff line change
@@ -384,7 +384,7 @@ public static IDataTransform Create(IHostEnvironment env, TaggedArguments args,
env.CheckUserArg(Utils.Size(args.Column[i].Source) > 0, nameof(args.Column));

var cols = args.Column
.Select(c => new ColumnInfo(c.Name, c.Source.Select(kvp => (kvp.Value, kvp.Key))))
.Select(c => new ColumnInfo(c.Name, c.Source.Select(kvp => (kvp.Value, kvp.Key != "" ? kvp.Key : null))))
.ToArray();
var transformer = new ConcatTransform(env, cols);
return transformer.MakeDataTransform(input);
14 changes: 11 additions & 3 deletions src/Microsoft.ML.Data/Transforms/TermEstimator.cs
Original file line number Diff line number Diff line change
@@ -21,6 +21,9 @@ public static class Defaults

private readonly IHost _host;
private readonly TermTransform.ColumnInfo[] _columns;
private readonly string _file;
private readonly string _termsColumn;
private readonly IComponentFactory<IMultiStreamSource, IDataLoader> _loaderFactory;

/// <summary>
/// Convenience constructor for public facing API.
@@ -32,18 +35,23 @@ public static class Defaults
/// <param name="sort">How items should be ordered when vectorized. By default, they will be in the order encountered.
/// If by value items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
public TermEstimator(IHostEnvironment env, string inputColumn, string outputColumn = null, int maxNumTerms = Defaults.MaxNumTerms, TermTransform.SortOrder sort = Defaults.Sort) :
this(env, new TermTransform.ColumnInfo(inputColumn, outputColumn ?? inputColumn, maxNumTerms, sort))
this(env, new[] { new TermTransform.ColumnInfo(inputColumn, outputColumn ?? inputColumn, maxNumTerms, sort) })
{
}

public TermEstimator(IHostEnvironment env, params TermTransform.ColumnInfo[] columns)
public TermEstimator(IHostEnvironment env, TermTransform.ColumnInfo[] columns,
string file = null, string termsColumn = null,
IComponentFactory<IMultiStreamSource, IDataLoader> loaderFactory = null)
{
Contracts.CheckValue(env, nameof(env));
_host = env.Register(nameof(TermEstimator));
_columns = columns;
_file = file;
_termsColumn = termsColumn;
_loaderFactory = loaderFactory;
}

public TermTransform Fit(IDataView input) => new TermTransform(_host, input, _columns);
public TermTransform Fit(IDataView input) => new TermTransform(_host, input, _columns, _file, _termsColumn, _loaderFactory);

public SchemaShape GetOutputSchema(SchemaShape inputSchema)
{
6 changes: 3 additions & 3 deletions src/Microsoft.ML.Data/Transforms/TermTransform.cs
Original file line number Diff line number Diff line change
@@ -268,7 +268,7 @@ public TermTransform(IHostEnvironment env, IDataView input,
this(env, input, columns, null, null, null)
{ }

private TermTransform(IHostEnvironment env, IDataView input,
internal TermTransform(IHostEnvironment env, IDataView input,
ColumnInfo[] columns,
string file = null, string termsColumn = null,
IComponentFactory<IMultiStreamSource, IDataLoader> loaderFactory = null)
@@ -314,13 +314,13 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV
if (!Enum.IsDefined(typeof(SortOrder), sortOrder))
throw env.ExceptUserArg(nameof(args.Sort), "Undefined sorting criteria '{0}' detected for column '{1}'", sortOrder, item.Name);

cols[i] = new ColumnInfo(item.Source,
cols[i] = new ColumnInfo(item.Source ?? item.Name,
item.Name,
item.MaxNumTerms ?? args.MaxNumTerms,
sortOrder,
item.Term,
item.TextKeyValues ?? args.TextKeyValues);
cols[i].Terms = item.Terms;
cols[i].Terms = item.Terms ?? args.Terms;
};
}
return new TermTransform(env, input, cols, args.DataFile, args.TermsColumn, args.Loader).MakeDataTransform(input);
9 changes: 6 additions & 3 deletions src/Microsoft.ML.ResultProcessor/ResultProcessor.cs
Original file line number Diff line number Diff line change
@@ -1063,10 +1063,10 @@ private static Experiment CreateVisualizationExperiment(ExperimentItemResult res
var experiment = new ML.Runtime.ExperimentVisualization.Experiment
{
Key = index.ToString(),
CompareGroup = string.IsNullOrEmpty(result.CustomizedTag) ? result.Trainer.Kind : result.CustomizedTag,
CompareGroup = string.IsNullOrEmpty(result.CustomizedTag) ? result.TrainerKind : result.CustomizedTag,
Trainer = new ML.Runtime.ExperimentVisualization.Trainer
{
Name = result.Trainer.Kind,
Name = result.TrainerKind,
ParameterSets = new List<ML.Runtime.ExperimentVisualization.Item>()
},
DataSet = new ML.Runtime.ExperimentVisualization.DataSet { File = result.Datafile },
@@ -1152,7 +1152,10 @@ private static object Load(Stream stream)

public static int Main(string[] args)
{
return Main(new ConsoleEnvironment(42), args);
string currentDirectory = Path.GetDirectoryName(typeof(ResultProcessor).Module.FullyQualifiedName);
using (var env = new ConsoleEnvironment(42))
using (AssemblyLoadingUtils.CreateAssemblyRegistrar(env, currentDirectory))
return Main(env, args);
}

public static int Main(IHostEnvironment env, string[] args)
Original file line number Diff line number Diff line change
@@ -373,7 +373,7 @@ protected override ParameterMixingCalibratedPredictor CreatePredictor()
CurrentWeights.GetItemOrDefault(0, ref bias);
CurrentWeights.CopyTo(ref weights, 1, CurrentWeights.Length - 1);
return new ParameterMixingCalibratedPredictor(Host,
new LinearBinaryPredictor(Host, ref weights, bias),
new LinearBinaryPredictor(Host, ref weights, bias, _stats),
new PlattCalibrator(Host, -1, 0));
}

2 changes: 1 addition & 1 deletion src/Microsoft.ML.Sweeper/Algorithms/NelderMead.cs
Original file line number Diff line number Diff line change
@@ -26,7 +26,7 @@ public sealed class Arguments
public IComponentFactory<IValueGenerator>[] SweptParameters;

[Argument(ArgumentType.LastOccurenceWins, HelpText = "The sweeper used to get the initial results.", ShortName = "init", SignatureType = typeof(SignatureSweeperFromParameterList))]
public IComponentFactory<IValueGenerator[], ISweeper> FirstBatchSweeper;
public IComponentFactory<IValueGenerator[], ISweeper> FirstBatchSweeper = ComponentFactoryUtils.CreateFromFunction<IValueGenerator[], ISweeper>((host, array) => new UniformRandomSweeper(host, new SweeperBase.ArgumentsBase(), array));

[Argument(ArgumentType.AtMostOnce, HelpText = "Seed for the random number generator for the first batch sweeper", ShortName = "seed")]
public int RandomSeed;
5 changes: 4 additions & 1 deletion src/Microsoft.ML.Sweeper/ConfigRunner.cs
Original file line number Diff line number Diff line change
@@ -107,7 +107,10 @@ public virtual void Finish()
if (Exe == null || Exe.EndsWith("maml", StringComparison.OrdinalIgnoreCase) ||
Exe.EndsWith("maml.exe", StringComparison.OrdinalIgnoreCase))
{
string currentDirectory = Path.GetDirectoryName(typeof(ExeConfigRunnerBase).Module.FullyQualifiedName);

using (var ch = Host.Start("Finish"))
using (AssemblyLoadingUtils.CreateAssemblyRegistrar(Host, currentDirectory))
{
var runs = RunNums.ToArray();
var args = Utils.BuildArray(RunNums.Count + 2,
@@ -120,7 +123,7 @@ public virtual void Finish()
return string.Format("{{{0}}}", GetFilePath(runs[i], "out"));
});

ResultProcessorInternal.ResultProcessor.Main (args);
ResultProcessorInternal.ResultProcessor.Main(args);

ch.Info(@"The summary of the run results has been saved to the file {0}\{1}.summary.txt", OutputFolder, Prefix);
}
4 changes: 4 additions & 0 deletions src/Microsoft.ML.Sweeper/Microsoft.ML.Sweeper.csproj
Original file line number Diff line number Diff line change
@@ -16,4 +16,8 @@

</ItemGroup>

<ItemGroup>
<Compile Include="..\Common\AssemblyLoadingUtils.cs" Link="Common\AssemblyLoadingUtils.cs" />
</ItemGroup>

</Project>
18 changes: 11 additions & 7 deletions src/Microsoft.ML.Transforms/CategoricalTransform.cs
Original file line number Diff line number Diff line change
@@ -135,18 +135,20 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV
column.MaxNumTerms ?? args.MaxNumTerms,
column.Sort ?? args.Sort,
column.Term ?? args.Term);
col.SetTerms(column.Terms);
col.SetTerms(column.Terms ?? args.Terms);
columns.Add(col);
}
return new CategoricalEstimator(env, columns.ToArray()).Fit(input).Transform(input) as IDataTransform;
return new CategoricalEstimator(env, columns.ToArray(), args.DataFile, args.TermsColumn, args.Loader).Fit(input).Transform(input) as IDataTransform;
}

private readonly TransformerChain<ITransformer> _transformer;

public CategoricalTransform(TermEstimator term, IEstimator<ITransformer> toVector, IDataView input)
{
var chain = term.Append(toVector);
_transformer = chain.Fit(input);
if (toVector != null)
_transformer = term.Append(toVector).Fit(input);
else
_transformer = new TransformerChain<ITransformer>(term.Fit(input));
}

public Schema GetOutputSchema(Schema inputSchema) => _transformer.GetOutputSchema(inputSchema);
@@ -198,15 +200,17 @@ internal void SetTerms(string terms)
/// <param name="outputKind">The type of output expected.</param>
public CategoricalEstimator(IHostEnvironment env, string input,
string output = null, CategoricalTransform.OutputKind outputKind = Defaults.OutKind)
: this(env, new ColumnInfo(input, output ?? input, outputKind))
: this(env, new[] { new ColumnInfo(input, output ?? input, outputKind) })
{
}

public CategoricalEstimator(IHostEnvironment env, params ColumnInfo[] columns)
public CategoricalEstimator(IHostEnvironment env, ColumnInfo[] columns,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`public` (start = 8, length = 6)

Out of curiosity, can you make this constructor internal? I would prefer not to pollute our public API with these things.
The same applies to the TermEstimator constructor.

string file = null, string termsColumn = null,
IComponentFactory<IMultiStreamSource, IDataLoader> loaderFactory = null)
{
Contracts.CheckValue(env, nameof(env));
_host = env.Register(nameof(TermEstimator));
_term = new TermEstimator(_host, columns);
_term = new TermEstimator(_host, columns, file, termsColumn, loaderFactory);
var binaryCols = new List<(string input, string output)>();
var cols = new List<(string input, string output, bool bag)>();
for (int i = 0; i < columns.Length; i++)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Saving predictor summary
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
maml.exe Train tr=MultiClassLogisticRegression{maxiter=100 t=- stat=+} loader=TextLoader{col=Label:TX:4 col=Features:R4:0-3 sep=,} data=%Data% out=%Output% seed=1 xf=Term{col=Label}
Automatically adding a MinMax normalization transform, use 'norm=Warn' or 'norm=No' to turn this behavior off.
Beginning optimization
num vars: 15
improvement criterion: Mean Improvement
L1 regularization selected 11 of 15 weights.
Model trained with 150 training examples.
Residual Deviance: 132.0122
Null Deviance: 329.5837
AIC: 154.0122
Not training a calibrator because it is not needed.
Physical memory usage(MB): %Number%
Virtual memory usage(MB): %Number%
%DateTime% Time elapsed(s): %Number%

Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
MulticlassLogisticRegression bias and non-zero weights
Iris-setosa+(Bias) 2.265129
Iris-versicolor+(Bias) 0.7695086
Iris-virginica+(Bias) -3.034663
Iris-setosa+f3 -3.180634
Iris-setosa+f2 -2.88663
Iris-setosa+f1 0.5392878
Iris-setosa+f0 -0.03958065
Iris-versicolor+f1 -0.7073272
Iris-virginica+f3 3.158146
Iris-virginica+f2 1.907791
Iris-virginica+f0 0.01793481

*** MODEL STATISTICS SUMMARY ***
Count of training examples: 150
Residual Deviance: 132.0122
Null Deviance: 329.5837
AIC: 154.0122
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Saving predictor summary
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
maml.exe Train feat=Num lab=Lab tr=lr{t=- stat=+} loader=text{header+ sep=comma col=Lab:14 col=Num:0,2,4,10-12} data=%Data% out=%Output%
Automatically adding a MinMax normalization transform, use 'norm=Warn' or 'norm=No' to turn this behavior off.
Beginning optimization
num vars: 7
improvement criterion: Mean Improvement
L1 regularization selected 7 of 7 weights.
Model trained with 32561 training examples.
Residual Deviance: 26705.74 (on 32554 degrees of freedom)
Null Deviance: 35948.08 (on 32560 degrees of freedom)
AIC: 26719.74
Not training a calibrator because it is not needed.
Physical memory usage(MB): %Number%
Virtual memory usage(MB): %Number%
%DateTime% Time elapsed(s): %Number%

Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
Linear Binary Classification Predictor non-zero weights

(Bias) -8.228298
capital-gain 18.58347
education-num 5.066041
hours-per-week 3.946534
age 3.86064
capital-loss 2.81616
fnlwgt 0.7489593

*** MODEL STATISTICS SUMMARY ***
Count of training examples: 32561
Residual Deviance: 26705.74
Null Deviance: 35948.08
AIC: 26719.74
10 changes: 10 additions & 0 deletions test/BaselineOutput/Common/EntryPoints/ensemble-model0-stats.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col={name={Count of training examples} type=I8 src=0}
#@ col={name={Residual Deviance} type=R4 src=1}
#@ col={name={Null Deviance} type=R4 src=2}
#@ col=AIC:R4:3
#@ }
Count of training examples Residual Deviance Null Deviance AIC
521 98.29433 669.0935 118.294327
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=Bias:R4:0
#@ col=Weights:R4:1-17
#@ }
Bias Features.thickness Features.uniform_size Features.uniform_shape Features.adhesion Features.epit_size Features.bare_nuclei Features.bland_chromatin Features.normal_nucleoli Cat.1 Cat.5 Cat.4 Cat.3 Cat.2 Cat.7 Cat.10 Cat.8 Cat.6
-5.120674 2.353567 1.78653753 1.9442488 1.38072 1.0831089 2.43588924 1.61141682 1.34575915 -0.7715381 0 0 0 0 0 0 0 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=Gains:R4:0-16
#@ }
Cat.1 Cat.4 Cat.2 Cat.5 Cat.10 Cat.3 Cat.7 Cat.8 Cat.6 Features.thickness Features.uniform_size Features.uniform_shape Features.adhesion Features.epit_size Features.bare_nuclei Features.bland_chromatin Features.normal_nucleoli
0.0607880056 0 0.0249023773 0 0 4.10026857E-09 0 0 0 0.190965369 1 0.7112387 0.14315024 0.222178861 0.413435966 0.254190356 0.2604484
10 changes: 10 additions & 0 deletions test/BaselineOutput/Common/EntryPoints/ensemble-model2-stats.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col={name={Count of training examples} type=I8 src=0}
#@ col={name={Residual Deviance} type=R4 src=1}
#@ col={name={Null Deviance} type=R4 src=2}
#@ col=AIC:R4:3
#@ }
Count of training examples Residual Deviance Null Deviance AIC
520 94.1969452 673.3445 114.196945
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=Bias:R4:0
#@ col=Weights:R4:1-17
#@ }
Bias Features.thickness Features.uniform_size Features.uniform_shape Features.adhesion Features.epit_size Features.bare_nuclei Features.bland_chromatin Features.normal_nucleoli Cat.1 Cat.5 Cat.4 Cat.2 Cat.3 Cat.7 Cat.10 Cat.8 Cat.6
-4.860323 2.143086 1.49418533 1.71121442 1.38318741 0.883200347 3.16845965 1.38684654 1.51904845 -0.8226236 0 0 0 0 0 0 0 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=Gains:R4:0-16
#@ }
Cat.1 Cat.5 Cat.2 Cat.4 Cat.3 Cat.7 Cat.10 Cat.8 Cat.6 Features.thickness Features.uniform_size Features.uniform_shape Features.adhesion Features.epit_size Features.bare_nuclei Features.bland_chromatin Features.normal_nucleoli
0.009761757 0 0.0203766879 0 0.000928933 0 0 0 0 0.308038682 1 0.5590685 0.125412315 0.118880585 0.488731444 0.308761537 0.132577017
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
Partition model 0 summary:
Linear Binary Classification Predictor non-zero weights
(Bias): -5.120674
Features.bare_nuclei: 2.435889
Features.thickness: 2.353567
Features.uniform_shape: 1.944249
Features.uniform_size: 1.786538
Features.bland_chromatin: 1.611417
Features.adhesion: 1.38072
Features.normal_nucleoli: 1.345759
Features.epit_size: 1.083109
Cat.1: -0.7715381
Count of training examples: 521
Residual Deviance: 98.29433
Null Deviance: 669.0935
AIC: 118.2943
Partition model 1 summary:
Per-feature gain summary for the boosted tree ensemble:
Features.uniform_size: 1
Features.uniform_shape: 0.711238682354263
Features.bare_nuclei: 0.413435971399054
Features.normal_nucleoli: 0.260448393604327
Features.bland_chromatin: 0.254190368593018
Features.epit_size: 0.222178863469679
Features.thickness: 0.190965373645692
Features.adhesion: 0.143150245168852
Cat.1: 0.0607880054395048
Cat.2: 0.0249023775790133
Cat.3: 4.10026871732935E-09
Partition model 2 summary:
Linear Binary Classification Predictor non-zero weights
(Bias): -4.860323
Features.bare_nuclei: 3.16846
Features.thickness: 2.143086
Features.uniform_shape: 1.711214
Features.normal_nucleoli: 1.519048
Features.uniform_size: 1.494185
Features.bland_chromatin: 1.386847
Features.adhesion: 1.383187
Features.epit_size: 0.8832003
Cat.1: -0.8226236
Count of training examples: 520
Residual Deviance: 94.19695
Null Deviance: 673.3445
AIC: 114.1969
Partition model 3 summary:
Per-feature gain summary for the boosted tree ensemble:
Features.uniform_size: 1
Features.uniform_shape: 0.559068504082849
Features.bare_nuclei: 0.488731457203164
Features.bland_chromatin: 0.308761540884501
Features.thickness: 0.308038677882308
Features.normal_nucleoli: 0.132577017456797
Features.adhesion: 0.125412316945858
Features.epit_size: 0.118880587537871
Cat.2: 0.0203766881332348
Cat.1: 0.00976175711400017
Cat.3: 0.000928932959407758
Loading