Skip to content

Multiple feature columns in FFM #2205

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jan 23, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/Microsoft.ML.FastTree/TreeEnsemble/TreeEnsemble.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ namespace Microsoft.ML.Trainers.FastTree.Internal
{
public class TreeEnsemble
{
/// <summary>
/// String appended to the text representation of <see cref="TreeEnsemble"/>. This is mainly used in <see cref="ToTreeEnsembleIni"/>.
/// </summary>
private readonly string _firstInputInitializationContent;
private readonly List<RegressionTree> _trees;

Expand Down
54 changes: 53 additions & 1 deletion src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ public static IEnumerable<BinaryLabelFloatFeatureVectorSample> GenerateBinaryLa
// Initialize an example with a random label and an empty feature vector.
var sample = new BinaryLabelFloatFeatureVectorSample() { Label = rnd.Next() % 2 == 0, Features = new float[_simpleBinaryClassSampleFeatureLength] };
// Fill feature vector according to the assigned label.
for (int j = 0; j < 10; ++j)
for (int j = 0; j < _simpleBinaryClassSampleFeatureLength; ++j)
{
var value = (float)rnd.NextDouble();
// Positive class gets larger feature value.
Expand All @@ -271,6 +271,58 @@ public static IEnumerable<BinaryLabelFloatFeatureVectorSample> GenerateBinaryLa
return data;
}

/// <summary>
/// Example with a boolean label and three separate feature fields, used to exercise
/// field-aware factorization machines (FFM) with multiple feature columns.
/// </summary>
public class FfmExample
{
    // Binary class label of this example.
    public bool Label;

    // NOTE(review): per the PR discussion, the VectorType attribute is required so that the
    // created data view exposes each field as a fixed-size vector column.
    [VectorType(_simpleBinaryClassSampleFeatureLength)]
    public float[] Field0;

    [VectorType(_simpleBinaryClassSampleFeatureLength)]
    public float[] Field1;

    [VectorType(_simpleBinaryClassSampleFeatureLength)]
    public float[] Field2;
}

/// <summary>
/// Generates <paramref name="exampleCount"/> random samples for training a field-aware
/// factorization machine. Each sample carries three feature fields; positive-class samples
/// get systematically shifted feature values so that the classes are separable.
/// </summary>
/// <param name="exampleCount">Number of samples to generate.</param>
/// <returns>The generated <see cref="FfmExample"/> instances.</returns>
public static IEnumerable<FfmExample> GenerateFfmSamples(int exampleCount)
{
    // Fixed seed keeps the generated data set deterministic across runs.
    var rnd = new Random(0);
    var data = new List<FfmExample>();
    for (int i = 0; i < exampleCount; ++i)
    {
        // Initialize an example with a random label and empty feature vectors.
        var sample = new FfmExample() { Label = rnd.Next() % 2 == 0,
            Field0 = new float[_simpleBinaryClassSampleFeatureLength],
            Field1 = new float[_simpleBinaryClassSampleFeatureLength],
            Field2 = new float[_simpleBinaryClassSampleFeatureLength] };
        // Fill the feature vectors according to the assigned label.
        // BUG FIX: the loop previously iterated a hard-coded 10 elements, which leaves the
        // tail of every field uninitialized (all zeros) whenever
        // _simpleBinaryClassSampleFeatureLength != 10. Iterate the full vector length instead,
        // matching the sibling GenerateBinaryLabel* generator above.
        for (int j = 0; j < _simpleBinaryClassSampleFeatureLength; ++j)
        {
            var value0 = (float)rnd.NextDouble();
            // Positive class gets a larger feature value.
            if (sample.Label)
                value0 += 0.2f;
            sample.Field0[j] = value0;

            var value1 = (float)rnd.NextDouble();
            // Positive class gets a smaller feature value.
            if (sample.Label)
                value1 -= 0.2f;
            sample.Field1[j] = value1;

            var value2 = (float)rnd.NextDouble();
            // Positive class gets a much larger feature value.
            if (sample.Label)
                value2 += 0.8f;
            sample.Field2[j] = value2;
        }

        data.Add(sample);
    }
    return data;
}

/// <summary>
/// feature vector's length in <see cref="MulticlassClassificationExample"/>.
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public sealed class FieldAwareFactorizationMachineTrainer : TrainerBase<FieldAwa
internal const string LoadName = "FieldAwareFactorizationMachine";
internal const string ShortName = "ffm";

public sealed class Arguments : LearnerInputBaseWithLabel
public sealed class Arguments : LearnerInputBaseWithWeight
{
[Argument(ArgumentType.AtMostOnce, HelpText = "Initial learning rate", ShortName = "lr", SortOrder = 1)]
[TlcModule.SweepableFloatParam(0.001f, 1.0f, isLogScale: true)]
Expand All @@ -65,6 +65,15 @@ public sealed class Arguments : LearnerInputBaseWithLabel
[Argument(ArgumentType.AtMostOnce, HelpText = "Whether to normalize the input vectors so that the concatenation of all fields' feature vectors is unit-length", ShortName = "norm", SortOrder = 6)]
public bool Norm = true;

/// <summary>
/// Extra feature column names. The column named <see cref="LearnerInputBase.FeatureColumn"/> stores features from the first field.
/// The i-th string in <see cref="ExtraFeatureColumns"/> stores the name of the (i+1)-th field's feature column.
/// </summary>
[Argument(ArgumentType.Multiple, HelpText = "Extra columns to use for feature vectors. The i-th specified string denotes the column containing features form the (i+1)-th field." +
" Note that the first field is specified by \"feat\" instead of \"exfeat\".",
ShortName = "exfeat", SortOrder = 7)]
public string[] ExtraFeatureColumns;

[Argument(ArgumentType.AtMostOnce, HelpText = "Whether to shuffle for each training iteration", ShortName = "shuf", SortOrder = 90)]
public bool Shuffle = true;

public FieldAwareFactorizationMachineTrainer(IHostEnvironment env, Arguments args)
{
    Initialize(env, args);
    Info = new TrainerInfo(supportValid: true, supportIncrementalTrain: true);

    // There can be multiple feature columns in FFM, jointly specified by args.FeatureColumn
    // and args.ExtraFeatureColumns.
    // BUG FIX: args.ExtraFeatureColumns defaults to null (it is an optional command-line
    // argument) and the loop below already guarded against that, but the array allocation
    // dereferenced .Length unconditionally and would throw NullReferenceException. Compute
    // the extra-field count null-safely once and reuse it.
    int extraFeatureCount = args.ExtraFeatureColumns?.Length ?? 0;
    FeatureColumns = new SchemaShape.Column[1 + extraFeatureCount];

    // Treat the default feature column as the 1st field.
    FeatureColumns[0] = new SchemaShape.Column(args.FeatureColumn, SchemaShape.Column.VectorKind.Vector, NumberType.R4, false);

    // Add 2nd, 3rd, and other fields from the FFM-specific argument, args.ExtraFeatureColumns.
    for (int i = 0; i < extraFeatureCount; i++)
        FeatureColumns[i + 1] = new SchemaShape.Column(args.ExtraFeatureColumns[i], SchemaShape.Column.VectorKind.Vector, NumberType.R4, false);

    LabelColumn = new SchemaShape.Column(args.LabelColumn, SchemaShape.Column.VectorKind.Scalar, BoolType.Instance, false);
    WeightColumn = args.WeightColumn.IsExplicit ? new SchemaShape.Column(args.WeightColumn, SchemaShape.Column.VectorKind.Scalar, NumberType.R4, false) : default;
}

/// <summary>
/// Initializing a new instance of <see cref="FieldAwareFactorizationMachineTrainer"/>.
/// </summary>
/// <param name="env">The private instance of <see cref="IHostEnvironment"/>.</param>
/// <param name="featureColumns">The name of column hosting the features.</param>
/// <param name="featureColumns">The name of column hosting the features. The i-th element stores feature column of the i-th field.</param>
/// <param name="labelColumn">The name of the label column.</param>
/// <param name="advancedSettings">A delegate to apply all the advanced arguments to the algorithm.</param>
/// <param name="weights">The name of the optional weights' column.</param>
Expand Down
28 changes: 28 additions & 0 deletions test/BaselineOutput/Common/EntryPoints/core_manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -10222,6 +10222,18 @@
"IsLogScale": true
}
},
{
"Name": "WeightColumn",
"Type": "String",
"Desc": "Column to use for example weight",
"Aliases": [
"weight"
],
"Required": false,
"SortOrder": 4.0,
"IsNullable": false,
"Default": "Weight"
},
{
"Name": "LambdaLatent",
"Type": "Float",
Expand Down Expand Up @@ -10292,6 +10304,21 @@
"IsNullable": false,
"Default": "Auto"
},
{
"Name": "ExtraFeatureColumns",
"Type": {
"Kind": "Array",
"ItemType": "String"
},
"Desc": "Extra columns to use for feature vectors. The i-th specified string denotes the column containing features form the (i+1)-th field. Note that the first field is specified by \"feat\" instead of \"exfeat\".",
"Aliases": [
"exfeat"
],
"Required": false,
"SortOrder": 7.0,
"IsNullable": false,
"Default": null
},
{
"Name": "Shuffle",
"Type": "Bool",
Expand Down Expand Up @@ -10342,6 +10369,7 @@
}
],
"InputKind": [
"ITrainerInputWithWeight",
"ITrainerInputWithLabel",
"ITrainerInput"
],
Expand Down
28 changes: 28 additions & 0 deletions test/Microsoft.ML.Tests/TrainerEstimators/FAFMEstimator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,43 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Linq;
using Microsoft.ML.Data;
using Microsoft.ML.FactorizationMachine;
using Microsoft.ML.RunTests;
using Microsoft.ML.SamplesUtils;
using Xunit;

namespace Microsoft.ML.Tests.TrainerEstimators
{
public partial class TrainerEstimators : TestDataPipeBase
{
[Fact]
public void FfmBinaryClassificationWithAdvancedArguments()
{
    // Build a data view over synthetic multi-field FFM samples.
    var context = new MLContext(seed: 0);
    var samples = DatasetUtils.GenerateFfmSamples(500).ToList();
    var trainData = ComponentCreation.CreateDataView(context, samples);

    // Customize the field names: Field0 is the default feature column (the 1st field);
    // the remaining fields are routed through ExtraFeatureColumns.
    var options = new FieldAwareFactorizationMachineTrainer.Arguments
    {
        FeatureColumn = nameof(DatasetUtils.FfmExample.Field0),
        ExtraFeatureColumns = new[] { nameof(DatasetUtils.FfmExample.Field1), nameof(DatasetUtils.FfmExample.Field2) }
    };

    var trainer = new FieldAwareFactorizationMachineTrainer(context, options);

    // Train, score the training data, and evaluate.
    var model = trainer.Fit(trainData);
    var scoredData = model.Transform(trainData);
    var metrics = context.BinaryClassification.Evaluate(scoredData);

    // Run a sanity check against a few of the metrics.
    Assert.InRange(metrics.Accuracy, 0.9, 1);
    Assert.InRange(metrics.Auc, 0.9, 1);
    Assert.InRange(metrics.Auprc, 0.9, 1);
}

[Fact]
public void FieldAwareFactorizationMachine_Estimator()
{
Expand Down