
Docs & samples for SDCA-based trainers #2771


Merged: 10 commits, Mar 4, 2019
43 changes: 0 additions & 43 deletions docs/samples/Microsoft.ML.Samples/Dynamic/SDCARegression.cs

This file was deleted.

@@ -5,7 +5,7 @@

namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification
@sfilipi (Member), Feb 28, 2019:

Microsoft.ML.Samples.Dynamic

Let's keep the namespace Microsoft.ML.Samples.Dynamic for all samples. The nesting is unnecessary. #Resolved

@shmoradims (Author), Feb 28, 2019:

I've explained the reasoning here:
#2729 (comment)

By the way, we don't need namespace nesting for transforms, only for trainers, because of naming conflicts.


In reply to: 261050465
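
For illustration, a minimal sketch of the naming conflict the nesting avoids: both sample classes added in this PR use the same class name, so they can only coexist under task-specific namespaces.

    // Sketch only: both of these classes exist in this PR; with a single flat
    // Microsoft.ML.Samples.Dynamic namespace their names would collide.
    namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification
    {
        public static class StochasticDualCoordinateAscent
        {
            public static void Example() { /* binary classification sample */ }
        }
    }

    namespace Microsoft.ML.Samples.Dynamic.Trainers.MulticlassClassification
    {
        public static class StochasticDualCoordinateAscent
        {
            public static void Example() { /* multiclass classification sample */ }
        }
    }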

{
public static class SDCALogisticRegression
public static class StochasticDualCoordinateAscent
{
public static void Example()
{
@@ -4,7 +4,7 @@

namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification
{
public static class SDCASupportVectorMachine
public static class StochasticDualCoordinateAscentNonCalibrated
{
public static void Example()
{
@@ -0,0 +1,59 @@
using Microsoft.ML;
using Microsoft.ML.Trainers;

namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification
{
public static class StochasticDualCoordinateAscentWithOptions
{
// In this example we will use the adult income dataset. The goal is to predict
// if a person's income is above $50K or not, based on demographic information about that person.
// For more details about this dataset, please see https://archive.ics.uci.edu/ml/datasets/adult.
public static void Example()
{
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
// Setting the seed to a fixed number in this example to make outputs deterministic.
var mlContext = new MLContext(seed: 0);

// Download and featurize the dataset.
var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);

// Leave out 10% of data for testing.
var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);

// Define the trainer options.
var options = new SdcaBinaryTrainer.Options()
{
// Make the convergence tolerance tighter.
ConvergenceTolerance = 0.05f,
// Increase the maximum number of passes over training data.
MaxIterations = 30,
// Give the instances of the positive class slightly more weight.
PositiveInstanceWeight = 1.2f,
};

// Create data training pipeline.
var pipeline = mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscent(options);

// Fit this pipeline to the training data.
var model = pipeline.Fit(trainTestData.TrainSet);

// Evaluate how the model is doing on the test data.
var dataWithPredictions = model.Transform(trainTestData.TestSet);
var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions);
SamplesUtils.ConsoleUtils.PrintMetrics(metrics);

// Expected output:
// Accuracy: 0.85
// AUC: 0.90
// F1 Score: 0.66
// Negative Precision: 0.89
// Negative Recall: 0.92
// Positive Precision: 0.70
// Positive Recall: 0.63
// LogLoss: 0.47
// LogLossReduction: 39.77
// Entropy: 0.78
}
}
}
@@ -5,7 +5,7 @@

namespace Microsoft.ML.Samples.Dynamic.Trainers.MulticlassClassification
{
class LightGbm
public static class LightGbm
{
// This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
public static void Example()
@@ -14,10 +14,10 @@ public static void Example()
// as a catalog of available operations and as the source of randomness.
var mlContext = new MLContext();

// Create in-memory examples as C# native class.
// Create a list of data examples.
var examples = DatasetUtils.GenerateRandomMulticlassClassificationExamples(1000);

// Convert native C# class to IDataView, a consumble format to ML.NET functions.
// Convert the examples list to an IDataView object, which is consumable by ML.NET API.
var dataView = mlContext.Data.LoadFromEnumerable(examples);

//////////////////// Data Preview ////////////////////
@@ -7,7 +7,7 @@

namespace Microsoft.ML.Samples.Dynamic.Trainers.MulticlassClassification
{
class LightGbmWithOptions
public static class LightGbmWithOptions
{
// This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
public static void Example()
@@ -16,10 +16,10 @@ public static void Example()
// as a catalog of available operations and as the source of randomness.
var mlContext = new MLContext(seed: 0);

// Create in-memory examples as C# native class.
// Create a list of data examples.
var examples = DatasetUtils.GenerateRandomMulticlassClassificationExamples(1000);

// Convert native C# class to IDataView, a consumble format to ML.NET functions.
// Convert the examples list to an IDataView object, which is consumable by ML.NET API.
var dataView = mlContext.Data.LoadFromEnumerable(examples);

//////////////////// Data Preview ////////////////////
@@ -0,0 +1,56 @@
using Microsoft.ML.Data;
using Microsoft.ML.SamplesUtils;

namespace Microsoft.ML.Samples.Dynamic.Trainers.MulticlassClassification
{
public static class StochasticDualCoordinateAscent
{
public static void Example()
{
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
// Setting the seed to a fixed number in this example to make outputs deterministic.
var mlContext = new MLContext(seed: 0);

// Create a list of data examples.
var examples = DatasetUtils.GenerateRandomMulticlassClassificationExamples(1000);
@wschin (Member), Feb 28, 2019:

Unfortunately, GenerateRandomMulticlassClassificationExamples is not searchable on the doc site, so the only way to fully learn this pipeline is to clone ML.NET. Because SDCA can work with a very tiny dataset, we could add something like this

    private class DataPoint
    {
        [VectorType(3)]
        public float[] Features;
    }

    var samples = new List<DataPoint>()
    {
        new DataPoint() { Features = new float[3] {1, 0, 0} },
        new DataPoint() { Features = new float[3] {0, 2, 1} },
        new DataPoint() { Features = new float[3] {1, 2, 3} },
        new DataPoint() { Features = new float[3] {0, 1, 0} },
        new DataPoint() { Features = new float[3] {0, 2, 1} },
        new DataPoint() { Features = new float[3] {-100, 50, -100} }
    };

into this file and use them. #Resolved
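
A possible follow-up to the snippet above (a sketch that assumes the DataPoint class and the samples list defined there, plus using System.Collections.Generic for the List<> initializer):

    // Turn the suggested in-memory list into an IDataView, the same way the
    // current samples consume the DatasetUtils output.
    var mlContext = new MLContext(seed: 0);
    var dataView = mlContext.Data.LoadFromEnumerable(samples);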

(Member):

The type is visible if they use the example, and they can inspect the values with the debugger, but moving featurization into the SamplesUtils is a real problem.


In reply to: 261013871

@wschin (Member), Feb 28, 2019:

No, we can't expect the user to have Visual Studio on, for example, Linux. I'd say the best case is that the user knows everything they need after reading this example. Please take a look at a scikit-learn example.

from sklearn import svm

X = [[0], [1], [2], [3]]
Y = [0, 1, 2, 3]
clf = svm.SVC(gamma='scale', decision_function_shape='ovo')
clf.fit(X, Y)

Does scikit-learn ask the user to go outside the text above to understand that example? In addition, those functions are not searchable on the ML.NET doc site, which is a big hole for new users. Honestly, I am not sure SamplesUtils should be used, because it hides some vital information and therefore pushes our examples away from those scikit-learn ones (in terms of readability). #Pending
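
For comparison, a self-contained ML.NET sketch in the same spirit; the class name, labels, and feature values here are illustrative only, but the API calls are the same ones used in the samples below.

    using System.Collections.Generic;
    using Microsoft.ML;
    using Microsoft.ML.Data;

    public static class SelfContainedSdcaSketch // hypothetical sample name
    {
        private class DataPoint
        {
            public string Label;
            [VectorType(3)]
            public float[] Features;
        }

        public static void Example()
        {
            var mlContext = new MLContext(seed: 0);

            // Tiny, fully visible dataset, analogous to the X/Y lists above.
            var points = new List<DataPoint>()
            {
                new DataPoint() { Label = "AA", Features = new float[3] {1, 0, 0} },
                new DataPoint() { Label = "BB", Features = new float[3] {0, 2, 1} },
                new DataPoint() { Label = "CC", Features = new float[3] {1, 2, 3} },
                new DataPoint() { Label = "AA", Features = new float[3] {0, 1, 0} },
            };
            var dataView = mlContext.Data.LoadFromEnumerable(points);

            // Same pipeline shape as the multiclass samples in this PR:
            // map the string labels to keys, then apply the SDCA multiclass trainer.
            var pipeline = mlContext.Transforms.Conversion.MapValueToKey("Label")
                .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent());

            var model = pipeline.Fit(dataView);
        }
    }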

@shmoradims (Author) replied:

Please see my latest comment on #2627. Removing SamplesUtils is a design decision that needs to be made first.

Please also see my response in another comment where I have the doc links. All of DatasetUtils is searchable:
https://docs.microsoft.com/en-us/dotnet/api/?view=ml-dotnet&term=Microsoft.ML.SamplesUtils.DatasetUtils


In reply to: 261068842

@wschin (Member), Mar 1, 2019:

My bad. Even if it's searchable, it still has no meaningful documentation on that page. #Resolved

@shmoradims (Author) replied:

Our documentation coverage is low but we're actively working on it, hence this PR. So that page will become meaningful eventually.


In reply to: 261718177

@wschin (Member), Mar 1, 2019:

Please do not put things in separate places if they are considered a whole example. The entire documentation will never be organized and learned in a structured way --- this is how the computer science world works. Let me give you another example: how would a user learn the definition of a vector column created by adding the VectorType attribute? Assume they have already found the doc for GenerateRandomMulticlass. They still need to click on the return type of GenerateRandomMulticlass, which is List<DatasetUtils.MulticlassClassificationExample>. Then another page will be opened. Where is the vector attribute of my Features? The user needs to click on Fields again to open a third page, which contains

[Microsoft.ML.Data.VectorType(new System.Int32[] { 10 })]
public float[] Features;

Hiding things in this hierarchical way is definitely a learning barrier. #WontFix

@shmoradims (Author) replied:

As pointed out by Shauheen, let's keep the samples as is for V1. Post V1, we can address this through the proper discussions that were canceled in favor of API work. For now, some samples for V1 are better than no samples.


In reply to: 261763964


// Convert the examples list to an IDataView object, which is consumable by ML.NET API.
var dataView = mlContext.Data.LoadFromEnumerable(examples);

//////////////////// Data Preview ////////////////////
// Label Features
// AA 0.7262433,0.8173254,0.7680227,0.5581612,0.2060332,0.5588848,0.9060271,0.4421779,0.9775497,0.2737045
// BB 0.4919063,0.6673147,0.8326591,0.6695119,1.182151,0.230367,1.06237,1.195347,0.8771811,0.5145918
// CC 1.216908,1.248052,1.391902,0.4326252,1.099942,0.9262842,1.334019,1.08762,0.9468155,0.4811099
// DD 0.7871246,1.053327,0.8971719,1.588544,1.242697,1.362964,0.6303943,0.9810045,0.9431419,1.557455

// Create a pipeline.
var pipeline =
// Convert the string labels into key types.
mlContext.Transforms.Conversion.MapValueToKey("Label")
// Apply StochasticDualCoordinateAscent multiclass trainer.
.Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent());

// Split the data into training and test sets. Only the training set is used in fitting
// the created pipeline. Metrics are computed on the test set.
var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.1);

// Train the model.
var model = pipeline.Fit(split.TrainSet);

// Do prediction on the test set.
var dataWithPredictions = model.Transform(split.TestSet);

// Evaluate the trained model using the test set.
var metrics = mlContext.MulticlassClassification.Evaluate(dataWithPredictions);
SamplesUtils.ConsoleUtils.PrintMetrics(metrics);

// Expected output:
// Micro Accuracy: 0.82
// Macro Accuracy: 0.81
// Log Loss: 0.43
// Log Loss Reduction: 67.93
}
}
}
@@ -0,0 +1,67 @@
using Microsoft.ML.Data;
using Microsoft.ML.SamplesUtils;
using Microsoft.ML.Trainers;

namespace Microsoft.ML.Samples.Dynamic.Trainers.MulticlassClassification
{
public static class StochasticDualCoordinateAscentWithOptions
{
public static void Example()
{
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
// Setting the seed to a fixed number in this example to make outputs deterministic.
var mlContext = new MLContext(seed: 0);

// Create a list of data examples.
var examples = DatasetUtils.GenerateRandomMulticlassClassificationExamples(1000);

// Convert the examples list to an IDataView object, which is consumable by ML.NET API.
var dataView = mlContext.Data.LoadFromEnumerable(examples);

//////////////////// Data Preview ////////////////////
// Label Features
// AA 0.7262433,0.8173254,0.7680227,0.5581612,0.2060332,0.5588848,0.9060271,0.4421779,0.9775497,0.2737045
// BB 0.4919063,0.6673147,0.8326591,0.6695119,1.182151,0.230367,1.06237,1.195347,0.8771811,0.5145918
// CC 1.216908,1.248052,1.391902,0.4326252,1.099942,0.9262842,1.334019,1.08762,0.9468155,0.4811099
// DD 0.7871246,1.053327,0.8971719,1.588544,1.242697,1.362964,0.6303943,0.9810045,0.9431419,1.557455

var options = new SdcaMultiClassTrainer.Options
{
// Add custom loss
LossFunction = new HingeLoss.Options(),
// Make the convergence tolerance tighter.
ConvergenceTolerance = 0.05f,
// Increase the maximum number of passes over training data.
MaxIterations = 30,
};

// Create a pipeline.
var pipeline =
// Convert the string labels into key types.
mlContext.Transforms.Conversion.MapValueToKey("Label")
// Apply StochasticDualCoordinateAscent multiclass trainer.
.Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(options));

// Split the data into training and test sets. Only the training set is used in fitting
// the created pipeline. Metrics are computed on the test set.
var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.1);

// Train the model.
var model = pipeline.Fit(split.TrainSet);

// Do prediction on the test set.
var dataWithPredictions = model.Transform(split.TestSet);

// Evaluate the trained model using the test set.
var metrics = mlContext.MulticlassClassification.Evaluate(dataWithPredictions);
SamplesUtils.ConsoleUtils.PrintMetrics(metrics);

// Expected output:
// Micro Accuracy: 0.82
// Macro Accuracy: 0.81
// Log Loss: 0.64
// Log Loss Reduction: 52.51
}
}
}
@@ -0,0 +1,43 @@
using System;
using System.Linq;
using Microsoft.ML.Data;

namespace Microsoft.ML.Samples.Dynamic.Trainers.Regression
{
public static class StochasticDualCoordinateAscent
{
public static void Example()
{
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
// Setting the seed to a fixed number in this example to make outputs deterministic.
var mlContext = new MLContext(seed: 0);

// Create in-memory examples as a C# native class and convert them to an IDataView
var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(1000);
var dataView = mlContext.Data.LoadFromEnumerable(data);

// Split the data into training and test sets. Only the training set is used in fitting
// the created pipeline. Metrics are computed on the test set.
var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.1);

// Train the model.
var pipeline = mlContext.Regression.Trainers.StochasticDualCoordinateAscent();
var model = pipeline.Fit(split.TrainSet);

// Do prediction on the test set.
var dataWithPredictions = model.Transform(split.TestSet);

// Evaluate the trained model using the test set.
var metrics = mlContext.Regression.Evaluate(dataWithPredictions);
SamplesUtils.ConsoleUtils.PrintMetrics(metrics);

// Expected output:
// L1: 0.27
// L2: 0.11
// LossFunction: 0.11
// RMS: 0.33
// RSquared: 0.56
}
}
}
@@ -0,0 +1,53 @@
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;

namespace Microsoft.ML.Samples.Dynamic.Trainers.Regression
{
public static class StochasticDualCoordinateAscentWithOptions
{
public static void Example()
{
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
// Setting the seed to a fixed number in this example to make outputs deterministic.
var mlContext = new MLContext(seed: 0);

// Create in-memory examples as a C# native class and convert them to an IDataView
var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(1000);
var dataView = mlContext.Data.LoadFromEnumerable(data);

// Split the data into training and test sets. Only the training set is used in fitting
// the created pipeline. Metrics are computed on the test set.
var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.1);

// Create trainer options.
var options = new SdcaRegressionTrainer.Options
{
// Make the convergence tolerance tighter.
ConvergenceTolerance = 0.02f,
// Increase the maximum number of passes over training data.
MaxIterations = 30,
// Increase learning rate for bias
BiasLearningRate = 0.1f
};

// Train the model.
var pipeline = mlContext.Regression.Trainers.StochasticDualCoordinateAscent(options);
var model = pipeline.Fit(split.TrainSet);

// Do prediction on the test set.
var dataWithPredictions = model.Transform(split.TestSet);

// Evaluate the trained model using the test set.
var metrics = mlContext.Regression.Evaluate(dataWithPredictions);
SamplesUtils.ConsoleUtils.PrintMetrics(metrics);

// Expected output:
// L1: 0.26
// L2: 0.11
// LossFunction: 0.11
// RMS: 0.33
// RSquared: 0.56
}
}
}