Skip to content

Cleaned and fixed public API surface for KMeans, NaiveBayes, OLS. #2819

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Mar 5, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ private class BinaryOutputRow
private readonly static Action<ContinuousInputRow, BinaryOutputRow> GreaterThanAverage = (input, output)
=> output.AboveAverage = input.MedianHomeValue > 22.6;

public static float[] GetLinearModelWeights(OlsLinearRegressionModelParameters linearModel)
public static float[] GetLinearModelWeights(OrdinaryLeastSquaresRegressionModelParameters linearModel)
{
return linearModel.Weights.ToArray();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ public static void Example()
{
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
var ml = new MLContext();
var ml = new MLContext(seed: 1, conc: 1);
Copy link
Member

@sfilipi sfilipi Mar 4, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seed: 1, conc: 1 [](start = 35, length = 16)

is this needed? #Resolved

Copy link
Contributor Author

@zeahmed zeahmed Mar 4, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it's needed; otherwise the results are different on every run.


In reply to: 262274167 [](ancestors = 262274167)


// Get a small dataset as an IEnumerable and convert it to an IDataView.
var data = SamplesUtils.DatasetUtils.GetInfertData();
Expand Down Expand Up @@ -39,7 +39,10 @@ public static void Example()
modelParams.GetClusterCentroids(ref centroids, out k);

var centroid = centroids[0].GetValues();
Console.WriteLine("The coordinates of centroid 0 are: " + string.Join(", ", centroid.ToArray()));
Console.WriteLine($"The coordinates of centroid 0 are: ({string.Join(", ", centroid.ToArray())})");

// Expected Output:
// The coordinates of centroid 0 are: (26, 6, 1)
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
using System;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;

namespace Microsoft.ML.Samples.Dynamic
{
    /// <summary>
    /// Sample showing how to train a KMeans++ clustering model with advanced
    /// configuration via <see cref="KMeansPlusPlusTrainer.Options"/>.
    /// </summary>
    public class KMeansWithOptions
    {
        public static void Example()
        {
            // Create a new ML context, for ML.NET operations. It can be used for
            // exception tracking and logging, as well as the source of randomness.
            // A fixed seed and conc: 1 keep the results reproducible across runs.
            var ml = new MLContext(seed: 1, conc: 1);

            // Get a small dataset as an IEnumerable and convert it to an IDataView.
            var data = SamplesUtils.DatasetUtils.GetInfertData();
            var trainData = ml.Data.LoadFromEnumerable(data);

            // Preview of the data.
            //
            // Age Case Education Induced Parity PooledStratum RowNum ...
            // 26  1    0-5yrs    1       6      3             1      ...
            // 42  1    0-5yrs    1       1      1             2      ...
            // 39  1    0-5yrs    2       6      4             3      ...
            // 34  1    0-5yrs    2       4      2             4      ...
            // 35  1    6-11yrs   1       3      32            5      ...

            // A pipeline for concatenating the Age, Parity and Induced columns together
            // in the Features column and training a KMeans model on them.
            string outputColumnName = "Features";
            var pipeline = ml.Transforms.Concatenate(outputColumnName, new[] { "Age", "Parity", "Induced" })
                .Append(ml.Clustering.Trainers.KMeans(
                    new KMeansPlusPlusTrainer.Options
                    {
                        FeatureColumnName = outputColumnName,
                        NumberOfClusters = 2,
                        NumberOfIterations = 100,
                        OptimizationTolerance = 1e-6f
                    }
                ));

            var model = pipeline.Fit(trainData);

            // Get cluster centroids and the number of clusters k from KMeansModelParameters.
            VBuffer<float>[] centroids = default;
            int k;

            var modelParams = model.LastTransformer.Model;
            modelParams.GetClusterCentroids(ref centroids, out k);

            // Print the first centroid's coordinates. The parenthesized interpolated
            // form matches the "Expected Output" comment below and the companion
            // KMeans.cs sample in this same change.
            var centroid = centroids[0].GetValues();
            Console.WriteLine($"The coordinates of centroid 0 are: ({string.Join(", ", centroid.ToArray())})");

            // Expected Output:
            // The coordinates of centroid 0 are: (26, 6, 1)
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ public static void Example()
// as data is already processed in a form consumable by the trainer
var pipeline = mlContext.Regression.Trainers.OrdinaryLeastSquares(new OrdinaryLeastSquaresRegressionTrainer.Options()
{
L2Weight = 0.1f,
PerParameterSignificance = false
L2Regularization = 0.1f,
CalculateStatistics = false
});
var model = pipeline.Fit(split.TrainSet);

Expand Down
16 changes: 11 additions & 5 deletions src/Microsoft.ML.KMeansClustering/KMeansCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ namespace Microsoft.ML
public static class KMeansClusteringExtensions
{
/// <summary>
/// Train a KMeans++ clustering algorithm.
/// Train a KMeans++ clustering algorithm using <see cref="KMeansPlusPlusTrainer"/>.
/// </summary>
/// <param name="catalog">The clustering catalog trainer object.</param>
/// <param name="featureColumnName">The name of the feature column.</param>
Expand All @@ -24,13 +24,13 @@ public static class KMeansClusteringExtensions
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[KMeans](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/KMeans.cs)]
/// [!code-csharp[KMeans](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Clustering/KMeans.cs)]
/// ]]></format>
/// </example>
public static KMeansPlusPlusTrainer KMeans(this ClusteringCatalog.ClusteringTrainers catalog,
string featureColumnName = DefaultColumnNames.Features,
string exampleWeightColumnName = null,
int clustersCount = KMeansPlusPlusTrainer.Defaults.ClustersCount)
int clustersCount = KMeansPlusPlusTrainer.Defaults.NumberOfClusters)
{
Contracts.CheckValue(catalog, nameof(catalog));
var env = CatalogUtils.GetEnvironment(catalog);
Expand All @@ -39,16 +39,22 @@ public static KMeansPlusPlusTrainer KMeans(this ClusteringCatalog.ClusteringTrai
{
FeatureColumnName = featureColumnName,
ExampleWeightColumnName = exampleWeightColumnName,
ClustersCount = clustersCount
NumberOfClusters = clustersCount
};
return new KMeansPlusPlusTrainer(env, options);
}

/// <summary>
/// Train a KMeans++ clustering algorithm.
/// Train a KMeans++ clustering algorithm using <see cref="KMeansPlusPlusTrainer"/>.
/// </summary>
/// <param name="catalog">The clustering catalog trainer object.</param>
/// <param name="options">Algorithm advanced options.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[KMeans](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Clustering/KMeansWithOptions.cs)]
/// ]]></format>
/// </example>
public static KMeansPlusPlusTrainer KMeans(this ClusteringCatalog.ClusteringTrainers catalog, KMeansPlusPlusTrainer.Options options)
{
Contracts.CheckValue(catalog, nameof(catalog));
Expand Down
38 changes: 19 additions & 19 deletions src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,18 +36,18 @@ public class KMeansPlusPlusTrainer : TrainerEstimatorBase<ClusteringPredictionTr
+ "number of clusters in order to minimize the within-cluster sum of squares. K-means++ improves upon K-means by using a better "
+ "method for choosing the initial cluster centers.";

public enum InitAlgorithm
public enum InitializationAlgorithm
{
KMeansPlusPlus = 0,
Random = 1,
KMeansParallel = 2
KMeansYinyang = 2
}

[BestFriend]
internal static class Defaults
{
/// <value>The number of clusters.</value>
public const int ClustersCount = 5;
public const int NumberOfClusters = 5;
}

public sealed class Options : UnsupervisedTrainerInputBaseWithWeight
Expand All @@ -58,13 +58,13 @@ public sealed class Options : UnsupervisedTrainerInputBaseWithWeight
[Argument(ArgumentType.AtMostOnce, HelpText = "The number of clusters", SortOrder = 50, Name = "K")]
[TGUI(SuggestedSweeps = "5,10,20,40")]
[TlcModule.SweepableDiscreteParam("K", new object[] { 5, 10, 20, 40 })]
public int ClustersCount = Defaults.ClustersCount;
public int NumberOfClusters = Defaults.NumberOfClusters;

/// <summary>
/// Cluster initialization algorithm.
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "Cluster initialization algorithm", ShortName = "init")]
public InitAlgorithm InitAlgorithm = InitAlgorithm.KMeansParallel;
public InitializationAlgorithm InitializationAlgorithm = InitializationAlgorithm.KMeansYinyang;

/// <summary>
/// Tolerance parameter for trainer convergence. Low = slower, more accurate.
Expand All @@ -79,7 +79,7 @@ public sealed class Options : UnsupervisedTrainerInputBaseWithWeight
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of iterations.", ShortName = "maxiter")]
[TGUI(Label = "Max Number of Iterations")]
public int MaxIterations = 1000;
public int NumberOfIterations = 1000;

/// <summary>
/// Memory budget (in MBs) to use for KMeans acceleration.
Expand All @@ -94,7 +94,7 @@ public sealed class Options : UnsupervisedTrainerInputBaseWithWeight
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.", ShortName = "nt,t,threads", SortOrder = 50)]
[TGUI(Label = "Number of threads")]
public int? NumThreads;
public int? NumberOfThreads;
}

private readonly int _k;
Expand All @@ -103,7 +103,7 @@ public sealed class Options : UnsupervisedTrainerInputBaseWithWeight
private readonly float _convergenceThreshold; // convergence thresholds

private readonly long _accelMemBudgetMb;
private readonly InitAlgorithm _initAlgorithm;
private readonly InitializationAlgorithm _initAlgorithm;
private readonly int _numThreads;
private readonly string _featureColumn;

Expand All @@ -119,26 +119,26 @@ internal KMeansPlusPlusTrainer(IHostEnvironment env, Options options)
: base(Contracts.CheckRef(env, nameof(env)).Register(LoadNameValue), TrainerUtils.MakeR4VecFeature(options.FeatureColumnName), default, TrainerUtils.MakeR4ScalarWeightColumn(options.ExampleWeightColumnName))
{
Host.CheckValue(options, nameof(options));
Host.CheckUserArg(options.ClustersCount > 0, nameof(options.ClustersCount), "Must be positive");
Host.CheckUserArg(options.NumberOfClusters > 0, nameof(options.NumberOfClusters), "Must be positive");

_featureColumn = options.FeatureColumnName;

_k = options.ClustersCount;
_k = options.NumberOfClusters;

Host.CheckUserArg(options.MaxIterations > 0, nameof(options.MaxIterations), "Must be positive");
_maxIterations = options.MaxIterations;
Host.CheckUserArg(options.NumberOfIterations > 0, nameof(options.NumberOfIterations), "Must be positive");
_maxIterations = options.NumberOfIterations;

Host.CheckUserArg(options.OptimizationTolerance > 0, nameof(options.OptimizationTolerance), "Tolerance must be positive");
_convergenceThreshold = options.OptimizationTolerance;

Host.CheckUserArg(options.AccelerationMemoryBudgetMb > 0, nameof(options.AccelerationMemoryBudgetMb), "Must be positive");
_accelMemBudgetMb = options.AccelerationMemoryBudgetMb;

_initAlgorithm = options.InitAlgorithm;
_initAlgorithm = options.InitializationAlgorithm;

Host.CheckUserArg(!options.NumThreads.HasValue || options.NumThreads > 0, nameof(options.NumThreads),
Host.CheckUserArg(!options.NumberOfThreads.HasValue || options.NumberOfThreads > 0, nameof(options.NumberOfThreads),
"Must be either null or a positive integer.");
_numThreads = ComputeNumThreads(Host, options.NumThreads);
_numThreads = ComputeNumThreads(Host, options.NumberOfThreads);
Info = new TrainerInfo();
}

Expand Down Expand Up @@ -184,12 +184,12 @@ private KMeansModelParameters TrainCore(IChannel ch, RoleMappedData data, int di
// all produce a valid set of output centroids with various trade-offs in runtime (with perhaps
// random initialization creating a set that's not terribly useful.) They could also be extended to
// pay attention to their incoming set of centroids and incrementally train.
if (_initAlgorithm == InitAlgorithm.KMeansPlusPlus)
if (_initAlgorithm == InitializationAlgorithm.KMeansPlusPlus)
{
KMeansPlusPlusInit.Initialize(Host, _numThreads, ch, cursorFactory, _k, dimensionality,
centroids, out missingFeatureCount, out totalTrainingInstances);
}
else if (_initAlgorithm == InitAlgorithm.Random)
else if (_initAlgorithm == InitializationAlgorithm.Random)
{
KMeansRandomInit.Initialize(Host, _numThreads, ch, cursorFactory, _k,
centroids, out missingFeatureCount, out totalTrainingInstances);
Expand Down Expand Up @@ -743,8 +743,8 @@ public static void Initialize(IHost host, int numThreads, IChannel ch, FeatureFl
host.CheckValue(ch, nameof(ch));
ch.CheckValue(cursorFactory, nameof(cursorFactory));
ch.CheckValue(centroids, nameof(centroids));
ch.CheckUserArg(numThreads > 0, nameof(KMeansPlusPlusTrainer.Options.NumThreads), "Must be positive");
ch.CheckUserArg(k > 0, nameof(KMeansPlusPlusTrainer.Options.ClustersCount), "Must be positive");
ch.CheckUserArg(numThreads > 0, nameof(KMeansPlusPlusTrainer.Options.NumberOfThreads), "Must be positive");
ch.CheckUserArg(k > 0, nameof(KMeansPlusPlusTrainer.Options.NumberOfClusters), "Must be positive");
ch.CheckParam(dimensionality > 0, nameof(dimensionality), "Must be positive");
ch.CheckUserArg(accelMemBudgetMb >= 0, nameof(KMeansPlusPlusTrainer.Options.AccelerationMemoryBudgetMb), "Must be non-negative");

Expand Down
Loading