Skip to content

Fix bug in TextLoader #3011

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
7f4e341
Fix bug in TextLoader
yaeldMS Mar 19, 2019
1a89468
Clean FeatureContributionCalculation and PermutationFeatureImportance…
artidoro Mar 19, 2019
aea88dc
Updating LightGBM Arguments (#2948)
singlis Mar 19, 2019
8b1b14f
Hiding of ColumnOptions (#2959)
artidoro Mar 19, 2019
f03c49d
Updating the FunctionalTests to clearly explain why they are not stro…
eerhardt Mar 19, 2019
00a5b35
Added samples for tree regression trainers. (#2999)
Mar 19, 2019
fd1c700
Cleanup the statistics usage API (#2048)
sfilipi Mar 19, 2019
de5d48a
Refactor cancellation mechanism and make it internal, accessible via …
codemzs Mar 19, 2019
c38f81b
Add functional tests for ONNX scenarios (#2984)
rogancarr Mar 19, 2019
3af9a5d
Make Multiclass Linear Trainers Typed Based on Output Model Types. (#…
wschin Mar 20, 2019
807d813
Clean up the SchemaDefinition class (#2995)
yaeldMS Mar 20, 2019
c8a4c7d
Data catalog done (#3021)
sfilipi Mar 20, 2019
ce56462
Activate OnnxTransform unit tests for MacOS (#2695)
jignparm Mar 20, 2019
e00d19d
Added tests for text featurizer options (Part1). (#3006)
zeahmed Mar 20, 2019
a2d7987
Binary FastTree/Forest samples using T4 templates. (#3035)
Mar 20, 2019
77be9d9
Polish standard trainers' catalog (Just rename some variables) (#3029)
wschin Mar 21, 2019
5b22420
Polish train catalog (renaming only) (#3030)
wschin Mar 21, 2019
ce7f0fb
Merge branch 'tryparseschema' of https://github.com/yaeldekel/machine…
yaeldMS Mar 21, 2019
62dda6f
Add more checks for the syntax of the embedded TextLoader options
yaeldMS Mar 21, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions Microsoft.ML.sln
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Microsoft.ML.Ensemble", "Mi
pkg\Microsoft.ML.Ensemble\Microsoft.ML.Ensemble.symbols.nupkgproj = pkg\Microsoft.ML.Ensemble\Microsoft.ML.Ensemble.symbols.nupkgproj
EndProjectSection
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Experimental", "src\Microsoft.ML.Experimental\Microsoft.ML.Experimental.csproj", "{E02DA82D-3FEE-4C60-BD80-9EC3C3448DFC}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -948,6 +950,18 @@ Global
{5E920CAC-5A28-42FB-936E-49C472130953}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU
{5E920CAC-5A28-42FB-936E-49C472130953}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU
{5E920CAC-5A28-42FB-936E-49C472130953}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU
{E02DA82D-3FEE-4C60-BD80-9EC3C3448DFC}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{E02DA82D-3FEE-4C60-BD80-9EC3C3448DFC}.Debug|Any CPU.Build.0 = Debug|Any CPU
{E02DA82D-3FEE-4C60-BD80-9EC3C3448DFC}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU
{E02DA82D-3FEE-4C60-BD80-9EC3C3448DFC}.Debug-Intrinsics|Any CPU.Build.0 = Debug-Intrinsics|Any CPU
{E02DA82D-3FEE-4C60-BD80-9EC3C3448DFC}.Debug-netfx|Any CPU.ActiveCfg = Debug-netfx|Any CPU
{E02DA82D-3FEE-4C60-BD80-9EC3C3448DFC}.Debug-netfx|Any CPU.Build.0 = Debug-netfx|Any CPU
{E02DA82D-3FEE-4C60-BD80-9EC3C3448DFC}.Release|Any CPU.ActiveCfg = Release|Any CPU
{E02DA82D-3FEE-4C60-BD80-9EC3C3448DFC}.Release|Any CPU.Build.0 = Release|Any CPU
{E02DA82D-3FEE-4C60-BD80-9EC3C3448DFC}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU
{E02DA82D-3FEE-4C60-BD80-9EC3C3448DFC}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU
{E02DA82D-3FEE-4C60-BD80-9EC3C3448DFC}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU
{E02DA82D-3FEE-4C60-BD80-9EC3C3448DFC}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -1033,6 +1047,7 @@ Global
{31D38B21-102B-41C0-9E0A-2FE0BF68D123} = {D3D38B03-B557-484D-8348-8BADEE4DF592}
{5E920CAC-5A28-42FB-936E-49C472130953} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
{AD7058C9-5608-49A8-BE23-58C33A74EE91} = {D3D38B03-B557-484D-8348-8BADEE4DF592}
{E02DA82D-3FEE-4C60-BD80-9EC3C3448DFC} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D}
Expand Down
2 changes: 1 addition & 1 deletion build/Dependencies.props
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
<PropertyGroup>
<GoogleProtobufPackageVersion>3.5.1</GoogleProtobufPackageVersion>
<LightGBMPackageVersion>2.2.3</LightGBMPackageVersion>
<MicrosoftMLOnnxRuntimePackageVersion>0.2.1</MicrosoftMLOnnxRuntimePackageVersion>
<MicrosoftMLOnnxRuntimePackageVersion>0.3.0</MicrosoftMLOnnxRuntimePackageVersion>
<MlNetMklDepsPackageVersion>0.0.0.9</MlNetMklDepsPackageVersion>
<ParquetDotNetPackageVersion>2.1.3</ParquetDotNetPackageVersion>
<SystemDrawingCommonPackageVersion>4.5.0</SystemDrawingCommonPackageVersion>
Expand Down
10 changes: 5 additions & 5 deletions docs/code/MlNetCookBook.md
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ We tried to make `Preview` debugger-friendly: our expectation is that, if you en
Here is the code sample:
```csharp
var estimator = mlContext.Transforms.Categorical.MapValueToKey("Label")
.Append(mlContext.MulticlassClassification.Trainers.Sdca())
.Append(mlContext.MulticlassClassification.Trainers.SdcaCalibrated())
.Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

var data = mlContext.Data.LoadFromTextFile(new TextLoader.Column[] {
Expand Down Expand Up @@ -423,7 +423,7 @@ var pipeline =
// Cache data in memory for steps after the cache check point stage.
.AppendCacheCheckpoint(mlContext)
// Use the multi-class SDCA model to predict the label using features.
.Append(mlContext.MulticlassClassification.Trainers.Sdca())
.Append(mlContext.MulticlassClassification.Trainers.SdcaCalibrated())
// Apply the inverse conversion from 'PredictedLabel' column back to string value.
.Append(mlContext.Transforms.Conversion.MapKeyToValue(("PredictedLabel", "Data")));

Expand Down Expand Up @@ -547,13 +547,13 @@ var pipeline =
// Cache data in memory for steps after the cache check point stage.
.AppendCacheCheckpoint(mlContext)
// Use the multi-class SDCA model to predict the label using features.
.Append(mlContext.MulticlassClassification.Trainers.Sdca());
.Append(mlContext.MulticlassClassification.Trainers.SdcaCalibrated());

// Train the model.
var trainedModel = pipeline.Fit(trainData);

// Inspect the model parameters.
var modelParameters = trainedModel.LastTransformer.Model as MulticlassLogisticRegressionModelParameters;
var modelParameters = trainedModel.LastTransformer.Model as MaximumEntropyModelParameters;

// Now we can use 'modelParameters' to look at the weights.
// 'weights' will be an array of weight vectors, one vector per class.
Expand Down Expand Up @@ -822,7 +822,7 @@ var pipeline =
// Notice that unused part in the data may not be cached.
.AppendCacheCheckpoint(mlContext)
// Use the multi-class SDCA model to predict the label using features.
.Append(mlContext.MulticlassClassification.Trainers.Sdca());
.Append(mlContext.MulticlassClassification.Trainers.SdcaCalibrated());

// Split the data 90:10 into train and test sets, train and evaluate.
var split = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

namespace Microsoft.ML.Samples.Dynamic
{
public static class Bootstrap
public static class BootstrapSample
{
public static void Example()
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using Microsoft.ML.Data;
using static Microsoft.ML.DataOperationsCatalog;

namespace Microsoft.ML.Samples.Dynamic
{
/// <summary>
/// Sample class showing how to use <see cref="DataOperationsCatalog.TrainTestSplit"/>,
/// both with and without a sampling-key column.
/// </summary>
public static class TrainTestSplit
{
    public static void Example()
    {
        // Creating the ML.Net IHostEnvironment object, needed for the pipeline.
        var mlContext = new MLContext();

        // Generate some data points.
        var examples = GenerateRandomDataPoints(10);

        // Convert the examples list to an IDataView object, which is consumable by ML.NET API.
        var dataview = mlContext.Data.LoadFromEnumerable(examples);

        // Leave out 10% of the dataset for testing. For some types of problems, for example for ranking or anomaly detection,
        // we must ensure that the split leaves the rows with the same value in a particular column, in one of the splits.
        // So below, we specify Group column as the column containing the sampling keys.
        // Notice how keeping the rows with the same value in the Group column overrides the testFraction definition.
        TrainTestData split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumnName: "Group");

        PrintPreviewRows(split);

        // The data in the Train split.
        // [Group, 1], [Features, 0.8173254]
        // [Group, 1], [Features, 0.5581612]
        // [Group, 1], [Features, 0.5588848]
        // [Group, 1], [Features, 0.4421779]
        // [Group, 1], [Features, 0.2737045]

        // The data in the Test split.
        // [Group, 0], [Features, 0.7262433]
        // [Group, 0], [Features, 0.7680227]
        // [Group, 0], [Features, 0.2060332]
        // [Group, 0], [Features, 0.9060271]
        // [Group, 0], [Features, 0.9775497]

        // Example of a split without specifying a sampling key column.
        split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.2);
        PrintPreviewRows(split);

        // The data in the Train split.
        // [Group, 0], [Features, 0.7262433]
        // [Group, 1], [Features, 0.8173254]
        // [Group, 0], [Features, 0.7680227]
        // [Group, 1], [Features, 0.5581612]
        // [Group, 0], [Features, 0.2060332]
        // [Group, 1], [Features, 0.4421779]
        // [Group, 0], [Features, 0.9775497]
        // [Group, 1], [Features, 0.2737045]

        // The data in the Test split.
        // [Group, 1], [Features, 0.5588848]
        // [Group, 0], [Features, 0.9060271]
    }

    // Generates <count> data points whose Group alternates 0/1 and whose single
    // feature is a pseudo-random float seeded by <seed> (deterministic output).
    private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed = 0)
    {
        var random = new Random(seed);
        for (int i = 0; i < count; i++)
        {
            yield return new DataPoint
            {
                Group = i % 2,

                // Create random features that are correlated with label.
                Features = (float)random.NextDouble()
            };
        }
    }

    // Example with group and feature columns. A data set is a collection of such examples.
    private class DataPoint
    {
        public float Group { get; set; }

        public float Features { get; set; }
    }

    // Print helper: writes the previewed rows of the train and test splits to the console.
    private static void PrintPreviewRows(TrainTestData split)
    {
        var trainDataPreview = split.TrainSet.Preview();
        var testDataPreview = split.TestSet.Preview();

        Console.WriteLine("The data in the Train split.");
        foreach (var row in trainDataPreview.RowView)
            Console.WriteLine($"{row.Values[0]}, {row.Values[1]}");

        Console.WriteLine("\nThe data in the Test split.");
        foreach (var row in testDataPreview.RowView)
            Console.WriteLine($"{row.Values[0]}, {row.Values[1]}");
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,12 @@ public static void Example()
// Create a Feature Contribution Calculator
// Calculate the feature contributions for all features given trained model parameters
// And don't normalize the contribution scores
var featureContributionCalculator = mlContext.Model.Explainability.FeatureContributionCalculation(model.Model, model.FeatureColumnName, numPositiveContributions: 11, normalize: false);
var featureContributionCalculator = mlContext.Transforms.CalculateFeatureContribution(model, numberOfPositiveContributions: 11, normalize: false);
var outputData = featureContributionCalculator.Fit(scoredData).Transform(scoredData);

// FeatureContributionCalculatingEstimator can be use as an intermediary step in a pipeline.
// The features retained by FeatureContributionCalculatingEstimator will be in the FeatureContribution column.
var pipeline = mlContext.Model.Explainability.FeatureContributionCalculation(model.Model, model.FeatureColumnName, numPositiveContributions: 11)
var pipeline = mlContext.Transforms.CalculateFeatureContribution(model, numberOfPositiveContributions: 11)
.Append(mlContext.Regression.Trainers.Ols(featureColumnName: "FeatureContributions"));
var outData = featureContributionCalculator.Fit(scoredData).Transform(scoredData);

Expand Down
3 changes: 2 additions & 1 deletion docs/samples/Microsoft.ML.Samples/Dynamic/Normalizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ public static void Example()

// Composing a different pipeline if we wanted to normalize more than one column at a time.
// Using log scale as the normalization mode.
var multiColPipeline = ml.Transforms.Normalize(NormalizingEstimator.NormalizationMode.LogMeanVariance, new ColumnOptions[] { ("LogInduced", "Induced"), ("LogSpontaneous", "Spontaneous") });
var multiColPipeline = ml.Transforms.Normalize("LogInduced", "Induced", NormalizingEstimator.NormalizationMode.LogMeanVariance)
.Append(ml.Transforms.Normalize("LogSpontaneous", "Spontaneous", NormalizingEstimator.NormalizationMode.LogMeanVariance));
// The transformed data.
var multiColtransformer = multiColPipeline.Fit(trainData);
var multiColtransformedData = multiColtransformer.Transform(trainData);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ public static void Example()
// Compute the permutation metrics using the properly normalized data.
var transformedData = model.Transform(data);
var permutationMetrics = mlContext.Regression.PermutationFeatureImportance(
linearPredictor, transformedData, label: labelName, features: "Features", permutationCount: 3);
linearPredictor, transformedData, labelColumnName: labelName, permutationCount: 3);

// Now let's look at which features are most important to the model overall
// Get the feature indices sorted by their impact on R-Squared
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ public static void Example()
// Compute the permutation metrics using the properly normalized data.
var transformedData = model.Transform(data);
var permutationMetrics = mlContext.BinaryClassification.PermutationFeatureImportance(
linearPredictor, transformedData, label: labelName, features: "Features", permutationCount: 3);
linearPredictor, transformedData, labelColumnName: labelName, permutationCount: 3);

// Now let's look at which features are most important to the model overall.
// Get the feature indices sorted by their impact on AreaUnderRocCurve.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,11 @@ public static void Example()
};

var model = mlContext.Transforms.Text.TokenizeIntoWords("TokenizedWords", "Sentiment_Text")
.Append(mlContext.Transforms.Conversion.MapValue(lookupMap, "Words", "Ids", new ColumnOptions[] { ("VariableLenghtFeatures", "TokenizedWords") }))
.Append(mlContext.Transforms.Conversion.MapValue("VariableLenghtFeatures", lookupMap,
lookupMap.Schema["Words"], lookupMap.Schema["Ids"], "TokenizedWords"))
.Append(mlContext.Transforms.CustomMapping(ResizeFeaturesAction, "Resize"))
.Append(tensorFlowModel.ScoreTensorFlowModel(new[] { "Prediction/Softmax" }, new[] { "Features" }))
.Append(mlContext.Transforms.CopyColumns(("Prediction", "Prediction/Softmax")))
.Append(mlContext.Transforms.CopyColumns("Prediction", "Prediction/Softmax"))
.Fit(dataView);
var engine = mlContext.Model.CreatePredictionEngine<IMDBSentiment, OutputScores>(model);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Data;

namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification
{
public static class FastForest
{
    // This example requires installation of additional NuGet package
    // <a href="https://www.nuget.org/packages/Microsoft.ML.FastTree/">Microsoft.ML.FastTree</a>.
    public static void Example()
    {
        // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
        // as a catalog of available operations and as the source of randomness.
        // Setting the seed to a fixed number in this example to make outputs deterministic.
        var mlContext = new MLContext(seed: 0);

        // Generate 1000 training points and wrap them in an IDataView, the
        // format consumable by the ML.NET API.
        var trainingData = mlContext.Data.LoadFromEnumerable(GenerateRandomDataPoints(1000));

        // Define the trainer and fit it to the training data.
        var pipeline = mlContext.BinaryClassification.Trainers.FastForest();
        var model = pipeline.Fit(trainingData);

        // Create testing data with a different random seed so it differs from
        // the training data, then score it with the trained model.
        var testData = mlContext.Data.LoadFromEnumerable(GenerateRandomDataPoints(500, seed: 123));
        var transformedTestData = model.Transform(testData);

        // Materialize the scored rows into a list of Prediction objects.
        var predictions = mlContext.Data.CreateEnumerable<Prediction>(transformedTestData, reuseRowObject: false).ToList();

        // Print the first 5 predictions.
        foreach (var prediction in predictions.Take(5))
            Console.WriteLine($"Label: {prediction.Label}, Prediction: {prediction.PredictedLabel}");

        // Expected output:
        // Label: True, Prediction: True
        // Label: False, Prediction: False
        // Label: True, Prediction: True
        // Label: True, Prediction: True
        // Label: False, Prediction: False

        // Evaluate the overall metrics.
        var metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(transformedTestData);
        SamplesUtils.ConsoleUtils.PrintMetrics(metrics);

        // Expected output:
        // Accuracy: 0.74
        // AUC: 0.83
        // F1 Score: 0.74
        // Negative Precision: 0.78
        // Negative Recall: 0.71
        // Positive Precision: 0.71
        // Positive Recall: 0.78
    }

    // Generates <count> data points with a random boolean label and 50 features
    // correlated with that label; deterministic for a given <seed>.
    private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed = 0)
    {
        var random = new Random(seed);
        for (int i = 0; i < count; i++)
        {
            bool label = (float)random.NextDouble() > 0.5f;

            // For data points with false label, the feature values are slightly
            // increased by adding a constant, which correlates them with the label.
            var features = new float[50];
            for (int j = 0; j < features.Length; j++)
            {
                var value = (float)random.NextDouble();
                features[j] = label ? value : value + 0.03f;
            }

            yield return new DataPoint
            {
                Label = label,
                Features = features
            };
        }
    }

    // Example with label and 50 feature values. A data set is a collection of such examples.
    private class DataPoint
    {
        public bool Label { get; set; }
        [VectorType(50)]
        public float[] Features { get; set; }
    }

    // Class used to capture predictions.
    private class Prediction
    {
        // Original label.
        public bool Label { get; set; }
        // Predicted label from the trainer.
        public bool PredictedLabel { get; set; }
    }
}
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<#@ include file="TreeSamplesTemplate.ttinclude"#>

<#+
// Template parameters — presumably consumed by the included
// TreeSamplesTemplate.ttinclude to generate the FastForest binary
// classification sample; confirm against that template.

// Name of the generated sample class.
string ClassName="FastForest";
// Name of the trainer catalog method the generated sample calls.
string Trainer = "FastForest";
// Extra options passed to the trainer; null here — presumably selects the
// parameterless trainer overload (TODO confirm in the include file).
string TrainerOptions = null;
// Whether the generated sample treats the trainer output as calibrated
// probabilities; false here, matching the non-calibrated evaluation used
// by the corresponding FastForest sample in this PR.
bool IsCalibrated = false;

// Per-instance prediction lines embedded verbatim as comments in the
// generated sample.
string ExpectedOutputPerInstance= @"// Expected output:
//   Label: True, Prediction: True
//   Label: False, Prediction: False
//   Label: True, Prediction: True
//   Label: True, Prediction: True
//   Label: False, Prediction: False";

// Overall-metrics lines embedded verbatim as comments in the generated sample.
string ExpectedOutput = @"// Expected output:
//   Accuracy: 0.74
//   AUC: 0.83
//   F1 Score: 0.74
//   Negative Precision: 0.78
//   Negative Recall: 0.71
//   Positive Precision: 0.71
//   Positive Recall: 0.78";
#>
Loading