From c312ed56cb16f2afecf446e37ff95d24fcced75b Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Tue, 12 Mar 2019 16:24:17 -0700 Subject: [PATCH 1/3] Adding Debugging Scenario tests for V1 APIs --- .../Debugging.cs | 213 ++++++++++++++++++ .../Scenarios/Api/Estimators/Visibility.cs | 39 ---- 2 files changed, 213 insertions(+), 39 deletions(-) create mode 100644 test/Microsoft.ML.Functional.Tests/Debugging.cs delete mode 100644 test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Visibility.cs diff --git a/test/Microsoft.ML.Functional.Tests/Debugging.cs b/test/Microsoft.ML.Functional.Tests/Debugging.cs new file mode 100644 index 0000000000..58ccd4e85a --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Debugging.cs @@ -0,0 +1,213 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Collections.Generic; +using Microsoft.ML.Data; +using Microsoft.ML.Functional.Tests.Datasets; +using Microsoft.ML.RunTests; +using Microsoft.ML.TestFramework; +using Microsoft.ML.Trainers; +using Microsoft.ML.Transforms.Text; +using Xunit; +using Xunit.Abstractions; + +namespace Microsoft.ML.Functional.Tests +{ + public class Debugging : BaseTestClass + { + public Debugging(ITestOutputHelper output) : base(output) + { + } + + /// + /// Debugging: The individual pipeline steps can be inspected to see what is happening to + /// data as it flows through. + /// + /// + /// It should, possibly through the debugger, be not such a pain to actually + /// see what is happening to your data when you apply this or that transform. For example, if I + /// were to have the text "Help I'm a bug!" I should be able to see the steps where it is + /// normalized to "help i'm a bug" then tokenized into ["help", "i'm", "a", "bug"] then + /// mapped into term numbers [203, 25, 3, 511] then projected into the sparse + /// float vector {3:1, 25:1, 203:1, 511:1}, etc. etc. + /// + [Fact] + void InspectIntermediatePipelineSteps() + { + var mlContext = new MLContext(seed: 1); + + var data = mlContext.Data.LoadFromEnumerable( + new TweetSentiment[] + { + new TweetSentiment { Sentiment = true, SentimentText = "I love ML.NET." }, + new TweetSentiment { Sentiment = true, SentimentText = "I love TLC." }, + new TweetSentiment { Sentiment = false, SentimentText = "I dislike fika." } + }); + + // create a training pipeline. + var pipeline = mlContext.Transforms.Text.FeaturizeText( + "Features", + new TextFeaturizingEstimator.Options { + KeepPunctuations = false, + OutputTokens = true, + UseCharExtractor = false, + UseWordExtractor = true, + VectorNormalizer = TextFeaturizingEstimator.NormFunction.None + }, + "SentimentText"); + + // Fit the pipeline to the data. + var model = pipeline.Fit(data); + + // Transform the data. + var transformedData = model.Transform(data); + + var preview = transformedData.Preview(); + + // Verify that columns can be inspected. + // Validate the tokens column. + var tokensColumn = transformedData.GetColumn(transformedData.Schema["Features_TransformedText"]); + var expectedTokens = new string[3][] + { + new string[] {"i", "love", "mlnet"}, + new string[] {"i", "love", "tlc"}, + new string[] {"i", "dislike", "fika"}, + }; + int i = 0; + foreach (var rowTokens in tokensColumn) + Assert.Equal(expectedTokens[i++], rowTokens); + + // Validate the Features column. + var featuresColumn = transformedData.GetColumn(transformedData.Schema["Features"]); + var expectedFeatures = new float[3][] + { + new float[6] { 1, 1, 1, 0, 0 ,0 }, + new float[6] { 1, 1, 0, 1, 0, 0 }, + new float[6] { 1, 0, 0, 0, 1, 1 } + }; + i = 0; + foreach (var rowFeatures in featuresColumn) + Assert.Equal(expectedFeatures[i++], rowFeatures); + } + + /// + /// Debugging: The schema of the pipeline can be inspected. + /// + [Fact] + public void InspectPipelineSchema() + { + var mlContext = new MLContext(seed: 1); + + // Get the dataset. + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); + + // Define a pipeline + var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) + .Append(mlContext.Transforms.Normalize()) + .AppendCacheCheckpoint(mlContext) + .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent( + new SdcaRegressionTrainer.Options { NumberOfThreads = 1, MaximumNumberOfIterations = 20 })); + + // Fit the pipeline to the data. + var model = pipeline.Fit(data); + + // Inspect the model schema, and verify that a Score column is produced. + var outputSchema = model.GetOutputSchema(data.Schema); + var columnNames = new string[outputSchema.Count]; + int i = 0; + foreach (var column in outputSchema) + columnNames[i++] = column.Name; + Assert.Contains("Score", columnNames); + } + + /// + /// Debugging: The schema read in can be verified by inspecting the data. + /// + [Fact] + public void InspectSchemaUponLoadingData() + { + var mlContext = new MLContext(seed: 1); + + // Get the dataset. + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); + + // Verify the column names. + int i = 0; + foreach (var column in data.Schema) + { + if (i == 0) + Assert.Equal("Label", column.Name); + else + Assert.Equal(HousingRegression.Features[i-1], column.Name); + i++; + } + + // Verify that I can cast it to the right schema by inspecting the first row. + foreach (var row in mlContext.Data.CreateEnumerable(mlContext.Data.TakeRows(data, 1), true)) + { + // Validate there was data in the row by checking that some values were not zero since zero is the default. + var rowSum = row.MedianHomeValue; + foreach (var property in HousingRegression.Features) + rowSum += (float) row.GetType().GetProperty(property).GetValue(row, null); + + Assert.NotEqual(0, rowSum); + } + } + + /// + /// Debugging: The progress of training can be accessed. + /// + [Fact] + public void ViewTrainingOutput() + { + var mlContext = new MLContext(seed: 1); + + // Attach a listener. + var logWatcher = new LogWatcher(); + mlContext.Log += logWatcher.ObserveEvent; + + // Get the dataset. + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); + + // Define a pipeline + var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) + .Append(mlContext.Transforms.Normalize()) + .AppendCacheCheckpoint(mlContext) + .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent( + new SdcaRegressionTrainer.Options { NumberOfThreads = 1, MaximumNumberOfIterations = 20 })); + + // Fit the pipeline to the data. + var model = pipeline.Fit(data); + + // Validate that we can read lines from the file. + var expectedLines = new string[3] { + @"[Source=SdcaTrainerBase; Training, Kind=Info] Auto-tuning parameters: L2 = 0.001.", + @"[Source=SdcaTrainerBase; Training, Kind=Info] Auto-tuning parameters: L1Threshold (L1/L2) = 0.", + @"[Source=SdcaTrainerBase; Training, Kind=Info] Using best model from iteration 7."}; + foreach (var line in expectedLines) + { + Assert.Contains(line, logWatcher.Lines); + Assert.Equal(1, logWatcher.Lines[line]); + } + } + + internal class LogWatcher { + + public readonly IDictionary Lines; + + public LogWatcher() + { + Lines = new Dictionary(); + } + + public void ObserveEvent(object sender, LoggingEventArgs e) + { + if (Lines.ContainsKey(e.Message)) + Lines[e.Message]++; + else + Lines[e.Message] = 1; + } + } + } +} diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Visibility.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Visibility.cs deleted file mode 100644 index 8f4f3a8495..0000000000 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Visibility.cs +++ /dev/null @@ -1,39 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.Collections.Generic; -using System.Linq; -using Microsoft.ML.Data; -using Microsoft.ML.RunTests; -using Xunit; - -namespace Microsoft.ML.Tests.Scenarios.Api -{ - public partial class ApiScenariosTests - { - /// - /// Visibility: It should, possibly through the debugger, be not such a pain to actually - /// see what is happening to your data when you apply this or that transform. For example, if I - /// were to have the text "Help I'm a bug!" I should be able to see the steps where it is - /// normalized to "help i'm a bug" then tokenized into ["help", "i'm", "a", "bug"] then - /// mapped into term numbers [203, 25, 3, 511] then projected into the sparse - /// float vector {3:1, 25:1, 203:1, 511:1}, etc. etc. - /// - [Fact] - void Visibility() - { - var ml = new MLContext(seed: 1); - var pipeline = ml.Data.CreateTextLoader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true) - .Append(ml.Transforms.Text.FeaturizeText( - "Features", new Transforms.Text.TextFeaturizingEstimator.Options { OutputTokens = true }, "SentimentText")); - - var src = new MultiFileSource(GetDataPath(TestDatasets.Sentiment.trainFilename)); - var data = pipeline.Fit(src).Load(src); - - var textColumn = data.GetColumn(data.Schema["SentimentText"]).Take(20); - var transformedTextColumn = data.GetColumn(data.Schema["Features_TransformedText"]).Take(20); - var features = data.GetColumn(data.Schema["Features"]).Take(20); - } - } -} From 2006be742faea83f29e49ad07390f13b76d5ffdd Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Wed, 13 Mar 2019 13:45:56 -0700 Subject: [PATCH 2/3] Updating to the new APIs in master. --- test/Microsoft.ML.Functional.Tests/Debugging.cs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/test/Microsoft.ML.Functional.Tests/Debugging.cs b/test/Microsoft.ML.Functional.Tests/Debugging.cs index 58ccd4e85a..1e869fdae0 100644 --- a/test/Microsoft.ML.Functional.Tests/Debugging.cs +++ b/test/Microsoft.ML.Functional.Tests/Debugging.cs @@ -48,11 +48,12 @@ void InspectIntermediatePipelineSteps() // create a training pipeline. var pipeline = mlContext.Transforms.Text.FeaturizeText( "Features", - new TextFeaturizingEstimator.Options { + new TextFeaturizingEstimator.Options + { KeepPunctuations = false, OutputTokens = true, - UseCharExtractor = false, - UseWordExtractor = true, + CharFeatureExtractor = null, // new WordBagEstimator.Options { NgramLength = 0, SkipLength = -1 }, + WordFeatureExtractor = new WordBagEstimator.Options { NgramLength = 1}, VectorNormalizer = TextFeaturizingEstimator.NormFunction.None }, "SentimentText"); @@ -106,7 +107,7 @@ public void InspectPipelineSchema() var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) .Append(mlContext.Transforms.Normalize()) .AppendCacheCheckpoint(mlContext) - .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent( + .Append(mlContext.Regression.Trainers.Sdca( new SdcaRegressionTrainer.Options { NumberOfThreads = 1, MaximumNumberOfIterations = 20 })); // Fit the pipeline to the data. @@ -174,7 +175,7 @@ public void ViewTrainingOutput() var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) .Append(mlContext.Transforms.Normalize()) .AppendCacheCheckpoint(mlContext) - .Append(mlContext.Regression.Trainers.StochasticDualCoordinateAscent( + .Append(mlContext.Regression.Trainers.Sdca( new SdcaRegressionTrainer.Options { NumberOfThreads = 1, MaximumNumberOfIterations = 20 })); // Fit the pipeline to the data. From 1a10d27cc050b192a56c8db1ef682c5ea4d4f376 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Wed, 13 Mar 2019 15:29:19 -0700 Subject: [PATCH 3/3] Fixing changes against master. --- test/Microsoft.ML.Functional.Tests/Debugging.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Microsoft.ML.Functional.Tests/Debugging.cs b/test/Microsoft.ML.Functional.Tests/Debugging.cs index 1e869fdae0..a495c99c99 100644 --- a/test/Microsoft.ML.Functional.Tests/Debugging.cs +++ b/test/Microsoft.ML.Functional.Tests/Debugging.cs @@ -54,7 +54,7 @@ void InspectIntermediatePipelineSteps() OutputTokens = true, CharFeatureExtractor = null, // new WordBagEstimator.Options { NgramLength = 0, SkipLength = -1 }, WordFeatureExtractor = new WordBagEstimator.Options { NgramLength = 1}, - VectorNormalizer = TextFeaturizingEstimator.NormFunction.None + Norm = TextFeaturizingEstimator.NormFunction.None }, "SentimentText");